In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity

TextBlob is a library for processing textual data. It provides a simple API for common NLP tasks such as sentiment analysis, part-of-speech tagging, noun phrase extraction, and translation.

cosine_similarity is a mathematical function used to measure the similarity between two non-zero vectors (usually representing text documents or features).



In [None]:
# Load the Amazon product review data; the relative path means "amazon.csv"
# must sit in the notebook's working directory.
df = pd.read_csv("amazon.csv")

Unnamed: 0,product_id,rating,user_id,review_title
0,B07JW9H4J1,4.2,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Satisfied,Charging is really fast,Value for mo..."
1,B098NS6PVG,4.0,"AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","A Good Braided Cable for Your Type C Device,Go..."
2,B096MSW6CT,3.9,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Good speed for earlier versions,Good Product,W..."
3,B08HDJ86NZ,4.2,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Good product,Good one,Nice,Really nice product..."
4,B08CF3B7N1,4.2,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","As good as original,Decent,Good one for second..."
...,...,...,...,...
994,B08WKCTFF3,4.4,"AHMBY2YCZ6C6D5ZPODSHKAMFGXJQ,AG3O6DYHU7RR4V2YE...","Overall good product,Perfect,Amazing product f..."
995,B08498D67S,4.3,"AEVZ5C4WDFLWANNAZDB3Q33OK6JQ,AGEHBUZ4FXMTXQ5W2...","Good keyboard with some cons,Wrist pain,Worth ..."
996,B00C3GBCIS,4.2,"AH63HFCY2DBQCGPIVKPHXNHTA7WA,AFWFWVCRK5WBT2KNQ...","quality is awesome trust me guys 👍,Nice to pur..."
997,B00URH5E34,3.6,"AGA4V2SLJ744MITK2FWWGPXOFB7A,AHDOHVS266NLKERWU...","it worked properly for almost one year,ok,USB ..."


In [None]:
# Average rating per product: one entry for every distinct product_id.
df1 = df.groupby('product_id')['rating'].agg('mean')
display(df1.head())

Unnamed: 0_level_0,rating
product_id,Unnamed: 1_level_1
B002PD61Y4,4.1
B002SZEOLG,4.2
B003B00484,4.3
B003L62T7W,4.3
B004IO5BMQ,4.5


This groups the DataFrame df by the unique values in the 'product_id' column, so all rows with the same 'product_id' fall into the same group.
It then calculates the mean (average) of the 'rating' values for each group (each unique 'product_id').

In [None]:
# Number of products whose average rating is strictly greater than 3.
len(df1.loc[df1 > 3])

887

In [None]:
def sentiment_calc(review_title):
    """Return the TextBlob sentiment polarity of a review title.

    Args:
        review_title: Value taken from the ``review_title`` column. It is
            passed through ``str()`` first, so non-string values (including a
            missing NaN, which becomes the text ``"nan"``) are scored by their
            string form.

    Returns:
        The value of ``TextBlob(...).sentiment.polarity``, or ``None`` when
        scoring this value raises an error.
    """
    try:
        return TextBlob(str(review_title)).sentiment.polarity
    except Exception:
        # Best-effort scoring: one bad title must not abort the whole column.
        # Catching Exception (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit still stop a long-running ``apply``.
        return None
# Score every review title and store the polarity next to the original columns.
df['sentiment'] = df['review_title'].map(sentiment_calc)
df

Unnamed: 0,product_id,rating,user_id,review_title,sentiment
0,B07JW9H4J1,4.2,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Satisfied,Charging is really fast,Value for mo...",0.450000
1,B098NS6PVG,4.0,"AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","A Good Braided Cable for Your Type C Device,Go...",0.700000
2,B096MSW6CT,3.9,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Good speed for earlier versions,Good Product,W...",0.433333
3,B08HDJ86NZ,4.2,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Good product,Good one,Nice,Really nice product...",0.537500
4,B08CF3B7N1,4.2,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","As good as original,Decent,Good one for second...",0.200000
...,...,...,...,...,...
994,B08WKCTFF3,4.4,"AHMBY2YCZ6C6D5ZPODSHKAMFGXJQ,AG3O6DYHU7RR4V2YE...","Overall good product,Perfect,Amazing product f...",0.443750
995,B08498D67S,4.3,"AEVZ5C4WDFLWANNAZDB3Q33OK6JQ,AGEHBUZ4FXMTXQ5W2...","Good keyboard with some cons,Wrist pain,Worth ...",0.537500
996,B00C3GBCIS,4.2,"AH63HFCY2DBQCGPIVKPHXNHTA7WA,AFWFWVCRK5WBT2KNQ...","quality is awesome trust me guys 👍,Nice to pur...",0.516667
997,B00URH5E34,3.6,"AGA4V2SLJ744MITK2FWWGPXOFB7A,AHDOHVS266NLKERWU...","it worked properly for almost one year,ok,USB ...",0.300000


This code calculates the sentiment polarity of each product's review titles using TextBlob and adds it to the DataFrame as a new column named 'sentiment'.

In [None]:
# Combine the explicit rating with the text sentiment: element-wise product of the two columns.
df['Updated_score'] = df['rating'].mul(df['sentiment'])
df

Unnamed: 0,product_id,rating,user_id,review_title,sentiment,Updated_score
0,B07JW9H4J1,4.2,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Satisfied,Charging is really fast,Value for mo...",0.450000,1.890000
1,B098NS6PVG,4.0,"AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","A Good Braided Cable for Your Type C Device,Go...",0.700000,2.800000
2,B096MSW6CT,3.9,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Good speed for earlier versions,Good Product,W...",0.433333,1.690000
3,B08HDJ86NZ,4.2,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Good product,Good one,Nice,Really nice product...",0.537500,2.257500
4,B08CF3B7N1,4.2,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","As good as original,Decent,Good one for second...",0.200000,0.840000
...,...,...,...,...,...,...
994,B08WKCTFF3,4.4,"AHMBY2YCZ6C6D5ZPODSHKAMFGXJQ,AG3O6DYHU7RR4V2YE...","Overall good product,Perfect,Amazing product f...",0.443750,1.952500
995,B08498D67S,4.3,"AEVZ5C4WDFLWANNAZDB3Q33OK6JQ,AGEHBUZ4FXMTXQ5W2...","Good keyboard with some cons,Wrist pain,Worth ...",0.537500,2.311250
996,B00C3GBCIS,4.2,"AH63HFCY2DBQCGPIVKPHXNHTA7WA,AFWFWVCRK5WBT2KNQ...","quality is awesome trust me guys 👍,Nice to pur...",0.516667,2.170000
997,B00URH5E34,3.6,"AGA4V2SLJ744MITK2FWWGPXOFB7A,AHDOHVS266NLKERWU...","it worked properly for almost one year,ok,USB ...",0.300000,1.080000


This calculates a weighted score or a combined metric by scaling the user's explicit rating with the intensity of the textual sentiment.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every distinct user_id string with an integer code.
# The fitted encoder is kept in `le` so codes can be mapped back later.
le = LabelEncoder().fit(df['user_id'])
df['user_id'] = le.transform(df['user_id'])
df

Unnamed: 0,product_id,rating,user_id,review_title,sentiment,Updated_score
0,B07JW9H4J1,4.2,385,"Satisfied,Charging is really fast,Value for mo...",0.450000,1.890000
1,B098NS6PVG,4.0,52,"A Good Braided Cable for Your Type C Device,Go...",0.700000,2.800000
2,B096MSW6CT,3.9,530,"Good speed for earlier versions,Good Product,W...",0.433333,1.690000
3,B08HDJ86NZ,4.2,160,"Good product,Good one,Nice,Really nice product...",0.537500,2.257500
4,B08CF3B7N1,4.2,14,"As good as original,Decent,Good one for second...",0.200000,0.840000
...,...,...,...,...,...,...
994,B08WKCTFF3,4.4,667,"Overall good product,Perfect,Amazing product f...",0.443750,1.952500
995,B08498D67S,4.3,157,"Good keyboard with some cons,Wrist pain,Worth ...",0.537500,2.311250
996,B00C3GBCIS,4.2,589,"quality is awesome trust me guys 👍,Nice to pur...",0.516667,2.170000
997,B00URH5E34,3.6,420,"it worked properly for almost one year,ok,USB ...",0.300000,1.080000


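Converts the text-based user_id values into a sequence of integers (0, 1, 2, 3, …). This allows the user_id column to be used as a numerical feature in modeling. Note that in this dataset each user_id cell holds a comma-separated list of reviewer IDs (e.g., "AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB..."), so each integer stands for one distinct list of reviewers rather than for one individual user.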

In [None]:
def fun(score):
    """Bucket a continuous rating-times-sentiment score into a class from 1 to 5.

    Buckets (upper bounds are inclusive):
        score <= -0.25         -> 1
        -0.25 < score <= 1.0   -> 2
        1.0 < score <= 2.0     -> 3
        2.0 < score <= 3.0     -> 4
        score > 3.0            -> 5

    Args:
        score: Numeric score, or a missing value (None / NaN).

    Returns:
        The integer class 1-5, or NaN when ``score`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing score
    # would fall through to the last branch and be labelled 5 (the best class).
    if pd.isna(score):
        return float('nan')
    if score <= -0.25:
        return 1
    if score <= 1.0:
        return 2
    if score <= 2.0:
        return 3
    if score <= 3.0:
        return 4
    return 5

# The next cell applies this function to build the New_score column:
# df['New_score'] = df['Updated_score'].apply(fun)
# df['New_score'] = pd.to_numeric(df['New_score'])

The Python function fun(score) takes a numerical input (score) and classifies it into one of five categories (1 to 5) using fixed thresholds: scores up to −0.25 map to 1, up to 1.0 to 2, up to 2.0 to 3, up to 3.0 to 4, and anything above 3.0 to 5.

In [None]:
# Classify each Updated_score into its 1-5 bucket and make sure the column is numeric.
df['New_score'] = pd.to_numeric(df['Updated_score'].apply(fun))
df

Unnamed: 0,product_id,rating,user_id,review_title,sentiment,Updated_score,New_score
0,B07JW9H4J1,4.2,385,"Satisfied,Charging is really fast,Value for mo...",0.450000,1.890000,3
1,B098NS6PVG,4.0,52,"A Good Braided Cable for Your Type C Device,Go...",0.700000,2.800000,4
2,B096MSW6CT,3.9,530,"Good speed for earlier versions,Good Product,W...",0.433333,1.690000,3
3,B08HDJ86NZ,4.2,160,"Good product,Good one,Nice,Really nice product...",0.537500,2.257500,4
4,B08CF3B7N1,4.2,14,"As good as original,Decent,Good one for second...",0.200000,0.840000,2
...,...,...,...,...,...,...,...
994,B08WKCTFF3,4.4,667,"Overall good product,Perfect,Amazing product f...",0.443750,1.952500,3
995,B08498D67S,4.3,157,"Good keyboard with some cons,Wrist pain,Worth ...",0.537500,2.311250,4
996,B00C3GBCIS,4.2,589,"quality is awesome trust me guys 👍,Nice to pur...",0.516667,2.170000,4
997,B00URH5E34,3.6,420,"it worked properly for almost one year,ok,USB ...",0.300000,1.080000,3


This step categorizes the continuous, weighted Updated_score (Rating × Sentiment) into a fixed set of 5 discrete classes (from 1 to 5), for simplified analysis or for use as a final target variable.

In [None]:
# Product x user matrix of New_score values; duplicate (product, user) pairs are
# averaged and combinations with no entry are filled with 0.
df_pivot = pd.pivot_table(
    df,
    values='New_score',
    index='product_id',
    columns='user_id',
    aggfunc='mean',
).fillna(0)
df_pivot

user_id,0,1,2,3,4,5,6,7,8,9,...,724,725,726,727,728,729,730,731,732,733
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B002PD61Y4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B002SZEOLG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B003B00484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B003L62T7W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B004IO5BMQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B0BNVBJW2S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0BNXFDTZ2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0BP18W8TM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0BP7XLX48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0




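This pandas code reshapes the DataFrame df into a pivot table (df_pivot) suitable for use in collaborative filtering recommendation systems: one row per product_id, one column per user_id, the New_score as the cell value, and 0 wherever a product has no score from that user_id.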

In [None]:
from scipy.sparse import csr_matrix

# Store the mostly-zero pivot table in compressed sparse row form.
df_pivot_matrix = csr_matrix(df_pivot.to_numpy())
print(df_pivot_matrix)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 901 stored elements and shape (891, 734)>
  Coords	Values
  (0, 419)	3.0
  (1, 537)	2.0
  (2, 380)	4.0
  (3, 31)	3.0
  (4, 461)	2.0
  (5, 301)	4.0
  (6, 394)	3.0
  (7, 394)	3.0
  (8, 20)	2.0
  (9, 523)	2.0
  (10, 485)	3.0
  (11, 537)	2.0
  (12, 136)	3.0
  (13, 537)	2.0
  (14, 57)	4.0
  (15, 279)	3.0
  (16, 109)	3.0
  (17, 384)	3.0
  (18, 620)	5.0
  (19, 70)	1.0
  (20, 589)	4.0
  (21, 215)	2.0
  (22, 25)	2.0
  (23, 430)	2.0
  (24, 310)	4.0
  :	:
  (866, 48)	3.0
  (867, 48)	3.0
  (868, 48)	3.0
  (869, 48)	3.0
  (870, 48)	3.0
  (871, 193)	4.0
  (872, 50)	4.0
  (873, 575)	3.0
  (874, 596)	4.0
  (875, 696)	3.0
  (876, 727)	2.0
  (877, 497)	4.0
  (878, 496)	4.0
  (879, 496)	4.0
  (880, 496)	4.0
  (881, 613)	3.0
  (882, 569)	4.0
  (883, 631)	3.0
  (884, 468)	5.0
  (885, 179)	4.0
  (886, 179)	4.0
  (887, 607)	4.0
  (888, 516)	4.0
  (889, 221)	4.0
  (890, 467)	5.0


This converts the pivot table into a sparse matrix format. This format is useful because the pivot table has many zero values (meaning many users haven't rated all products), and a sparse matrix efficiently stores only the non-zero values, saving memory and improving performance.

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the product rows of the sparse pivot matrix, using cosine distance.
# n_neighbors=20 is only the default query size; the kneighbors call later in this notebook
# passes n_neighbors=8 explicitly.
# NOTE(review): radius=1 presumably only affects radius_neighbors queries, which this
# notebook never makes — confirm against the scikit-learn docs.
model_knn = NearestNeighbors(metric = 'cosine', n_neighbors=20, radius=1)
model_knn.fit(df_pivot_matrix)

The trained model_knn can now be used to query a specific product (or user) and instantly find the 20 products (or users) whose rating patterns are most similar to the target. This similarity forms the basis of the recommendations.

In [None]:
# Pairwise cosine similarity between product rows (N x N for N products).
similarity_matrix = cosine_similarity(df_pivot.to_numpy())
similarity_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

The resulting similarity_matrix is the core component of an Item-Based Collaborative Filtering system.

If your df_pivot has N products, the similarity_matrix will be an N×N matrix.

The value at row i,column j is the similarity score between Product i and Product j.

A score close to 1 means the two products were rated similarly by users (they are good substitutes or complements).

A score close to 0 means they are very different in terms of user ratings.

In [None]:
# Show the valid product IDs first, so the user can see them before being asked to choose.
data = list(df_pivot.index)  # product IDs in pivot-row order
print(data)
# Strip surrounding whitespace so an ID pasted with a stray space or newline
# still matches an entry of `data` exactly.
product_ID = input('Enter Product ID according to data set : ').strip()

Enter Product ID according to data set : B00P93X2H6
['B002PD61Y4', 'B002SZEOLG', 'B003B00484', 'B003L62T7W', 'B004IO5BMQ', 'B005FYNT3G', 'B005LJQMCK', 'B005LJQMZC', 'B006LW0WDQ', 'B0083T231O', 'B0085IATT6', 'B0088TKTY2', 'B008FWZGSG', 'B008IFXQFU', 'B008QS9J6Y', 'B009LJ2BXA', 'B009VCGPSY', 'B00A0VCJPI', 'B00AXHBBXU', 'B00BN5SNF0', 'B00C3GBCIS', 'B00CEQEGPI', 'B00DJ5N9VK', 'B00E3DVQFS', 'B00EYW1U68', 'B00GE55L22', 'B00GG59HU2', 'B00GGGOYEK', 'B00GGGOYEU', 'B00GZLB57U', 'B00J4YG0PC', 'B00K32PEW4', 'B00KIE28X0', 'B00KXULGJQ', 'B00LHZW3XY', 'B00LHZWD0C', 'B00LM4W1N2', 'B00LM4X0KU', 'B00LM4X3XE', 'B00LOD70SC', 'B00LVMTA2A', 'B00LXTFMRS', 'B00LY12TH6', 'B00LY1FN1K', 'B00LZLPYHW', 'B00LZLQ624', 'B00LZPQVMK', 'B00MFPCY5C', 'B00MUTWLW4', 'B00N1U7JXM', 'B00N1U9AJS', 'B00N3XLDW0', 'B00NFD0ETQ', 'B00NH11KIK', 'B00NH11PEY', 'B00NH12R1O', 'B00NH13Q8W', 'B00NNQMYNE', 'B00OFM6PEO', 'B00P93X0VO', 'B00P93X2H6', 'B00P93X6EK', 'B00R1P3B4O', 'B00RFWNJMC', 'B00RGLI0ZS', 'B00S2SEV7K', 'B00UGZWM2I', 'B00URH5E

The primary goal of this snippet is to get a target product from the user while simultaneously showing the user valid product IDs they can choose from.

In [None]:
# Position of the chosen product in `data` (built from df_pivot.index), i.e. its row
# number in the pivot table. list.index raises ValueError if the entered ID is not listed.
query_index = data.index(product_ID)
print(query_index)

60


The query_index is the lookup key for the next step of the recommendation process. You need this numerical index to select the chosen product's row from df_pivot, which is then passed to model_knn to find its nearest neighbours; the same index also addresses that product's row (or column) in the similarity_matrix.

In [None]:
# Query the fitted model with the chosen product's row, shaped (1, n_users).
query_vector = df_pivot.iloc[[query_index]].to_numpy()
similarity, indices = model_knn.kneighbors(query_vector, n_neighbors=8)
print(similarity)  # cosine distances to the 8 nearest rows (smaller = more similar)
print(indices)     # row positions of those neighbours in df_pivot

[[0. 1. 1. 1. 1. 1. 1. 1.]]
[[ 60 879 878 877 876 875 874 873]]


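This finds the 7 nearest neighbours of the product the user selected (8 are requested, and the first one returned here is the product itself). The output shows the cosine distance to each neighbour and the row position of each neighbour in df_pivot.

The distance tells you how close the other products' rating patterns are to the chosen product (low distance means high similarity). Here every neighbour is at distance 1.0, the value cosine distance takes when two rating vectors share no user column, so these particular neighbours are not meaningfully similar to the chosen product.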

In [None]:
# Build {product_id: cosine distance} for the neighbours returned by kneighbors.
# The query product is dropped by comparing row positions instead of assuming it is the
# first neighbour: products with identical rating vectors tie at distance 0 and the order
# among tied neighbours is not guaranteed, so position 0 may be a different product.
print('Recommendations for {0}:\n'.format(df_pivot.index[query_index]))

data_dict = {}
for distance, row_pos in zip(similarity.flatten(), indices.flatten()):
    if row_pos == query_index:
        continue  # never recommend the product the user asked about
    product = df_pivot.index[row_pos]
    data_dict[str(product)] = float(distance)
    print(f'{product}, is similarity distance = with {distance:.20f}')

print(data_dict)

Recommendations for B00P93X2H6:

B0BMGB3CH9, is similarity distance = with 1.00000000000000000000
B0BMGB2TPR, is similarity distance = with 1.00000000000000000000
B0BLV1GNLN, is similarity distance = with 1.00000000000000000000
B0BHZCNC4P, is similarity distance = with 1.00000000000000000000
B0BHYJ8CVF, is similarity distance = with 1.00000000000000000000
B0BHVPTM2C, is similarity distance = with 1.00000000000000000000
B0BGSV43WY, is similarity distance = with 1.00000000000000000000
{'B0BMGB3CH9': 1.0, 'B0BMGB2TPR': 1.0, 'B0BLV1GNLN': 1.0, 'B0BHZCNC4P': 1.0, 'B0BHYJ8CVF': 1.0, 'B0BHVPTM2C': 1.0, 'B0BGSV43WY': 1.0}
