In [2]:
from gensim.models import KeyedVectors

# Load the pre-trained Google News vectors
model= KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
dog = model['dog']
print(dog.shape)

(300,)


In [4]:
import numpy as np
def vectorize(sentence):
    words=sentence
    words_vecs = [model[word] for word in words if word in model]
    if len(words_vecs) == 0:
        return np.zeros(300)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [5]:
import pickle

with open('preprocessed.pkl','rb')as f:
    preprocessed=pickle.load(f)

with open('ratings.pkl','rb')as f:
    ratings=pickle.load(f)

In [6]:
x=[]
for i in preprocessed:
    x.append(vectorize(i))

y=[]
for i in ratings:
    if i==3:
        y.append(2)
    elif i>3:
        y.append(3)
    elif i<3:
        y.append(1)


In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)


In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report
DT=DecisionTreeClassifier()
DT.fit(x_train,y_train)
y_pred=DT.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.6480402959231859
              precision    recall  f1-score   support

           1       0.42      0.40      0.41      1258
           2       0.16      0.17      0.16       651
           3       0.79      0.79      0.79      4444

    accuracy                           0.65      6353
   macro avg       0.45      0.45      0.45      6353
weighted avg       0.65      0.65      0.65      6353



In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
model = LogisticRegression(max_iter=1000,multi_class="ovr")

model=OneVsOneClassifier(model)
# Train the model
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.72      0.54      0.61      1258
           2       0.36      0.03      0.06       651
           3       0.80      0.97      0.88      4444

    accuracy                           0.78      6353
   macro avg       0.63      0.51      0.52      6353
weighted avg       0.74      0.78      0.74      6353



In [11]:
model = LogisticRegression(max_iter=1000)
from sklearn.multiclass import OneVsOneClassifier
model=OneVsOneClassifier(model)
# Train the model
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.72      0.54      0.61      1258
           2       0.36      0.03      0.06       651
           3       0.80      0.97      0.88      4444

    accuracy                           0.78      6353
   macro avg       0.63      0.51      0.52      6353
weighted avg       0.74      0.78      0.74      6353



In [12]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.78      0.37      0.50      1258
           2       0.77      0.04      0.07       651
           3       0.76      0.98      0.86      4444

    accuracy                           0.76      6353
   macro avg       0.77      0.46      0.48      6353
weighted avg       0.77      0.76      0.71      6353



In [13]:
from sklearn.svm import SVC
model=SVC()
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.72      0.59      0.65      1258
           2       0.33      0.00      0.01       651
           3       0.81      0.97      0.88      4444

    accuracy                           0.80      6353
   macro avg       0.62      0.52      0.51      6353
weighted avg       0.74      0.80      0.75      6353



In [14]:
from sklearn.svm import LinearSVC
model=LinearSVC()
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))



              precision    recall  f1-score   support

           1       0.71      0.53      0.61      1258
           2       0.30      0.02      0.04       651
           3       0.80      0.97      0.88      4444

    accuracy                           0.78      6353
   macro avg       0.60      0.51      0.51      6353
weighted avg       0.73      0.78      0.74      6353



In [15]:
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier()
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.61      0.44      0.51      1258
           2       0.23      0.08      0.11       651
           3       0.79      0.93      0.85      4444

    accuracy                           0.74      6353
   macro avg       0.54      0.48      0.49      6353
weighted avg       0.70      0.74      0.71      6353



In [16]:
import pandas as pd
with open('filtered_df.pkl','rb')as f:
    filtered_df=pickle.load(f)
unique_users={}
unique_items={}
user_num=0
item_num=0
for x in range(len(filtered_df)):
    if filtered_df.iloc[x]['reviewerID'] not in unique_users:
        unique_users[filtered_df.iloc[x]['reviewerID']]=user_num
        user_num=user_num+1
    if filtered_df.iloc[x]['asin'] not in unique_items:
        unique_items[filtered_df.iloc[x]['asin']]=item_num
        item_num=item_num+1
    

In [17]:
print(user_num)
print(item_num)

22542
709


In [18]:
print(unique_items)

{'B0001TST6W': 0, 'B00083Y87U': 1, 'B000KRGI4K': 2, 'B0017KZEI0': 3, 'B0036WHNVS': 4, 'B00443HVJW': 5, 'B004DTU4DM': 6, 'B004UVC3HS': 7, 'B004XKA9ZO': 8, 'B00501M426': 9, 'B00512Z8H2': 10, 'B0052Z83KC': 11, 'B0056BIYQA': 12, 'B0058LOZMA': 13, 'B005GQBVWO': 14, 'B005K22JHA': 15, 'B005Z294OQ': 16, 'B0076LTKTS': 17, 'B007D29QA8': 18, 'B008M324PG': 19, 'B0092Z479Y': 20, 'B00953THVQ': 21, 'B009CKU5JU': 22, 'B009SYM9T4': 23, 'B00A7I5U9G': 24, 'B00AAVEXZW': 25, 'B00AIIMSTA': 26, 'B00AIIMSRW': 27, 'B00AKDEPFI': 28, 'B00AZO3TZO': 29, 'B00B6KR8S0': 30, 'B00B8BF4EM': 31, 'B00BETSOEU': 32, 'B00BGO0Q9O': 33, 'B00BKEQBI0': 34, 'B00BLA5PYY': 35, 'B00CABCL9K': 36, 'B00CIV693Q': 37, 'B00CRGO6AK': 38, 'B00CRKJ2AU': 39, 'B00D7LN7PO': 40, 'B00D8JIPJI': 41, 'B00DHBDDX0': 42, 'B00DLJQ7S6': 43, 'B00DM2APM6': 44, 'B00DOJCMBO': 45, 'B00DOTG38W': 46, 'B00DY03N2U': 47, 'B00E1QXN7Q': 48, 'B00EDDAMT4': 49, 'B00EDDC58A': 50, 'B00EJ3FJGE': 51, 'B00F4BXJ58': 52, 'B00FI16HS0': 53, 'B00FI16820': 54, 'B00FQMZMTQ': 55, '

In [19]:
user_item_rating_matrix=np.zeros((22542,709))

In [20]:
for x in range(len(filtered_df)):
    user_item_rating_matrix[unique_users[filtered_df.iloc[x]['reviewerID']],unique_items[filtered_df.iloc[x]['asin']]]=filtered_df.iloc[x]['overall']

In [21]:
print(user_item_rating_matrix[909])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [22]:
user_item_rating_matrix = (user_item_rating_matrix - user_item_rating_matrix.min()) / (user_item_rating_matrix.max() - user_item_rating_matrix.min())

In [23]:
print(user_item_rating_matrix)

[[1.  0.  0.  ... 0.  0.  0. ]
 [0.8 0.  0.  ... 0.  0.  0. ]
 [1.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]


In [25]:
with open('user_item_rating_matrix.pkl','wb') as f:
    pickle.dump(user_item_rating_matrix,f)

In [142]:
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# Assume user_item_rating_matrix is the numpy matrix representing the user-item-rating data

# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

# Function to find top N similar users/items
def find_top_n_similar(similarity_matrix, n):
    top_n_indices = np.argsort(-similarity_matrix, axis=1)[:, :n]
    return top_n_indices

# Function to predict missing ratings using top N similar users/items
def predict_missing_ratings(rating_matrix, top_n_indices, n):
    predictions = np.zeros_like(rating_matrix)
    for user_idx in range(rating_matrix.shape[0]):
        user_ratings = rating_matrix[user_idx, :]
        top_n_user_indices = top_n_indices[user_idx]
        similar_users_ratings = rating_matrix[top_n_user_indices, :]
        similar_users_weights = np.sum(similar_users_ratings != 0, axis=1)
        predictions[user_idx, :] = np.sum(similar_users_ratings / similar_users_weights[:, np.newaxis], axis=0) / n
    return predictions

# Function to compute Mean Absolute Error (MAE)
def compute_mae(actual_ratings, predicted_ratings):
    diff = np.abs(actual_ratings - predicted_ratings)
    return np.mean(diff[actual_ratings != 0])

# User-User Recommender System
def user_user_recommender(rating_matrix, n_values):
    user_similarity_matrix = np.zeros((rating_matrix.shape[0], rating_matrix.shape[0]))
    for i in range(rating_matrix.shape[0]):
        for j in range(i + 1, rating_matrix.shape[0]):
            user_similarity_matrix[i, j] = user_similarity_matrix[j, i] = cosine_similarity(rating_matrix[i, :], rating_matrix[j, :])

    maes = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for n in n_values:
        fold_maes = []
        for train_index, val_index in kf.split(rating_matrix):
            train_matrix = rating_matrix[train_index]
            val_matrix = rating_matrix[val_index]
            top_n_indices = find_top_n_similar(user_similarity_matrix[train_index, :], n)
            predictions = predict_missing_ratings(train_matrix, top_n_indices, n)
            mae = compute_mae(val_matrix, predictions)
            fold_maes.append(mae)
        maes.append(np.mean(fold_maes))

    return maes

# Item-Item Recommender System
def item_item_recommender(rating_matrix, n_values):
    item_similarity_matrix = np.zeros((rating_matrix.shape[1], rating_matrix.shape[1]))
    for i in range(rating_matrix.shape[1]):
        for j in range(i + 1, rating_matrix.shape[1]):
            item_similarity_matrix[i, j] = item_similarity_matrix[j, i] = cosine_similarity(rating_matrix[:, i], rating_matrix[:, j])

    maes = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for n in n_values:
        fold_maes = []
        for train_index, val_index in kf.split(rating_matrix):
            train_matrix = rating_matrix[train_index]
            val_matrix = rating_matrix[val_index]
            top_n_indices = find_top_n_similar(item_similarity_matrix, n)
            predictions = predict_missing_ratings(train_matrix.T, top_n_indices, n).T
            mae = compute_mae(val_matrix, predictions)
            fold_maes.append(mae)
        maes.append(np.mean(fold_maes))

    return maes

# Plotting MAE vs N
n_values = [10, 20, 30, 40, 50]
user_user_maes = user_user_recommender(user_item_rating_matrix, n_values)
item_item_maes = item_item_recommender(user_item_rating_matrix, n_values)

plt.figure(figsize=(8, 6))
plt.plot(n_values, user_user_maes, label='User-User Recommender')
plt.plot(n_values, item_item_maes, label='Item-Item Recommender')
plt.xlabel('N (Number of Similar Users/Items)')
plt.ylabel('Mean Absolute Error (MAE)')
plt.title('MAE vs N')
plt.legend()
plt.show()

# Report TOP 10 products by User Sum Ratings
user_sum_ratings = np.sum(user_item_rating_matrix, axis=0)
top_10_products = np.argsort(-user_sum_ratings)[:10]
print("TOP 10 products by User Sum Ratings:")
print(top_10_products)

IndexError: index 21262 is out of bounds for axis 0 with size 18033