In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df=pd.read_csv('sample30.csv')

In [3]:
df.head()

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_date,reviews_didPurchase,reviews_doRecommend,reviews_rating,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,reviews_username,user_sentiment
0,AV13O1A8GV-KLJ3akUyj,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",Universal Music Group / Cash Money,Pink Friday: Roman Reloaded Re-Up (w/dvd),2012-11-30T06:21:45.000Z,,,5,i love this album. it's very good. more to the...,Just Awesome,Los Angeles,,joshua,Positive
1,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor. This review was collected as part...,Good,,,dorothy w,Positive
2,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor.,Good,,,dorothy w,Positive
3,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,K-Y Love Sensuality Pleasure Gel,2016-01-06T00:00:00.000Z,False,False,1,I read through the reviews on here before look...,Disappointed,,,rebecca,Negative
4,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,K-Y Love Sensuality Pleasure Gel,2016-12-21T00:00:00.000Z,False,False,1,My husband bought this gel for us. The gel cau...,Irritation,,,walker557,Negative


In [4]:
print(df.isnull().sum())

id                          0
brand                       0
categories                  0
manufacturer              141
name                        0
reviews_date               46
reviews_didPurchase     14068
reviews_doRecommend      2570
reviews_rating              0
reviews_text                0
reviews_title             190
reviews_userCity        28071
reviews_userProvince    29830
reviews_username           63
user_sentiment              1
dtype: int64


In [5]:
print(df.isna().mean().round(4) * 100)

id                       0.00
brand                    0.00
categories               0.00
manufacturer             0.47
name                     0.00
reviews_date             0.15
reviews_didPurchase     46.89
reviews_doRecommend      8.57
reviews_rating           0.00
reviews_text             0.00
reviews_title            0.63
reviews_userCity        93.57
reviews_userProvince    99.43
reviews_username         0.21
user_sentiment           0.00
dtype: float64


In [6]:
df= df.drop(columns=['reviews_didPurchase','reviews_doRecommend','reviews_userCity','reviews_userProvince'])

In [7]:
df.shape

(30000, 11)

In [8]:
print(df.isna().mean().round(4) * 100)

id                  0.00
brand               0.00
categories          0.00
manufacturer        0.47
name                0.00
reviews_date        0.15
reviews_rating      0.00
reviews_text        0.00
reviews_title       0.63
reviews_username    0.21
user_sentiment      0.00
dtype: float64


In [9]:
df = df[df['reviews_text'].notna()]
df = df[df['reviews_title'].notna()]
df = df[df['reviews_username'].notna()]
df = df[df['user_sentiment'].notna()]
df = df[df['reviews_date'].notna()]
df = df[df['manufacturer'].notna()]

In [10]:
print(df.isna().mean().round(4) * 100)

id                  0.0
brand               0.0
categories          0.0
manufacturer        0.0
name                0.0
reviews_date        0.0
reviews_rating      0.0
reviews_text        0.0
reviews_title       0.0
reviews_username    0.0
user_sentiment      0.0
dtype: float64


In [11]:
df.reviews_text

0        i love this album. it's very good. more to the...
1        Good flavor. This review was collected as part...
2                                             Good flavor.
3        I read through the reviews on here before look...
4        My husband bought this gel for us. The gel cau...
                               ...                        
29995    I got this conditioner with Influenster to try...
29996    I love it , I received this for review purpose...
29997    First of all I love the smell of this product....
29998    I received this through Influenster and will n...
29999    I received this product complimentary from inf...
Name: reviews_text, Length: 29566, dtype: object

In [12]:
df.user_sentiment.value_counts()

Positive    26262
Negative     3304
Name: user_sentiment, dtype: int64

We have imbalanced reviews i.e. Positive reviews are more than Negative reviews

In [13]:
def preprocess_reviews(df,column):
    df[column]=df[column].str.lower() # convert everything to lower
    df[column]=df[column].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")# remove punctuation
    

In [14]:
df.reviews_text.head(1)[0]

"i love this album. it's very good. more to the hip hop side than her current pop sound.. SO HYPE! i listen to this everyday at the gym! i give it 5star rating all the way. her metaphors are just crazy."

In [15]:
preprocess_reviews(df,'reviews_text')

In [16]:
# remove stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [17]:
from  nltk.corpus import stopwords

In [18]:
stop_words = set(stopwords.words('english'))

In [19]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [20]:
from nltk.tokenize import word_tokenize
import nltk

In [21]:
df['reviews_pos_tag']=df.reviews_text.apply(lambda x:  nltk.pos_tag(word_tokenize(x)))

In [22]:
df['reviews_pos_tag'].head()

0    [(i, RB), (love, VBP), (this, DT), (album, NN)...
1    [(good, JJ), (flavor, NN), (this, DT), (review...
2                           [(good, JJ), (flavor, NN)]
3    [(i, NN), (read, VBP), (through, IN), (the, DT...
4    [(my, PRP$), (husband, NN), (bought, VBD), (th...
Name: reviews_pos_tag, dtype: object

In [23]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

In [24]:
def getLemmatizedTokens(word_with_tag_list):
    lemmatizer = WordNetLemmatizer()
    tokens= set([lemmatizer.lemmatize(word,get_wordnet_pos(tag)) for (word,tag) in word_with_tag_list if word not in stop_words ])
    return " ".join(tokens)
         

In [25]:
df['review']=df.reviews_pos_tag.apply(lambda x: getLemmatizedTokens(x) )

In [26]:
df['user_sentiment'].value_counts()

Positive    26262
Negative     3304
Name: user_sentiment, dtype: int64

In [27]:
df['user_sentiment']=df['user_sentiment'].apply(lambda x : 1 if x=='Positive' else 0)

In [28]:
df['user_sentiment'].value_counts()

1    26262
0     3304
Name: user_sentiment, dtype: int64

In [29]:
x=df['review'] 
y=df['user_sentiment']

In [30]:
x.head()

0    current rating hop ! way hype 5star hip everyd...
1            part flavor review collect promotion good
2                                          flavor good
3    much look pleasant less starter live sensation...
4    irritation felt cause buy skin like n't gel re...
Name: review, dtype: object

In [31]:
y.value_counts()

1    26262
0     3304
Name: user_sentiment, dtype: int64

In [32]:
seed = 50 

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=seed)

In [33]:
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',    # Remove accents and perform other character normalization during the preprocessing step. 
    analyzer='word',            # Whether the feature should be made of word or character n-grams.
    token_pattern=r'\w{1,}',    # Regular expression denoting what constitutes a “token”, only used if analyzer == 'word'
    ngram_range=(1, 3),         # The lower and upper boundary of the range of n-values for different n-grams to be extracted
    stop_words='english',
    sublinear_tf=True)

word_vectorizer.fit(X_train)    # Fiting it on Train
train_word_features = word_vectorizer.transform(X_train)  # Transform on Train

In [34]:
X_train_transformed = word_vectorizer.transform(X_train.tolist())
X_test_transformed = word_vectorizer.transform(X_test.tolist())

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn import metrics

In [36]:
logit = LogisticRegression()
logit.fit(X_train_transformed,y_train)

LogisticRegression()

In [37]:
y_pred_train= logit.predict(X_train_transformed)

In [38]:
y_pred_test = logit.predict(X_test_transformed)

In [39]:
print("Logistic Regression accuracy", accuracy_score(y_pred_test, y_test))
print(classification_report(y_pred_test, y_test))

Logistic Regression accuracy 0.8917700112739572
              precision    recall  f1-score   support

           0       0.06      0.85      0.12        74
           1       1.00      0.89      0.94      8796

    accuracy                           0.89      8870
   macro avg       0.53      0.87      0.53      8870
weighted avg       0.99      0.89      0.94      8870



In [40]:
cm_train=metrics.confusion_matrix(y_train, y_pred_train)
cm_test=metrics.confusion_matrix(y_test, y_pred_test)

In [41]:
cm_train

array([[  206,  2086],
       [    3, 18401]], dtype=int64)

In [42]:
cm_test

array([[  63,  949],
       [  11, 7847]], dtype=int64)

In [43]:
# train set
TN_tr = cm_train[0, 0] 
FP_tr = cm_train[0, 1]
FN_tr = cm_train[1, 0]
TP_tr = cm_train[1, 1]

#test set
TN = cm_test[0, 0]
FP = cm_test[0, 1]
FN = cm_test[1, 0]
TP = cm_test[1, 1]

In [44]:
sensitivity_tr = TP_tr / float(FN_tr + TP_tr)
print("sensitivity for train set: ",sensitivity_tr)
sensitivity = TP / float(FN + TP)
print("sensitivity for test set: ",sensitivity)

sensitivity for train set:  0.9998369919582699
sensitivity for test set:  0.9986001527106134


In [45]:
specificity_tr = TN_tr / float(TN_tr + FP_tr)
print("specificity for train set: ",specificity_tr)
specificity = TN / float(TN + FP)
print("specificity for test set: ",specificity)

specificity for train set:  0.08987783595113438
specificity for test set:  0.06225296442687747


####  Logistic regression with SMOTE

In [46]:

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [47]:
X_train = pd.DataFrame(X_train).iloc[:,0].tolist()

In [48]:
X_train_transformed = word_vectorizer.transform(X_train)
X_test_transformed = word_vectorizer.transform(X_test.tolist())

In [49]:
from imblearn.over_sampling import SMOTE
from collections import Counter
counter = Counter(y_train)
print('Before',counter)

# oversampling the train dataset using SMOTE
smt = SMOTE()
X_train_transformed, y_train = smt.fit_resample(X_train_transformed, y_train)

counter = Counter(y_train)
print('After',counter)

Before Counter({1: 18381, 0: 2315})
After Counter({1: 18381, 0: 18381})


In [50]:
logit = LogisticRegression()
logit.fit(X_train_transformed,y_train)

LogisticRegression()

In [51]:
y_pred_train= logit.predict(X_train_transformed)

In [52]:
y_pred_test = logit.predict(X_test_transformed)

In [53]:
print("Logistic Regression accuracy", accuracy_score(y_pred_test, y_test))
print(classification_report(y_pred_test, y_test))

Logistic Regression accuracy 0.8989853438556934
              precision    recall  f1-score   support

           0       0.76      0.53      0.63      1413
           1       0.92      0.97      0.94      7457

    accuracy                           0.90      8870
   macro avg       0.84      0.75      0.78      8870
weighted avg       0.89      0.90      0.89      8870



In [54]:
cm_train=metrics.confusion_matrix(y_train, y_pred_train)
cm_test=metrics.confusion_matrix(y_test, y_pred_test)

In [55]:
# train set
TN_tr = cm_train[0, 0] 
FP_tr = cm_train[0, 1]
FN_tr = cm_train[1, 0]
TP_tr = cm_train[1, 1]

#test set
TN = cm_test[0, 0]
FP = cm_test[0, 1]
FN = cm_test[1, 0]
TP = cm_test[1, 1]

In [56]:
sensitivity_tr = TP_tr / float(FN_tr + TP_tr)
print("sensitivity for train set: ",sensitivity_tr)
sensitivity = TP / float(FN + TP)
print("sensitivity for test set: ",sensitivity)

sensitivity for train set:  0.9625156411511887
sensitivity for test set:  0.9162542824514656


In [57]:
specificity_tr = TN_tr / float(TN_tr + FP_tr)
print("specificity for train set: ",specificity_tr)
specificity = TN / float(TN + FP)
print("specificity for test set: ",specificity)

specificity for train set:  0.9961917197105707
specificity for test set:  0.7613751263902933


#### Random forest with SMOTE

In [58]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=50, random_state=seed, n_jobs=-1)
classifier.fit(X_train_transformed,y_train)

RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=50)

In [59]:
y_pred_train= classifier.predict(X_train_transformed)

In [60]:
y_pred_test= classifier.predict(X_test_transformed)

In [61]:
print("Random forest accuracy", accuracy_score(y_pred_test, y_test))
print(classification_report(y_pred_test, y_test))

Random forest accuracy 0.9062006764374295
              precision    recall  f1-score   support

           0       0.47      0.60      0.53       781
           1       0.96      0.94      0.95      8089

    accuracy                           0.91      8870
   macro avg       0.72      0.77      0.74      8870
weighted avg       0.92      0.91      0.91      8870



In [62]:
cm_train=metrics.confusion_matrix(y_train, y_pred_train)
cm_test=metrics.confusion_matrix(y_test, y_pred_test)

In [63]:
# train set
TN_tr = cm_train[0, 0] 
FP_tr = cm_train[0, 1]
FN_tr = cm_train[1, 0]
TP_tr = cm_train[1, 1]

#test set
TN = cm_test[0, 0]
FP = cm_test[0, 1]
FN = cm_test[1, 0]
TP = cm_test[1, 1]

In [64]:
sensitivity_tr = TP_tr / float(FN_tr + TP_tr)
print("sensitivity for train set: ",sensitivity_tr)
sensitivity = TP / float(FN + TP)
print("sensitivity for test set: ",sensitivity)

sensitivity for train set:  0.9995103639627877
sensitivity for test set:  0.9604111153406928


In [65]:
specificity_tr = TN_tr / float(TN_tr + FP_tr)
print("specificity for train set: ",specificity_tr)
specificity = TN / float(TN + FP)
print("specificity for test set: ",specificity)

specificity for train set:  0.9999455959958653
specificity for test set:  0.474216380182002


#### XGboost with SMOTE

In [None]:
import xgboost as xgb
xgb = xgb.XGBClassifier(n_jobs=-1)
xgb.fit(X_train_transformed,y_train)





In [None]:
y_pred_train= xgb.predict(X_train_transformed)
y_pred_test= xgb.predict(X_test_transformed)

In [None]:
print("xgboost accuracy", accuracy_score(y_pred_test, y_test))
print(classification_report(y_pred_test, y_test))

In [None]:
cm_train=metrics.confusion_matrix(y_train, y_pred_train)
cm_test=metrics.confusion_matrix(y_test, y_pred_test)

In [None]:
# train set
TN_tr = cm_train[0, 0] 
FP_tr = cm_train[0, 1]
FN_tr = cm_train[1, 0]
TP_tr = cm_train[1, 1]

#test set
TN = cm_test[0, 0]
FP = cm_test[0, 1]
FN = cm_test[1, 0]
TP = cm_test[1, 1]

In [None]:
sensitivity_tr = TP_tr / float(FN_tr + TP_tr)
print("sensitivity for train set: ",sensitivity_tr)
sensitivity = TP / float(FN + TP)
print("sensitivity for test set: ",sensitivity)

In [None]:
specificity_tr = TN_tr / float(TN_tr + FP_tr)
print("specificity for train set: ",specificity_tr)
specificity = TN / float(TN + FP)
print("specificity for test set: ",specificity)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import time
# Building Random Forest Model.
time1 = time.time()

n_estimators = [10,20,30] 
max_features = ['auto', 'sqrt']
max_depth = [4,5,6]
max_depth.append(None) # If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf_classifier = RandomForestClassifier(random_state=42)

rf_final = RandomizedSearchCV(estimator=rf_classifier, param_distributions=random_grid, n_iter=5, cv=3, 
                               verbose=2, random_state=42, n_jobs=-1)


rf_final.fit(X_train_transformed,y_train)

time_taken = time.time() - time1
print('Time Taken: {:.2f} seconds'.format(time_taken))

In [None]:
rf_final.best_estimator_

In [None]:
y_pred_train= rf_final.predict(X_train_transformed)
y_pred_test= rf_final.predict(X_test_transformed)

In [None]:
# train set
TN_tr = cm_train[0, 0] 
FP_tr = cm_train[0, 1]
FN_tr = cm_train[1, 0]
TP_tr = cm_train[1, 1]

#test set
TN = cm_test[0, 0]
FP = cm_test[0, 1]
FN = cm_test[1, 0]
TP = cm_test[1, 1]

In [None]:
sensitivity_tr = TP_tr / float(FN_tr + TP_tr)
print("sensitivity for train set: ",sensitivity_tr)
sensitivity = TP / float(FN + TP)
print("sensitivity for test set: ",sensitivity)

In [None]:
specificity_tr = TN_tr / float(TN_tr + FP_tr)
print("specificity for train set: ",specificity_tr)
specificity = TN / float(TN + FP)
print("specificity for test set: ",specificity)

In [None]:
df.head()

In [None]:
df1 = df[["reviews_username","reviews_rating","name"]]

In [None]:
train, test = train_test_split(df1, test_size=0.30, random_state=42)

In [None]:
print(train.shape)
print(test.shape)

In [None]:
df_pivot = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).fillna(0)

df_pivot.head(3)

In [None]:
dummy_train = train.copy()

In [None]:
dummy_train['reviews_rating'] = dummy_train['reviews_rating'].apply(lambda x: 0 if x>=1 else 1)

In [None]:
dummy_train.head()

In [None]:
dummy_train.reviews_rating.isnull().sum()

In [None]:
dummy_train = dummy_train.pivot_table(index='reviews_username', columns='name', values='reviews_rating').fillna(1)

In [None]:
dummy_train.head()

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Creating the User Similarity Matrix using pairwise_distance function.
user_correlation = 1 - pairwise_distances(df_pivot, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

In [None]:
user_correlation.shape

In [None]:
# Create a user-movie matrix.
df_pivot = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
)

In [None]:
mean = np.nanmean(df_pivot, axis=1)
user_df_subtracted = (df_pivot.T-mean).T

In [None]:
pd.options.display.max_columns = None
user_df_subtracted.sample()

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix: The correlation matrix of users.
user_correlation = 1 - pairwise_distances(user_df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

In [None]:
np.shape(user_correlation)

In [None]:
user_correlation[user_correlation<0]=0
user_correlation

In [None]:
user_correlation.shape

In [None]:
user_predicted_ratings = np.dot(user_correlation, df_pivot.fillna(0))
user_predicted_ratings

In [None]:
user_predicted_ratings.shape

In [None]:
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
user_final_rating.head()

#### item based similarity

In [None]:
item_based_matrix = train.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
item_based_matrix.head()

In [None]:
item_based_matrix.shape

In [None]:
mean = np.nanmean(item_based_matrix, axis=1)
item_df_subtracted = (item_based_matrix.T-mean).T

In [None]:
item_df_subtracted.head()

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

item_correlation = 1 - pairwise_distances(item_df_subtracted.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

In [None]:
item_correlation[item_correlation<0]=0
item_correlation

In [None]:
item_correlation.shape

In [None]:
item_predicted_ratings = np.dot((item_based_matrix.fillna(0).T),item_correlation)
item_predicted_ratings

In [None]:
item_predicted_ratings.shape

In [None]:
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

In [None]:
common = test[test.reviews_username.isin(train.reviews_username)]
common.shape

In [None]:
common_user_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating')

In [None]:
common_user_based_matrix.shape

In [None]:
user_correlation_df = pd.DataFrame(user_correlation)

In [None]:
user_correlation_df.shape

In [None]:
user_df_subtracted.shape

In [None]:
user_df_subtracted.head(5)

In [None]:
user_correlation_df.shape

In [None]:
user_correlation_df['reviews_username'] = user_df_subtracted.index
user_correlation_df.set_index('reviews_username',inplace=True)
user_correlation_df.head()

In [None]:
common.head(1)

In [None]:
list_name = common.reviews_username.tolist()

user_correlation_df.columns = user_df_subtracted.index.tolist()
user_correlation_df_1 =  user_correlation_df[user_correlation_df.index.isin(list_name)]


In [None]:
user_correlation_df_1.shape

In [None]:
user_correlation_df_2 = user_correlation_df_1.T[user_correlation_df_1.T.index.isin(list_name)]

In [None]:
user_correlation_df_3 = user_correlation_df_2.T

In [None]:
user_correlation_df_3.head()

In [None]:
user_correlation_df_3[user_correlation_df_3<0]=0

common_user_predicted_ratings = np.dot(user_correlation_df_3, common_user_based_matrix.fillna(0))
common_user_predicted_ratings

In [None]:
dummy_test = common.copy()

dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').fillna(0)

In [None]:
dummy_test.shape

In [None]:
common_user_predicted_ratings = np.multiply(common_user_predicted_ratings,dummy_test)

In [None]:
common_user_predicted_ratings.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = common_user_predicted_ratings.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

In [None]:
common_ = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating')

In [None]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [None]:
rmse = (sum(sum((common_ - y )**2))/total_non_nan)**0.5
print(rmse)

In [None]:
common =  test[test.name.isin(train.name)]

common.name.nunique()
common.shape

In [None]:
common.head(1)

In [None]:
common_item_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
common_item_based_matrix.shape

In [None]:
item_correlation_df = pd.DataFrame(item_correlation)

In [None]:
item_correlation_df.head(1)

In [None]:
item_correlation_df['reviews_items'] = item_df_subtracted.index
item_correlation_df.set_index('reviews_items',inplace=True)
item_correlation_df.head()

In [None]:
common_item_based_matrix.shape

In [None]:
list_name = common.name.tolist()

In [None]:
item_correlation_df.columns = item_df_subtracted.index.tolist()
item_correlation_df_1 =  item_correlation_df[item_correlation_df.index.isin(list_name)]

In [None]:
item_correlation_df_2 = item_correlation_df_1.T[item_correlation_df_1.T.index.isin(list_name)]
item_correlation_df_3 = item_correlation_df_2.T

In [None]:
item_correlation_df_3[item_correlation_df_3<0]=0
common_item_predicted_ratings = np.dot(item_correlation_df_3, common_item_based_matrix.fillna(0))
common_item_predicted_ratings


In [None]:
common_item_predicted_ratings.shape

In [None]:
dummy_test = common.copy()

dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T.fillna(0)

common_item_predicted_ratings = np.multiply(common_item_predicted_ratings,dummy_test)

In [None]:
common_ = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = common_item_predicted_ratings.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

In [None]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [None]:
rmse = (sum(sum((common_ - y )**2))/total_non_nan)**0.5
print(rmse)

In [None]:
# save the respective files Pickle 
import pickle
pickle.dump(user_final_rating,open('user_final_rating.pkl','wb'))
user_final_rating =  pickle.load(open('user_final_rating.pkl', 'rb'))

In [None]:
# Using User based similarity system as its RMSE value is less than item based similarity system.
user_input = input("Enter your user name")
print(user_input)

In [None]:
# Recommending the Top 20 products to the user.
d = user_final_rating.loc[user_input].sort_values(ascending=False)[0:20]
d

In [None]:
# save the respective files and models through Pickle 
import pickle
pickle.dump(logit,open('logit_model.pkl', 'wb'))
# loading pickle object
logit =  pickle.load(open('logit_model.pkl', 'rb'))

pickle.dump(word_vectorizer,open('word_vectorizer.pkl','wb'))
# loading pickle object
word_vectorizer = pickle.load(open('word_vectorizer.pkl','rb'))

In [None]:
# Define a function to recommend top 5 filtered products to the user.
def recommend(user_input):
    d = user_final_rating.loc[user_input].sort_values(ascending=False)[0:20]

    # Based on positive sentiment percentage.
    i= 0
    a = {}
    for prod_name in d.index.tolist():
      product_name = prod_name
      product_name_review_list =df[df['name']== product_name]['review'].tolist()
      features= word_vectorizer.transform(product_name_review_list)
      logit.predict(features)
      a[product_name] = logit.predict(features).mean()*100
    b= pd.Series(a).sort_values(ascending = False).head(5).index.tolist()
    print(b)

In [None]:
recommend(user_input)

In [None]:
df.to_csv("df.csv",index=False)