# Searching for the best algorithm to predict Amazon review ratings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pymongo import MongoClient

from functions.nlp import cleaning_review
from functions.preproc import tfidf,svc_dimred
from functions.automl import BestClassifier

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# Open the MongoDB connection used by the data-loading cell below.
# No host/port is passed, so pymongo's default connection settings apply.
client = MongoClient()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ordovas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
db = client.get_database("amazon")
# Alias for the books collection
books = db.books

# Number of documents to draw at random. `$sample` picks documents randomly,
# so every run of this cell produces a different subset.
SAMPLE_SIZE = 50000

# Draw a random sample and project only a few fields. Only `overall` and
# `reviewText` are used below; `summary` and `reviewerName` are loaded in case
# they are needed for further exploration. `_id` is exposed as `id`.
res = list(books.aggregate([
    {"$sample": {"size": SAMPLE_SIZE}},
    {"$project": {"id": "$_id", "_id": 0, "overall": 1, "reviewText": 1,
                  "summary": 1, "reviewerName": 1}},
]))

# Convert to a DataFrame, drop rows with any missing field and renumber the
# rows. `drop=True` discards the old index instead of inserting it into the
# frame as a spurious "index" column.
df = pd.DataFrame(res).dropna().reset_index(drop=True)

In [3]:
# Run the project's cleaning helper over the raw review text and store the
# result in a separate column, leaving `reviewText` untouched.
df["review_clean"] = df["reviewText"].apply(cleaning_review)

# Build the TF-IDF representation of the cleaned reviews.
# NOTE(review): the second argument (5) is forwarded to the project helper;
# presumably a minimum document frequency — confirm in functions.preproc.
df_tfidf = tfidf(df["review_clean"], 5)

# Fit the dimensionality-reduction helper and project the TF-IDF features with
# the transformer it returns.
# NOTE(review): 500 is presumably the number of components — confirm in
# functions.preproc.
comps, var, svd_transformer = svc_dimred(df_tfidf, 500)
data_svd = svd_transformer.transform(df_tfidf)

In [4]:
# Feature matrix (reduced review vectors) and target (star rating).
features = pd.DataFrame(data_svd)
target = df["overall"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
# for a given sample.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Candidate estimators handed to the project's model-search helper, which
# prints a score per candidate and the best model found.
candidate_models = [
    "LinearSVC",
    "SGDClassifier",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "LogisticRegression",
]
clf = BestClassifier(X_train, y_train, candidate_models)

Analyzing LinearSVC
Score = 0.6480092115583865
Analyzing SGDClassifier
Score = 0.6296768710727754
Analyzing KNeighborsClassifier
Score = 0.6319027995928688
Analyzing RandomForestClassifier
Score = 0.6254501390998113
Analyzing LogisticRegression
Score = 0.6634404547528234
Best model:
LogisticRegression(C=4.641588833612772, max_iter=10000)


In [5]:
# Compare the selected model's score on the training split against the
# held-out split; a large gap would indicate over-fitting.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_score, test_score

(0.6789465786314526, 0.6668667466986795)

In [6]:
# Show the estimator (with its hyper-parameters) that the model search selected.
print(clf.bestclassifier_)

LogisticRegression(C=4.641588833612772, max_iter=10000)
