In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [19]:
df = pd.read_csv('20191226-reviews.csv')

In [20]:
df['helpfulVotes'].fillna(0, inplace=True)
df.dropna(subset=['body'], inplace=True)
df['date']= pd.to_datetime(df['date'])
df.drop('asin', axis=1, inplace=True)
df.drop('name', axis=1, inplace=True)
df.drop('verified', axis=1, inplace=True)
df.drop('title', axis=1, inplace=True)


In [21]:
df['date'].min()

Timestamp('2003-11-24 00:00:00')

In [22]:
from datetime import datetime
start_date = datetime(2003, 11, 24)
df['days_since_start'] = (df['date'] - start_date).dt.days

In [23]:
df.drop('date', axis=1, inplace=True)

In [24]:
sia = SentimentIntensityAnalyzer()
df['pos_words'] = df.apply(lambda x: len([w for w in nltk.word_tokenize(x['body']) if sia.polarity_scores(w)['pos'] > 0]), axis=1)
df['neg_words'] = df.apply(lambda x: len([w for w in nltk.word_tokenize(x['body']) if sia.polarity_scores(w)['neg'] > 0]), axis=1)

In [25]:
df

Unnamed: 0,rating,body,helpfulVotes,days_since_start,pos_words,neg_words
0,3,I had the Samsung A600 for awhile which is abs...,1.0,687,17,13
1,1,Due to a software issue between Nokia and Spri...,17.0,44,5,2
2,5,"This is a great, reliable phone. I also purcha...",5.0,36,6,3
3,3,"I love the phone and all, because I really did...",1.0,115,3,0
4,4,The phone has been great for every purpose it ...,1.0,643,6,3
...,...,...,...,...,...,...
67981,5,I love the camera on this phone. The screen is...,1.0,5744,5,3
67982,5,I've been an Xperia user for several years and...,1.0,5773,4,5
67983,5,buy one more for my cousin,0.0,5711,0,0
67984,5,Product looks and works like new. Very much re...,0.0,5874,2,0


In [26]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df.loc[:, ['helpfulVotes', 'days_since_start', 'pos_words', 'neg_words']], df.loc[:, 'rating'], train_size=0.7, random_state=42)

In [42]:
prm = Pipeline([
    ("poly", PolynomialFeatures(degree=4, include_bias=False)),
    ('linreg', LinearRegression())
])

In [43]:
prm.fit(X_train, y_train)

In [44]:
def predict_rating(body, date, helpfulVotes=0):
    return np.array([helpfulVotes, 
                     (date - start_date).days, 
                     len([w for w in nltk.word_tokenize(body) if sia.polarity_scores(w)['pos'] > 0]),
                     len([w for w in nltk.word_tokenize(body) if sia.polarity_scores(w)['neg'] > 0])
                    ])

In [45]:
predict_rating("I love Gary Numan's older stuff and this is a great tune!!", datetime(2016, 9, 16))[None, :]

array([[   0, 4680,    2,    0]])

In [46]:
prm.predict(predict_rating("Gary Numan's older stuff is ok!!", datetime(2016, 9, 16), 3)[None, :])



array([3.89985639])

In [47]:
prm.named_steps['linreg'].coef_, prm.named_steps['linreg'].intercept_

(array([-7.83374349e-02, -3.47512937e-03, -3.16252737e-02, -1.91335116e-02,
        -5.01320549e-04,  8.82474981e-05, -3.01564003e-03,  1.24044205e-03,
         2.18898985e-06,  1.17072188e-04, -2.14256974e-04,  7.06704463e-04,
        -1.59720855e-02,  3.63909582e-02, -8.17571091e-07,  3.64508510e-07,
         1.21501231e-05, -5.80582217e-05, -3.31088667e-08,  9.40443840e-07,
         3.25506855e-06,  7.48069962e-06, -2.89372614e-05, -1.62806875e-04,
        -4.82092614e-10, -2.08289081e-09,  5.50970465e-09, -5.37523869e-06,
         1.25974056e-05, -3.65799193e-06,  1.26899382e-04, -1.45554576e-04,
         1.87461238e-04, -1.06817007e-03,  2.72082246e-10,  6.97691384e-11,
         2.73982315e-09,  5.41118861e-10, -4.22819731e-11, -2.93821332e-09,
         9.64891434e-09,  9.41700918e-09,  2.66895641e-07, -2.74109764e-07,
         3.32970457e-12, -2.18758770e-11, -5.35843369e-10,  1.37095645e-09,
        -2.86222162e-08,  4.25397591e-08, -1.62205838e-07, -5.22204955e-08,
         2.5

In [48]:
prm.score(X_train, y_train)

0.24805532934319807

In [49]:
prm.score(X_val, y_val)

0.1662392332039656

In [50]:
labels_train = np.where(y_train >= 3.5, "pos", "neg")
labels_val = np.where(prm.predict(X_train) >= 3.5, "pos", "neg")
(labels_train == labels_val).mean()

0.7647293746715712