In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import explained_variance_score
import pickle
import sebastian

In [36]:
# Location of the pre-cleaned review corpus, relative to the notebook's working directory.
import_path = r'./data/cleaned_reviews.csv'
# index_col=None: no CSV column is promoted to the index, so pandas assigns a
# fresh positional index.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, presumably
# an index that was written out when the file was saved - confirm before relying on it.
data = pd.read_csv(filepath_or_buffer=import_path, index_col=None)

In [37]:
# Preview the first five rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,Id,Text,Sentiment
0,1,bought several vitality canned dog food produc...,1.0
1,2,product arrived labeled jumbo salted peanut pe...,-1.0
2,3,confection around century light pillowy citrus...,0.5
3,4,looking secret ingredient robitussin believe f...,-0.5
4,5,great taffy great price wide assortment yummy ...,1.0


In [38]:
# Features: the review text column (kept as a pandas Series).
# Target: the numeric sentiment column, extracted as a NumPy array.
X, y = data['Text'], data['Sentiment'].values

In [40]:
# Hold out a test set for the final check of the tuned model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default when no size is given, spelled out
    random_state=1,  # fixed seed so the same rows land in each split on every run
)

In [41]:
# Helper object from the `sebastian` module (presumably project-local - confirm).
# It is used further down to format the grid search's best parameters for printing.
sebas = sebastian.Sebastian()

In [55]:
# Text-regression pipeline: TF-IDF features -> scaling -> linear model fitted
# with stochastic gradient descent.
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler()),
    # SGD shuffles the training samples every epoch, so an unseeded regressor gives
    # different scores and best parameters on each run. Use the same seed as the
    # train/test split so the search is repeatable.
    ('sgd', SGDRegressor(random_state=1))
])

# Search space; keys follow the '<step name>__<parameter>' convention.
params = {
    'tvec__stop_words': ['english'],
    'tvec__ngram_range': [(1, 1), (1, 2)],
    'tvec__max_df': [.3, .6, .9],
    'tvec__min_df': [1, 3, 7],
    'tvec__max_features': [2000, 3000, 4000],
    # TF-IDF output is sparse; StandardScaler raises on sparse input unless
    # centering is disabled, so only variance scaling is applied.
    'ss__with_mean': [False],
    'sgd__alpha': [1e-7, 1e-6, 1e-5, 1e-4],
}

# 2 * 3 * 3 * 3 * 4 = 216 candidates, each evaluated with 5-fold CV on all cores.
# No `scoring` is given, so candidates are ranked by the estimator's own score
# (R^2 for a regressor), and the best one is refit on the whole training set.
gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('ss', StandardScaler()),
                                       ('sgd', SGDRegressor())]),
             n_jobs=-1,
             param_grid={'sgd__alpha': [1e-07, 1e-06, 1e-05, 0.0001],
                         'ss__with_mean': [False],
                         'tvec__max_df': [0.3, 0.6, 0.9],
                         'tvec__max_features': [2000, 3000, 4000],
                         'tvec__min_df': [1, 3, 7],
                         'tvec__ngram_range': [(1, 1), (1, 2)],
                         'tvec__stop_words': ['english']})

In [56]:
# Mean cross-validated score of the winning candidate (R^2, the regressor default,
# because no `scoring` was passed to GridSearchCV).
print('best score:', gs.best_score_)

# Format the winning hyper-parameters with the sebastian helper, then emit a
# trailing blank line after the report.
formatted_params = sebas.get_params(gs.best_params_)
print('best params:', formatted_params, end='\n\n')

best score: 0.32600843646170485
best params: sgd: alpha=0.0001 ss: with_mean=False tvec: max_df=0.3, max_features=2000, min_df=3, ngram_range=(1, 2), stop_words='english'



In [57]:
# Predict sentiment for the held-out reviews. GridSearchCV delegates to the best
# pipeline, refit on the full training set.
y_pred = gs.predict(X_test)

In [58]:
# First ten predicted sentiment scores (continuous values from the regressor).
y_pred[:10]

array([0.18058466, 0.44550169, 0.95264411, 0.94779603, 0.78221566,
       0.47196871, 0.5900928 , 0.71497193, 0.70317484, 0.64448376])

In [59]:
# First ten held-out review texts, in the same order as the predictions.
X_test[:10]

566386    first thing note box smaller normal kraft mac ...
323054    love tea great aroma flavor given star poor pa...
506842    love love love tea even like licorice friend e...
113211    year old llasa apso picky treat buy many diffe...
57895     others said high quality gummy product cheapes...
332705    soup good promblem product alot sodium besides...
103325    good caramel flavor bitter perfect latte buy p...
456192    snack good taste like healthy snack satisfying...
338766    tried minute ago smell taste like medicine str...
325076    ounce pro treat beef liver treat great value d...
Name: Text, dtype: object

In [60]:
# True sentiment values for the first ten held-out reviews.
y_test[:10]

array([-0.5,  0.5,  1. ,  1. ,  1. ,  1. ,  1. ,  1. , -1. ,  1. ])

In [61]:
# Persist the refit best pipeline (vectorizer + scaler + regressor) so it can be
# reused without repeating the grid search. The context manager flushes and
# closes the file handle even if pickling raises; a bare open() left that to
# garbage collection.
with open(r'./data/tfid_sgd.pkl', 'wb') as f:
    pickle.dump(gs.best_estimator_, f)

In [62]:
# Reload the saved pipeline to check that it survives the pickle round trip.
# pickle.load can execute arbitrary code; acceptable here only because this
# notebook wrote the file itself.
with open(r'./data/tfid_sgd.pkl', mode='rb') as pkl_file:
    model = pickle.load(pkl_file)

In [63]:
# Score the held-out reviews with the reloaded pipeline.
y_pred2 = model.predict(X_test)

In [64]:
# First ten predictions from the reloaded pipeline; these should match the first
# ten values of y_pred if the round trip preserved the model.
y_pred2[:10]

array([0.18058466, 0.44550169, 0.95264411, 0.94779603, 0.78221566,
       0.47196871, 0.5900928 , 0.71497193, 0.70317484, 0.64448376])