In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from zipfile import ZipFile


In [6]:

# get the zip file
file = ZipFile('adjs.zip')

# extract the adjs.csv
df = pd.read_csv(file.open("adjs.csv"), encoding='latin1')

df

Unnamed: 0,Stars,Review,adjectives,has_great,has_good,has_nice,has_other,has_delicious,has_friendly,has_little,...,has_first,has_awesome,has_favorite,has_many,has_few,has_much,has_sure,has_next,has_happy,has_perfect
0,1,I got 'new' tires from them and within two wee...,"['flat', 'local', 'previous', 'new', 'resentfu...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,All I can say is the worst! We were the only 2...,"['only', 'electronic', 'fish', 'filthy', 'slim...",0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,I have been to this restaurant twice and was d...,"['first', 'empty', 'rude', 'Ridiculous', 'long...",0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,Food was NOT GOOD at all! My husband & I ate h...,"['first', 'huge', 'much', 'runny/watery', 'muc...",0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,3,This is a tiny Starbucks and it locations like...,"['tiny', 'good', 'nice', 'central', 'favorite'...",0,1,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33436,1,"Not worth the time, much less the price of adm...","['colorful', 'small', 'bitter', 'several', 'ex...",0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
33437,1,Just wanted to write a review to chip in with ...,"['little', 'worth', 'accurate', 'same', 'cool'...",0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
33438,4,I have been to the other Monks locations so I ...,"['other', 'disappointed', 'different', 'fish',...",1,1,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0
33439,2,Don't go here. I know you might want to try it...,"['good', 'positive', '..', 'right', 'only', 'f...",0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# initialize features
X   = df.iloc[:,3:].values
y   = df['Stars'].values     

y.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X,              #the input features
                                                    y,              #the label
                                                    test_size=0.3,  #set aside 30% of the data as the test set
                                                    random_state=7 #reproduce the result
                                                   )

In [14]:
# initialize Random Forest model and fit
rf = RandomForestClassifier(n_estimators=100, random_state=7, max_depth = 3)
rf.fit(X_train, y_train)

In [15]:
# predict
y_pred   = rf.predict(X_test)

In [16]:
# compute accuracy
accuracy_score(y_true = y_test, y_pred = y_pred)

0.31545898534835043

In [20]:
# initialize Decision Tree model and fit
dec_tree_clf = tree.DecisionTreeClassifier(max_depth = 3)
dec_tree_clf.fit(X_train, y_train)

In [21]:
# predict
y_pred = dec_tree_clf.predict(X_test);

In [22]:
# compute accuracy
accuracy_score(y_true = y_test, y_pred = y_pred)

0.2911392405063291

In [36]:
# number of neighbors is 10
k = 10

# initialize a knn_classifier
knn_classifier = KNeighborsClassifier(n_neighbors = k)

# construction of kfold object
kfold = StratifiedKFold(n_splits = 3, shuffle = True)

# allocate an empty array to store predictions in 
y_pred = np.empty_like(y)

for train_idx, test_idx in kfold.split(X, y):
    # build arrays which correspond to x, y train /test
    x_test = X[test_idx, :]
    x_train = X[train_idx, :]
    y_true_train = y[train_idx]
    


    # fit the classifier (use all available data)
    knn_classifier.fit(X, y)

    # estimate each review's star
    y_pred[test_idx] = knn_classifier.predict(x_test)

In [37]:
# compute accuracy
accuracy_score(y_true = y, y_pred = y_pred)

0.3704733710116324

##### HyperParameter Tuning for KNN as it had the best initial model accuracy

In [25]:
# list hyperparameters that we want to tune.
n_neighbors = list(range(1,30))

# cross validation method
cv = RepeatedStratifiedKFold(n_splits = 3, n_repeats=3, random_state=1)

# Convert to dictionary
hyperparameters = dict(n_neighbors = n_neighbors)

# Create new KNN object

knn_2 = KNeighborsClassifier()

# Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv = cv, scoring='accuracy')



In [26]:
# fit the model
best_model = clf.fit(X,y)

In [27]:
# print The value of best n_neighbors
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
print('The accuracy score was:', best_model.best_score_)

Best n_neighbors: 28
The accuracy score was: 0.30497493097295736


In [40]:
best_model.best_estimator_.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 28,
 'p': 2,
 'weights': 'uniform'}

In [44]:
# use best n_neighbors of 28
k = 28

# initialize a knn_classifier
knn_classifier = KNeighborsClassifier(n_neighbors = k)

# construction of kfold object
kfold = StratifiedKFold(n_splits = 3, shuffle = True)

# allocate an empty array to store predictions in 
y_pred = np.empty_like(y)

for train_idx, test_idx in kfold.split(X, y):
    # build arrays which correspond to x, y train /test
    x_test = X[test_idx, :]
    x_train = X[train_idx, :]
    y_true_train = y[train_idx]
    


    # fit the classifier (use all available data)
    knn_classifier.fit(X, y)

    # estimate each review's star
    y_pred[test_idx] = knn_classifier.predict(x_test)

In [45]:
# compute accuracy
accuracy_score(y_true = y, y_pred = y_pred)

0.35399659101103437