#### Aug. 10, 2018 320pm

In [2]:
import numpy   as np
import pandas  as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition   import PCA, TruncatedSVD

import pickle
from nltk.corpus import stopwords

import re

---

### Large Pickled Dataset



In [3]:
%%time
df = pd.read_pickle("newRev_VegCols_US.pkl")

CPU times: user 820 ms, sys: 1.36 s, total: 2.18 s
Wall time: 2.18 s


In [None]:
# df['vegFriendly'].value_counts()

# df['useful'].value_counts()[1:10]

# df['useful'].value_counts()[1:].sum() / df['useful'].value_counts().sum()

# df['useful'].value_counts()[1:].sum() 

In [5]:
X_df = df.drop(['vegFriendly', 'Vegan', 'Vegetarian'], axis=1)
y_df = df['vegFriendly']

In [6]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923002 entries, 0 to 923001
Data columns (total 17 columns):
business_id    923002 non-null object
name           923002 non-null object
review_id      923002 non-null object
stars          923002 non-null int64
date           923002 non-null datetime64[ns]
text           923002 non-null object
useful         923002 non-null int64
funny          923002 non-null int64
cool           923002 non-null int64
city           923002 non-null object
state          923002 non-null object
Thai           923002 non-null float64
Italian        923002 non-null float64
Indian         923002 non-null float64
Chinese        923002 non-null float64
Mexican        923002 non-null float64
Text_length    923002 non-null int64
dtypes: datetime64[ns](1), float64(5), int64(5), object(6)
memory usage: 119.7+ MB


In [7]:
%%time
X_df.iloc[:, 5] = X_df['text'].apply( lambda rev: re.sub(r'(\d+)', '', rev) )

CPU times: user 16.4 s, sys: 88 ms, total: 16.5 s
Wall time: 16.5 s


In [8]:
%%time
X_df.iloc[:, 5] = X_df['text'].apply( lambda rev: rev.replace('_', '') )

CPU times: user 664 ms, sys: 0 ns, total: 664 ms
Wall time: 664 ms


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.3)

In [10]:
print( X_train.shape, X_test.shape)

(646101, 17) (276901, 17)


---

### Manipulate data sets

In [11]:
stopWords = list(stopwords.words('english'))
stopWords.extend(['good', 'excellent', 'best', 'like', 'place', 'really', 'ordered', 'amazing', 'fantastic', 'am', 'pm'])

In [12]:
vectorizer = TfidfVectorizer(stop_words=stopWords)

#### Vectorize Review Text

In [13]:
%%time
X_train_term  = vectorizer.fit_transform(X_train['text'])

CPU times: user 43.8 s, sys: 5.1 s, total: 48.9 s
Wall time: 48.9 s


In [14]:
X_train_term.shape

(646101, 141462)

In [15]:
ftrs = vectorizer.get_feature_names()

In [16]:
ftrs

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaa',
 'aaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaauuuuuuuuuuuuuuuuggggggggggggggggggg',
 'aaaaaaaaaaaaaand',
 'aaaaaaaaaaaaaawwwwwwwwwwwwwwwwwwwwww',
 'aaaaaaaaaaaand',
 'aaaaaaaaaaah',
 'aaaaaaaaaah',
 'aaaaaaaaaand',
 'aaaaaaaack',
 'aaaaaaaahhhhhhhhh',
 'aaaaaaaahhhhmazing',
 'aaaaaaaamazing',
 'aaaaaaaand',
 'aaaaaaaawesooooooooooooome',
 'aaaaaaah',
 'aaaaaaahhhhhh',
 'aaaaaaall',
 'aaaaaaamaaaaazing',
 'aaaaaaamazing',
 'aaaaaaammmmazing',
 'aaaaaaand',
 'aaaaaaggghhh',
 'aaaaaahhhhh',
 'aaaaaahhhhhmazing',
 'aaaaaamazing',
 'aaaaaand',
 'aaaaaannnddd',
 'aaaaaannnndddd',
 'aaaaaawesome',
 'aaaaah',
 'aaaaahhh',
 'aaaaahhhhhh',
 'aaaaahhhhmaaazeballs',
 'aaaaamazing',
 'aaaaammmaazzzinnngg',
 'aaaaand',
 'aaaaarrrggghh',
 'aaaaawful',
 'aaaaay',
 'aaaack',
 'aaaah',
 'aaaahh',
 'aaaahhh',
 'aaaahhhh',
 'aaaahhhhhhh',
 'aaaajúa',
 'aaaall',
 'aaaalllllllll',
 'aaaallright',
 'aaaalmost',
 'aaaamaaazing',
 'aaaamazing',
 'aaaamazzzzing',
 'aaaa

---

#### Gaussian NB - Train

In [None]:
>>> clf = GaussianNB()
>>> clf.fit(X, Y)
GaussianNB(priors=None)
>>> print(clf.predict([[-0.8, -1]]))
[1]
>>> clf_pf = GaussianNB()
>>> clf_pf.partial_fit(X, Y, np.unique(Y))
GaussianNB(priors=None)
>>> print(clf_pf.predict([[-0.8, -1]]))

In [17]:
from sklearn.naive_bayes import GaussianNB

In [18]:
%%time
G_NB_model = GaussianNB()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.2 µs


In [20]:
%%time
G_NB_model.fit(X_train_term, y_train)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

---

#### Random Forest - Test

In [21]:
%%time
X_test_term = vectorizer.transform(X_test['text'])

CPU times: user 20.9 s, sys: 128 ms, total: 21 s
Wall time: 22.6 s


In [22]:
X_test_term.shape

(276901, 141462)

In [None]:
%%time
pred = G_NB_model.predict(X_test_term)

In [None]:
len(pred)

In [None]:
pred[:30]

In [None]:
np.unique(pred)

In [None]:
y_test[:10]

In [None]:
pred[:10]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [None]:
lbls = ['vegFriendly', 'Non-vegFriendly']

cm = confusion_matrix(y_test, pred)
cm

In [None]:
print(classification_report(y_test, pred))

In [None]:
accuracy_score(y_test, pred)

In [None]:
precision_score(y_test, pred)

In [None]:
recall_score(y_test, pred)

In [None]:
f1_score(y_test, pred)

In [None]:
plot_confusion_matrix(cm,lbls)

In [None]:
importances = RF_model.feature_importances_

importances

In [None]:
indices = np.argsort(importances)[::-1]
indices

In [None]:
indices = indices.reshape(1,80)

In [None]:
important_words = svd.inverse_transform(np.array(indices))

In [None]:
important_words.shape

In [None]:
iw_ind = np.argsort(important_words)[::-1]
iw_ind

In [None]:
stop = 100
cnt = 0
for ind in iw_ind[0]:
    print(ftrs[ind])
    if cnt == stop:
         break
    cnt += 1

In [None]:
pred[:10]

In [None]:
roc_auc_score(y_test, y)

In [None]:
y_test[:10]

In [None]:
y_score = RF_model.predict_proba(X_test_term_svd)[:,1]

In [None]:
y_score[:10]

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=1)

In [None]:
plt.plot(fpr, tpr, 'o--')

In [None]:
roc_auc_score(y_test, y_score)

In [None]:
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)

indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
import itertools 
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
    """
    
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

#     print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')