#### Aug. 9, 2018 1pm

In [None]:
import numpy   as np
import pandas  as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing   import scale, StandardScaler
from sklearn.decomposition   import PCA, TruncatedSVD

import pickle
from nltk.corpus import stopwords

import re

---

### Large Pickled Dataset



In [None]:
%%time
# Load the pickled review DataFrame (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code - only load this file if it comes
# from a trusted source.
df = pd.read_pickle("newRev_VegCols_US.pkl")

In [None]:
df['vegFriendly'].value_counts()

In [None]:
# Boolean row mask: True where the vegFriendly value equals 2.
two = df['vegFriendly'] == 2

In [None]:
%%time
# Recode vegFriendly == 2 as 1. The assignment is restricted to that single
# column with .loc: `df[two] = 1` would overwrite EVERY column of the matching
# rows with 1, including the review text used later for vectorization.
df.loc[two, 'vegFriendly'] = 1

In [None]:
# Target: the vegFriendly column. Features: every remaining column except the
# three veg-related columns.
y_df = df['vegFriendly']
X_df = df.drop(columns=['vegFriendly', 'Vegan', 'Vegetarian'])

In [None]:
X_df.info()

In [None]:
# Remove every run of digits from the review text. The column is addressed by
# name on both sides: a hard-coded position (5) silently writes to the wrong
# column if the column order of the DataFrame ever changes.
X_df['text'] = X_df['text'].apply( lambda rev: re.sub(r'(\d+)', '', rev) )

In [None]:
# Column addressed by name rather than by hard-coded position (5), so the
# write always lands on the column that was read.
# NOTE(review): without re.MULTILINE, ^ and $ anchor the whole string, so this
# pattern only matches reviews made up solely of ASCII letters (and blanks
# them); any review containing a space or punctuation is left untouched.
# Confirm this is the intended clean-up.
X_df['text'] = X_df['text'].apply( lambda rev: re.sub(r'^[a]*[a-zA-Z]*$', '', rev) )

In [None]:
# Drop underscores from the review text. Column addressed by name rather than
# by hard-coded position (5), so the write always lands on the column read.
X_df['text'] = X_df['text'].apply( lambda rev: rev.replace('_', '') )

In [None]:
# Hold out 30% of the rows for testing. The fixed seed makes the split - and
# every number computed downstream - reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.3, random_state=42)

In [None]:
print( X_train.shape, X_test.shape)

---

### Manipulate data sets

In [None]:
# Stop-word list: NLTK's English stop words followed by a few extra words
# that should also be ignored when vectorizing the reviews.
stopWords = [
    *stopwords.words('english'),
    'good', 'excellent', 'best', 'like', 'place', 'really', 'ordered', 'amazing', 'fantastic',
]

# TF-IDF vectorizer that skips every word in stopWords.
vectorizer = TfidfVectorizer(stop_words=stopWords)

#### Vectorize Review Text

In [None]:
%%time
# Fit the TF-IDF vocabulary on the training reviews only and build the
# training document-term matrix in the same pass.
X_train_term  = vectorizer.fit_transform(X_train['text'])

In [None]:
X_train_term.shape

In [None]:
# Vocabulary terms, in the column order of the term matrix.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    ftrs = list(vectorizer.get_feature_names_out())
else:
    ftrs = vectorizer.get_feature_names()

In [None]:
# Preview only: the full vocabulary is far too long to display usefully.
ftrs[:20]

#### Reduce Dimensionality

In [None]:
# Reduce the TF-IDF matrix to 30 components. TruncatedSVD uses a randomized
# solver by default, so a fixed seed is needed for repeatable components.
svd = TruncatedSVD(n_components=30, n_iter=7, random_state=42)

In [None]:
%%time
# Fit the SVD on the training term matrix and project the training rows onto
# the resulting components.
X_train_term_svd = svd.fit_transform(X_train_term)  

In [None]:
svd.singular_values_

In [None]:
# Plot the singular value of each SVD component, in component order.
s = svd.singular_values_

x = list(range(1, len(s) + 1))   # 1-based component numbers
plt.plot(x, s, marker='o')
# The x-axis holds component numbers; the singular values are on the y-axis.
plt.xlabel("Component")
plt.ylabel("Singular Value");

##### Percentage of Variance Explained

In [None]:
# Sum of the per-component explained-variance ratios, scaled to a percentage.
svd.explained_variance_ratio_.sum()*100

---

#### Random Forest - Train

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest using all CPU cores. Tree construction is randomized, so the
# seed is fixed to make the fitted model and its predictions repeatable.
RF_model = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
%%time
# Train the forest on the SVD-reduced training features.
RF_model.fit(X_train_term_svd, y_train)

---

#### Random Forest - Test

In [None]:
%%time
# transform() only: the test reviews are encoded with the vocabulary already
# fitted on the training set, never refitted.
X_test_term = vectorizer.transform(X_test['text'])

In [None]:
X_test_term.shape

In [None]:
%%time
# Project the test term matrix onto the components fitted on the training set.
X_test_term_svd = svd.transform(X_test_term)

In [None]:
X_test_term_svd.shape

In [None]:
%%time
# Predicted class for every test row.
pred = RF_model.predict(X_test_term_svd)

In [None]:
len(pred)

In [None]:
pred[:30]

In [None]:
np.unique(pred)

In [None]:
import itertools 
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
        Draw a confusion matrix as an annotated heat map on the current figure.

        Parameters
        ----------
        cm : array-like, 2-D
            Confusion matrix. Rows are plotted as true labels and columns as
            predicted labels.
        classes : sequence
            Tick labels, in the same order as the rows/columns of `cm`.
        normalize : bool, default False
            If True, every row is divided by its own sum before plotting.
            Rows whose sum is zero are shown as zeros instead of NaN.
        title : str
            Title of the plot.
        cmap : matplotlib colormap
            Colormap passed to `plt.imshow`.

        Prints which mode (normalized or not) was used; returns None.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; empty rows stay 0.0
        # rather than producing NaN and a RuntimeWarning.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts print as integers; anything else (normalized or a float
    # matrix passed in directly) prints with two decimals. The 'd' format
    # raises on floats, so the choice follows the dtype, not the flag.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the layout accounts for the axis labels as well.
    plt.tight_layout()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Take the label set from the fitted model rather than hard-coding it: the
# target is the vegFriendly column, so a fixed list of cuisine names does not
# match y_test and would yield an all-zero matrix or an error.
# classes_ is sorted, so the order is stable.
lbls = list(RF_model.classes_)

cm = confusion_matrix(y_test, pred, labels=lbls)

plot_confusion_matrix(cm,lbls)

In [None]:
cm

In [None]:
def _safe_ratio(num, den):
    """Return num / den, or 0.0 when den is zero (avoids NaN and a RuntimeWarning)."""
    return num / den if den else 0.0

# One-vs-rest breakdown of the multi-class confusion matrix `cm`:
#   conf_mtx[label] -> 2x2 array [[tn, fp], [fn, tp]]
#   metrics[label]  -> [accuracy, precision, recall, F1]
conf_mtx = dict()
metrics  = dict()
total = cm.sum()
for i in range(cm.shape[0]):
    tp = cm[i,i]
    fn = cm[i,:].sum() - tp      # rest of row i
    fp = cm[:,i].sum() - tp      # rest of column i
    tn = total - tp - fn - fp
    conf_mtx[ lbls[i] ] = np.array([[tn, fp], [fn, tp]])

    # A class that is absent from the test set or never predicted gives a zero
    # denominator; report 0.0 for that metric instead of NaN.
    acc = _safe_ratio(tp + tn, total)
    prc = _safe_ratio(tp, tp + fp)
    rec = _safe_ratio(tp, tp + fn)
    f1s = _safe_ratio(2*tp, 2*tp + fp + fn)
    metrics[lbls[i]] = [acc, prc, rec, f1s]

In [None]:
conf_mtx[lbls[0]]

In [None]:
conf_mtx

In [None]:
metrics

#### Random Forest - Retrain with Class Weights

In [None]:
# First draft of the weights (one {0, 1} dict per entry). This value is
# reassigned by the following cells before the model is built.
classWeight = [{0: 1, 1: 1}, {0: 1, 1: 2}, {0: 1, 1: 1}, {0: 1, 1: 4}, {0: 1, 1: 4}]

In [None]:
# Second draft: the same {0, 1} weight dicts keyed by name. Reassigned again
# by the next cell before the model is built.
classWeight = {'Chinese': {0: 1, 1: 1}, 'Indian': {0: 1, 1: 2}, 'Mexican': {0: 1, 1: 1}, 'Vegan': {0: 1, 1: 4}, 'Vegetarian': {0: 1, 1: 4}}
classWeight

In [None]:
# Final weights: one scalar weight per class name. This is the value passed to
# RandomForestClassifier(class_weight=...) below.
# NOTE(review): class_weight keys must be the labels that actually occur in
# y_train. y_train comes from the vegFriendly column, which earlier cells
# compare against 2 and recode to 1, so these cuisine-name keys may not match
# any label - confirm before fitting.
classWeight = {'Chinese': 1, 'Indian': 2, 'Mexican': 1, 'Vegan': 4, 'Vegetarian': 4}
classWeight

In [None]:
# Same forest as before, now with per-class weights. The seed is fixed so any
# change in the results comes from the weights and not from random variation
# between runs.
RF_model = RandomForestClassifier(n_jobs=-1, class_weight = classWeight, random_state=42)

In [None]:
# Train the class-weighted forest on the SVD-reduced training features.
RF_model.fit(X_train_term_svd, y_train)

In [None]:
# Overwrites `pred` with the class-weighted model's predictions for the test rows.
pred = RF_model.predict(X_test_term_svd)

In [None]:
# Take the label set from the fitted model rather than hard-coding it: the
# target is the vegFriendly column, so a fixed list of cuisine names does not
# match y_test and would yield an all-zero matrix or an error.
# classes_ is sorted, so the order is stable.
lbls = list(RF_model.classes_)

cm = confusion_matrix(y_test, pred, labels=lbls)

plot_confusion_matrix(cm,lbls)

#### Test Set: Class Balance

In [None]:
y_test.value_counts()

In [None]:
# Bar chart of class frequencies in the test set. The counts are computed once
# and the tick labels kept in a plot-local name, so `lbls` (the label order of
# the confusion matrix) is not overwritten with frequency-ordered labels.
test_counts = y_test.value_counts()
x = test_counts.values
plt.bar(range(len(x)), height=x, tick_label = list(test_counts.index));

#### Train Set: Class Balance

In [None]:
y_train.value_counts()

In [None]:
# Bar chart of class frequencies in the training set. The counts are computed
# once and the tick labels kept in a plot-local name, so `lbls` (the label
# order of the confusion matrix) is not overwritten with frequency-ordered labels.
train_counts = y_train.value_counts()
x = train_counts.values
plt.bar(range(len(x)), height=x, tick_label = list(train_counts.index));

---

### Train Data Prediction

In [None]:
# NOTE(review): `m_kmeans` is not defined anywhere in the visible notebook, so
# this cell raises NameError on Restart & Run All - confirm where the model is
# fitted (it looks like a leftover from a clustering notebook).
pred2 = m_kmeans.predict(X_train_term_svd)

In [None]:
# NOTE(review): `catPred` (and `m_kmeans`) are not defined in the visible
# notebook - confirm which module or cell provides them.
catPred(m_kmeans, y_train, pred2)

In [None]:
# NOTE(review): `top_pred_words` (and `m_kmeans`) are not defined in the
# visible notebook - confirm which module or cell provides them.
top_pred_words(m_kmeans, X_train_term, ftrs, pred2)

---

### Top Words for Cluster Centroids

In [None]:
# NOTE(review): `center_top_words`, `m_kmeans` and `m_svd` are not defined in
# the visible notebook (the SVD fitted above is named `svd`) - confirm which
# objects are meant here.
center_top_words(m_kmeans, m_svd, ftrs, 15)

In [None]:
# https://github.com/gSchool/dsi-solns-g69/blob/master/clustering/pair_part1_kmeans.py

In [None]:
pred

In [None]:
# Print out the reviews of a random sample of the restaurants assigned to each
# cluster to get a sense of the category.
# NOTE(review): only the header line is printed in this cell; no reviews are
# sampled or shown here.
# NOTE(review): `m_kmeans` is not defined in the visible notebook - confirm
# where it is fitted.

print("\nRandom sample of reviews in each cluster")
# Presumably transform() yields one column per cluster (distance to each
# centroid); the following cells take argmin over axis 1 of this result.
assigned_cluster = m_kmeans.transform(X_train_term_svd)

In [None]:
assigned_cluster[:5,:]

In [None]:
# Column index of the smallest value in each of the first 10 rows (presumably
# the nearest centroid). Stored under a new name: overwriting
# `assigned_cluster` replaced the 2-D matrix with a 1-D array, so re-running
# this cell or the `[:5,:]` preview cell raised IndexError.
nearest_cluster = assigned_cluster[:10,:].argmin(axis=1)
nearest_cluster

In [None]:
pred2[:10]

In [None]:
m_kmeans.cluster_centers_

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available. Only a short preview is displayed, since the full vocabulary is
# too long to read.
(vectorizer.get_feature_names_out()
 if hasattr(vectorizer, 'get_feature_names_out')
 else vectorizer.get_feature_names())[:20]

In [None]:
!pip install -U sklearn

In [None]:
import sklearn

In [None]:
sklearn.__version__

In [None]:
! pip install -U scikit-learn