In [None]:
import numpy   as np
import pandas  as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing   import scale, StandardScaler
from sklearn.decomposition   import PCA, TruncatedSVD

import pickle

In [None]:
def top_pred_words(kmObj, X_df, ftrs_list, pred, num_reviews=5, num_words=5): 
    '''
        For each cluster, print the highest-weighted terms of a random sample
        of the reviews assigned to that cluster.

        input:
        kmObj - a kmeans object; only get_params()['n_clusters'] is read
        X_df - row-indexable sparse term matrix whose rows expose .indices
               and .data (e.g. a scipy CSR matrix), one row per review
        ftrs_list - feature (term) names, indexable by column number
        pred - predicted cluster index for each row of X_df
        num_reviews - maximum number of reviews sampled per cluster
        num_words - number of top terms printed per review
    '''

    numClusters = kmObj.get_params()['n_clusters']
    pred = np.asarray(pred)

    for i in range(numClusters):
        # row numbers of the reviews assigned to cluster i
        rev_ind = np.flatnonzero(pred == i)

        print("\nCluster ", i, " =====")
        if rev_ind.size == 0:
            # nothing to sample from an empty cluster
            continue

        # sampling without replacement cannot draw more rows than exist
        n_samples = min(num_reviews, rev_ind.size)
        sample_reviews = np.random.choice(rev_ind, n_samples, replace=False)

        for review in sample_reviews:
            row = X_df[review, :]

            # positions of the stored values, largest weight first
            top = row.data.argsort()[::-1][:num_words]

            terms = [ftrs_list[w] for w in row.indices[top]]
            print(terms)

In [None]:
def center_top_words(kmeans_model, svd_model, ftrs, num_words=10):
    '''
        Map every k-means centroid back into 'word space' with the inverse
        SVD transformation and print the num_words terms whose weights have
        the largest absolute value, strongest first.

        input:
        kmeans_model - kmeans model object (cluster_centers_ is read)
        svd_model - svd model object (inverse_transform is called)
        ftrs - the features from the TFIDF vectorizer
        num_words - number of words to display
    '''

    vocabulary = np.array(ftrs)
    word_space_centers = svd_model.inverse_transform(kmeans_model.cluster_centers_)

    # stop index for the reversed slice that keeps the last num_words entries
    stop = -(num_words + 1)

    for center in word_space_centers:
        ranked = np.argsort(np.abs(center))
        strongest = ranked[-1:stop:-1]
        print(vocabulary[strongest], "\n")

In [None]:
def catPred(kmObj, gTrue, pred): 
    '''
        Draw one bar chart per cluster showing the proportion of each true
        category among the samples assigned to that cluster.

        input:
        kmObj - a kmeans object; only get_params()['n_clusters'] is read
        gTrue - true categories (ground truth), one per sample; must accept
                a boolean mask as an index
        pred - predicted cluster index for each sample, aligned with gTrue
    '''
    numClusters = kmObj.get_params()['n_clusters']
    pred = np.asarray(pred)

    # squeeze=False keeps the axes container 2-D, so indexing also works
    # when there is a single cluster
    fig, ax = plt.subplots(nrows=1, ncols=numClusters, sharey=True,
                           figsize=(50,12), squeeze=False)
    ax = ax[0]

    for i in range(numClusters):
        mask = (pred == i)
        lbl, counts = np.unique(gTrue[mask], return_counts=True)
        ht = counts / counts.sum()
        ax[i].bar(range(len(lbl)), height=ht, tick_label=lbl)
        ax[i].tick_params(axis='x', labelsize=20)
        ax[i].set_title(i, fontsize=20)

    # Figure-level decoration is done once, after the loop. Only the font
    # size of the y tick labels is changed, so they always show the real
    # tick values instead of a hard-coded list.
    ax[0].tick_params(axis='y', labelsize=20)
    ax[0].set_ylabel("Proportion", fontsize=32)
    fig.suptitle("Proportion of Categories in each Cluster", fontsize=32)

---

### Load Pickled Dataset - Cuisines

In [None]:
# Five cuisines, restricted to data from US states only
# - trying to minimize the number of reviews written in a foreign language
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.

df = pd.read_pickle('fiveCuisine.pkl')

In [None]:
# Separate the target label column from the remaining predictor columns.
y_df = df['category']
X_df = df.drop(columns='category')

In [None]:
# Hold out 30% of the reviews; the fixed seed keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.3, random_state=42)

In [None]:
# Sanity check: dimensions of the train and test partitions.
print( X_train.shape, X_test.shape)

In [None]:
# Bar chart of the number of reviews in each category.
category_counts = df['category'].value_counts()
x = category_counts.values
lbls = category_counts.index.tolist()

plt.figure(figsize=(16,8))
plt.bar(range(len(x)), height=x, tick_label=lbls);

---

### Manipulate data sets

#### Vectorize Review Text

In [None]:
# use regex operations to remove numbers (dimension) from vectorization


In [None]:
# TF-IDF vectorizer configured to drop English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on the training reviews only, so the test split does not influence
# the fitted vectorizer. termTrans is the object used for all later transforms.
termTrans = vectorizer.fit(X_train['text'])

In [None]:
# Vectorize the training reviews with the fitted transformer.
X_train_term  = termTrans.transform(X_train['text'])

In [None]:
# Dimensions of the vectorized training matrix.
print( X_train_term.shape )

In [None]:
# Gather the features (terms) USED for vectorization.
# The fitted transformer is named termTrans; get_feature_names() no longer
# exists in recent scikit-learn, so use its replacement when it is available.
if hasattr(termTrans, "get_feature_names_out"):
    ftrs = termTrans.get_feature_names_out()
else:
    ftrs = termTrans.get_feature_names()

In [None]:
#### Reduce Dimensionality

In [None]:
# Reduce to 100 components; the fixed seed makes the randomized solver repeatable.
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)

In [None]:
# Fit the SVD on the training term matrix and keep the reduced representation.
X_svd = svd.fit_transform(X_train_term)  

In [None]:
# Singular values of the fitted SVD, displayed as the cell output.
s = svd.singular_values_
s

In [None]:
# Plot each singular value against its 1-based position.
x = list(range(1, len(s) + 1))
plt.plot(x, s, marker='o')
plt.xlabel("Singular Values")
plt.ylabel("Value");

In [None]:
# Total share of variance captured by the SVD components, as a percentage.
vExp = svd.explained_variance_ratio_.sum()*100
print('Variance Explained: ', vExp)

In [None]:
---

### Apply predictive model to data sets

In [None]:
from sklearn.cluster import KMeans

# The n_jobs argument is no longer accepted by KMeans in current scikit-learn,
# so it is omitted. The fixed seed makes the clustering repeatable.
kmeans = KMeans(n_clusters=5, random_state=42)

In [None]:
# Fit the clusters on the SVD-reduced training data.
kmeans.fit(X_svd)

In [None]:
# Cluster assignment for every training review, computed here because it is needed below.
train_pred = kmeans.predict(X_svd)
top_pred_words(kmeans, X_train_term, ftrs, train_pred)

In [None]:
# Push the held-out reviews through the already-fitted vectorizer and SVD,
# then assign each one to a cluster before comparing with the true categories.
X_test_term = termTrans.transform(X_test['text'])
test_pred = kmeans.predict(svd.transform(X_test_term))
catPred(kmeans, y_test, test_pred)

In [None]:
# The test term matrix must be paired with the test-set cluster assignments;
# both are derived here from the already-fitted vectorizer, SVD and k-means.
X_test_term = termTrans.transform(X_test['text'])
test_pred = kmeans.predict(svd.transform(X_test_term))
top_pred_words(kmeans, X_test_term, ftrs, test_pred)

In [None]:
# Show the 15 strongest terms for each cluster centroid.
center_top_words(kmeans, svd, ftrs, 15)