In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import feather
import scipy as sp 
from time import time
from sklearn.manifold import spectral_embedding
from sklearn.cluster import spectral_clustering
from sklearn.cluster.spectral import SpectralClustering
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import *
import pickle
from sklearn.utils.graph import graph_laplacian
from sklearn.utils.arpack import eigsh
from sklearn.manifold.spectral_embedding_ import _set_diag
from scipy.linalg import eigvals, eigvalsh
import seaborn
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.grid_search import GridSearchCV
from sklearn import tree
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint

%matplotlib inline



In [2]:
def test_clf(grid_search, features, labels, parameters, iterations=100):
    from sklearn.metrics import classification_report
    precision, recall = [], []
    for iteration in range(iterations):
        features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=iteration)
        grid_search.fit(features_train, labels_train)
        predictions = grid_search.predict(features_test)
    print classification_report(labels_test, predictions)
    best_params = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '%s=%r, ' % (param_name, best_params[param_name])

In [3]:
label = pd.read_csv('../data/recomm_spectral_clustering_discretize.label.csv', index_col='biz_id')
label_checkin = pd.read_csv('../data/morethan5_discretize.label.csv', index_col='biz_id')
precom = pd.read_csv('../data/tfidf_all_biz_1percom.csv', index_col='biz_id')

In [4]:
more10_list = []
for indx in precom.index:
    if sum(precom.loc[indx,:] > 0) >= 10:
        if indx not in more10_list:
            more10_list.append(indx)

In [5]:
precom = precom.loc[more10_list, :]
precom2 = precom.copy()

In [6]:
for biz in label.index:
    precom.loc[biz, 'label' ] = label.label.loc[biz]
for biz in label_checkin.index:
    precom2.loc[biz, 'label' ] = label_checkin.label.loc[biz]

In [7]:
meat = ['meati', 'meat', 'steak', 'beef', 'pork', 'lamb', 'hamburg', 'kabob']
alcohol = ['beer', 'wine', 'cocktail', 'alcohol', 'martini', 'mimosa',
          'sangria', 'vodka', 'champagn', 'liquor', 'bourbon', 'tequilla', 'whiskey']

trans_alcohol = precom.loc[:, alcohol].mean(axis=1)
trans_meat = precom.loc[:, meat].mean(axis=1)

trans_alcohol2 = precom2.loc[:, alcohol].mean(axis=1)
trans_meat2 = precom2.loc[:, meat].mean(axis=1)

In [8]:
precom_na = precom.dropna()
features = precom_na.drop('label', axis=1).values
labels = precom_na.label.values

precom2_na = precom2.dropna()
features2 = precom2_na.drop('label', axis=1).values
labels2 = precom2_na.label.values

In [20]:
precom_reduced = precom.drop(alcohol + meat, axis=1 )
precom_reduced['alcohol'] = trans_alcohol.values
precom_reduced['meat'] = trans_meat.values
precom_reduced_na = precom_reduced.dropna()
features3 = precom_reduced_na.drop('label', axis=1)
labels3 = precom_reduced_na.label.values

precom2_reduced = precom2.drop(alcohol + meat, axis=1 )
precom2_reduced['alcohol'] = trans_alcohol2.values
precom2_reduced['meat'] = trans_meat2.values
precom2_reduced_na = precom2_reduced.dropna()
features4 = precom2_reduced_na.drop('label', axis=1)
labels4 = precom2_reduced_na.label.values

In [12]:
clf = tree.DecisionTreeClassifier()
t0= time()
parameters = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 10, 20],
               'max_depth': [None, 2, 5, 10],
               'min_samples_leaf': [1, 5, 10],
               'max_leaf_nodes': [None, 5, 10, 20]}
grid_search = GridSearchCV(clf, parameters)
for i in range(2,60):
    k_best = SelectKBest(k=i)
    k_features = k_best.fit_transform(features3, labels3)
    print "K best features with k = {0}".format(i)
    test_clf(grid_search, k_features, labels3, parameters, iterations=20)

print '\nDecision Tree  (Best k features):'

print '\nDecision Tree total time:{0}s'.format(round(time()-t0, 3))

K best features with k = 2
             precision    recall  f1-score   support

        0.0       0.26      0.86      0.40       297
        1.0       0.09      0.01      0.01       154
        2.0       0.37      0.23      0.28       172
        3.0       0.00      0.00      0.00       113
        4.0       0.00      0.00      0.00       153
        5.0       0.23      0.12      0.15       302
        6.0       0.00      0.00      0.00        56

avg / total       0.18      0.26      0.17      1247

criterion='gini', 
max_depth=None, 
max_leaf_nodes=10, 
min_samples_leaf=1, 
min_samples_split=10, 
K best features with k = 3
             precision    recall  f1-score   support

        0.0       0.26      0.77      0.39       297
        1.0       0.20      0.04      0.07       154
        2.0       0.33      0.15      0.21       172
        3.0       0.21      0.04      0.06       113
        4.0       0.11      0.03      0.05       153
        5.0       0.21      0.13      0.16     

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_jobs=4)
t0= time()
parameters = {'n_estimators': [10],
              'criterion': ['gini', 'entropy'],
              'max_depth': [None, 2, 5, 10], 
              'min_samples_split': [2, 10, 20],
              'min_samples_leaf': [1, 5, 10], 
              'max_leaf_nodes': [None, 5, 10, 20]}
grid_search = GridSearchCV(clf, parameters)
for i in [2,3,4,5]:
    k_best = SelectKBest(k=i)
    k_features = k_best.fit_transform(features, labels)
    print "K best features with k = {0}".format(i)
    test_clf(grid_search, k_features, labels, parameters, )

print '\nDecision Tree  (Best k features):'

print '\nDecision Tree total time:{0}s'.format(round(time()-t0, 3))

In [62]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
t0= time()
parameters = {'n_estimators':[10],
              'criterion': ['gini'],
              'max_depth': [10], 
              'min_samples_split': [10],
              'min_samples_leaf': [5], 
              'max_leaf_nodes': [None]}
grid_search = GridSearchCV(clf, parameters)
for i in [38]:
    k_best = SelectKBest(k=i)
    k_features = k_best.fit_transform(features4, labels4)
    print "K best features with k = {0}".format(i)
    test_clf(grid_search, k_features, labels4, parameters, iterations=1)

print '\nDecision Tree  (Best k features):'

print '\nDecision Tree total time:{0}s'.format(round(time()-t0, 3))

K best features with k = 38
             precision    recall  f1-score   support

        0.0       0.20      0.06      0.09       193
        1.0       0.36      0.90      0.51       385
        2.0       0.52      0.24      0.33       267
        3.0       0.00      0.00      0.00        25
        4.0       0.75      0.17      0.28       103
        5.0       1.00      0.02      0.05        81
        6.0       0.33      0.01      0.02       122

avg / total       0.44      0.38      0.29      1176

criterion='gini', 
max_depth=10, 
max_leaf_nodes=None, 
min_samples_leaf=5, 
min_samples_split=10, 
n_estimators=10, 

Decision Tree  (Best k features):

Decision Tree total time:0.283s


In [66]:
# k_features = vpd.read_csv('../report/k-best-user-visit.csv')
g = []
for c in features4.columns:
    for i in range(k_features.shape[1]):
        if np.array_equal(features4.loc[:, c].values, k_features[:, i]):
            g.append(c)
len(g)

38

In [16]:
a = pd.DataFrame(k_features)
a.to_csv('../report/k-best-rating.csv')

In [43]:
from sklearn.ensemble import AdaBoostClassifier
from time import time
clf = AdaBoostClassifier()
t0= time()
parameters = {'n_estimators': [10, 20, 40],
               'algorithm': ['SAMME', 'SAMME.R'],
               'learning_rate': [.5, 1, 1.5]}
grid_search = GridSearchCV(clf, parameters)
for i in [2,3,4,5]:
    k_best = SelectKBest(k=i)
    k_features = k_best.fit_transform(features, labels)
    print "K best features with k = {0}".format(i)
    test_clf(grid_search, k_features, labels, parameters)

print '\nAdaBoost (Best k features):'

print '\nAdaBoost total time:{0}s'.format(round(time()-t0, 3))



K best features with k = 2


NameError: global name 'start' is not defined

In [18]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
t0= time()
parameters = {}
grid_search = GridSearchCV(clf, parameters)
for i in range(2, 60):
    k_best = SelectKBest(k=i)
    k_features = k_best.fit_transform(features3, labels3)
    print "K best features with k = {0}".format(i)
    test_clf(grid_search, k_features, labels3, parameters)

print '\nGaussianNB  (Best k features):'

print '\nGaussianNB total time:{0}s'.format(round(time()-t0, 3))


K best features with k = 2
             precision    recall  f1-score   support

        0.0       0.24      0.94      0.38       282
        1.0       0.00      0.00      0.00       138
        2.0       0.33      0.13      0.18       175
        3.0       0.00      0.00      0.00       100
        4.0       0.00      0.00      0.00       155
        5.0       0.28      0.05      0.09       331
        6.0       0.00      0.00      0.00        66

avg / total       0.17      0.24      0.13      1247

K best features with k = 3
             precision    recall  f1-score   support

        0.0       0.24      0.92      0.38       282
        1.0       0.00      0.00      0.00       138
        2.0       0.34      0.10      0.16       175
        3.0       0.15      0.02      0.04       100
        4.0       0.00      0.00      0.00       155
        5.0       0.20      0.04      0.06       331
        6.0       0.00      0.00      0.00        66

avg / total       0.17      0.23      0.