In [1]:
%matplotlib inline

import pandas as pd

pd.set_option('display.max_colwidth', 120)

In [2]:
wine_df_full = pd.read_csv('data/wine_reviews.csv')

# let us reduce down our dataset so that it more manageable. 
wine_df = wine_df_full.sample(n = 10000)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
pipeline = Pipeline([
                     ('vectorizer' , CountVectorizer()),
                     ('classifier' , KNeighborsClassifier())
                    ])

parameters = {'vectorizer__max_features' : [300, 500, 700],
              'classifier__n_neighbors' : [2,3, 5] }



In [8]:
grid        = GridSearchCV(pipeline,
                           parameters,
                           return_train_score = True,
                           cv = 3,
                           n_jobs = -1, 
                           verbose = 2)

In [9]:
grid.fit(wine_df['description'], wine_df['rating'])

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=300, total=   4.3s
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=300, total=   4.8s
[CV]  classifier__n_neighbors=2, vectorizer__max_features=300, total=   4.6s
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=500, total=   4.7s
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=500, t

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vectorizer__max_features': [300, 500, 700], 'classifier__n_neighbors': [2, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [10]:
pd.DataFrame(grid.cv_results_).sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_classifier__n_neighbors,param_vectorizer__max_features,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.650011,3.911422,0.729,0.9549,2,300,"{'classifier__n_neighbors': 2, 'vectorizer__max_features': 300}",1,0.740552,0.950945,0.727273,0.958452,0.719172,0.955302,0.005341,0.207406,0.008814,0.003078
1,0.663731,3.014868,0.7212,0.96905,2,500,"{'classifier__n_neighbors': 2, 'vectorizer__max_features': 500}",2,0.724655,0.967897,0.726373,0.971651,0.712571,0.967602,0.024517,0.748231,0.006141,0.001843
2,0.631577,2.498195,0.7166,0.9726,2,700,"{'classifier__n_neighbors': 2, 'vectorizer__max_features': 700}",3,0.726155,0.971497,0.714671,0.972551,0.708971,0.973751,0.011422,0.080301,0.007147,0.000921
3,0.685521,2.450972,0.7035,0.8046,3,300,"{'classifier__n_neighbors': 3, 'vectorizer__max_features': 300}",4,0.711758,0.80453,0.705071,0.80891,0.693669,0.80036,0.045753,0.05247,0.007468,0.003491
6,0.684377,2.984071,0.6984,0.7599,5,300,"{'classifier__n_neighbors': 5, 'vectorizer__max_features': 300}",5,0.707558,0.761026,0.69757,0.764212,0.690069,0.754462,0.029646,0.070882,0.007164,0.004059
4,0.676721,2.59692,0.6824,0.77825,3,500,"{'classifier__n_neighbors': 3, 'vectorizer__max_features': 500}",6,0.688062,0.780978,0.686169,0.776361,0.672967,0.777411,0.068777,0.043217,0.006714,0.001976
7,0.6976,2.933428,0.6741,0.7287,5,500,"{'classifier__n_neighbors': 5, 'vectorizer__max_features': 500}",7,0.679964,0.724572,0.678368,0.732263,0.663966,0.729264,0.013292,0.168813,0.007195,0.003165
5,0.721103,2.678074,0.6727,0.7648,3,700,"{'classifier__n_neighbors': 3, 'vectorizer__max_features': 700}",8,0.675165,0.764326,0.675968,0.769612,0.666967,0.760462,0.07141,0.134303,0.004067,0.00375
8,0.589188,2.520334,0.6653,0.71335,5,700,"{'classifier__n_neighbors': 5, 'vectorizer__max_features': 700}",9,0.666167,0.711671,0.672967,0.716514,0.656766,0.711864,0.104428,0.161456,0.006642,0.002239


In [11]:
best_pipeline = grid.best_estimator_


In [12]:
best_pipeline.get_params()

{'classifier': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=2, p=2,
            weights='uniform'),
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': 1,
 'classifier__n_neighbors': 2,
 'classifier__p': 2,
 'classifier__weights': 'uniform',
 'memory': None,
 'steps': [('vectorizer',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=300, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('classifier',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
              metric_params=None, n_job

In [13]:
train_prediction = best_pipeline.predict(wine_df['description'])

print(classification_report(train_prediction, wine_df['rating']))

             precision    recall  f1-score   support

       High       1.00      0.90      0.95      4390
        Low       0.93      1.00      0.96      5610

avg / total       0.96      0.96      0.96     10000



In [14]:
test_sample = wine_df_full.sample(n = 10000, replace=False)

test_prediction = best_pipeline.predict(test_sample['description'])

print(classification_report(test_prediction, test_sample['rating']))

             precision    recall  f1-score   support

       High       0.60      0.77      0.68      3092
        Low       0.88      0.77      0.82      6908

avg / total       0.80      0.77      0.78     10000



In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

In [16]:
pipeline = Pipeline([
                     ('vectorizer' , CountVectorizer()),
                     ('tfidf'      , TfidfTransformer()),
                     ('classifier' , KNeighborsClassifier())
                    ])

parameters = {'vectorizer__max_features' : [250, 300, 350],
              'vectorizer__stop_words'   : ['english', None],
              'tfidf__use_idf'           : [True, False],
              'classifier__n_neighbors'  : [2, 3] }




In [17]:
grid        = GridSearchCV(pipeline,
                           parameters,
                           return_train_score = True,
                           cv = 3,
                           n_jobs = -1, 
                           verbose = 2)

In [18]:
grid.fit(wine_df['description'], wine_df['rating'])

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.3s
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=eng

[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min


[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.9s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   3.8s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3,

[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.0s
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.5s
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.5s
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.7s


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vectorizer__max_features': [250, 300, 350], 'vectorizer__stop_words': ['english', None], 'tfidf__use_idf': [True, False], 'classifier__n_neighbors': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [19]:
pd.DataFrame(grid.cv_results_).sort_values(by = 'rank_test_score')


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_classifier__n_neighbors,param_tfidf__use_idf,param_vectorizer__max_features,param_vectorizer__stop_words,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
17,1.170683,4.161584,0.7867,0.89755,3,True,350,,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': None}",1,0.788542,0.89919,0.787579,0.895155,0.783978,0.898305,0.100817,0.107977,0.001964,0.001732
15,0.965019,4.175408,0.7812,0.8922,3,True,300,,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': None}",2,0.781944,0.887339,0.783378,0.892305,0.778278,0.896955,0.119842,0.236494,0.002148,0.003927
16,0.928742,3.072961,0.7773,0.88515,3,True,350,english,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': 'e...",3,0.766647,0.890489,0.782778,0.883156,0.782478,0.881806,0.073366,0.057893,0.007535,0.003815
14,0.944672,3.026088,0.7748,0.8872,3,True,300,english,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': 'e...",4,0.763047,0.892439,0.777678,0.885256,0.783678,0.883906,0.016476,0.507439,0.008665,0.003745
12,0.990014,2.508165,0.7735,0.8815,3,True,250,english,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': 'e...",5,0.762747,0.883438,0.775578,0.879556,0.782178,0.881506,0.109654,0.137139,0.008068,0.001585
13,0.830728,4.362559,0.7721,0.88935,3,True,250,,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': None}",6,0.775345,0.888539,0.764776,0.887356,0.776178,0.892155,0.091845,0.802333,0.005189,0.002042
22,0.727,2.340584,0.7619,0.87875,3,False,350,english,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': '...",7,0.754649,0.883738,0.762076,0.878656,0.768977,0.873856,0.043127,0.072686,0.005851,0.004035
20,0.843213,3.285683,0.7576,0.87725,3,False,300,english,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': '...",8,0.745651,0.882238,0.755476,0.877756,0.771677,0.871756,0.089618,0.071229,0.010731,0.004294
4,0.578924,2.23731,0.7527,0.9092,2,True,350,english,"{'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': 'e...",9,0.74955,0.911491,0.762376,0.909255,0.746175,0.906855,0.01835,0.628051,0.006979,0.001893
18,0.987835,3.083754,0.7498,0.87475,3,False,250,english,"{'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': '...",10,0.75015,0.879238,0.743174,0.871156,0.756076,0.873856,0.182437,0.24311,0.005272,0.003359


In [21]:
best_pipeline = grid.best_estimator_

print('Train fit')
train_prediction = best_pipeline.predict(wine_df['description'])

print(classification_report(train_prediction, wine_df['rating']))

print('Test fit')
test_sample = wine_df_full.sample(n = 10000, replace=False)

test_prediction = best_pipeline.predict(test_sample['description'])

print(classification_report(test_prediction, test_sample['rating']))

Train fit
             precision    recall  f1-score   support

       High       0.84      0.90      0.87      3716
        Low       0.94      0.90      0.92      6284

avg / total       0.90      0.90      0.90     10000

Test fit
             precision    recall  f1-score   support

       High       0.70      0.77      0.74      3602
        Low       0.86      0.82      0.84      6398

avg / total       0.81      0.80      0.80     10000

