In [49]:
import nltk
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import csv
import pandas as pd
import unicodedata
import spacy
from collections import Counter
import model_evaluation_utils as meu

In [2]:
# Import dataset (already have isolated the two columns for this analysis)
wine = pd.read_csv('winemag_filt.csv')

# Filter for 15 most popular wine varieties in the US
top15 = ['Pinot Noir','Chardonnay','Cabernet Sauvignon',
         'Red Blend','Riesling','Sauvignon Blanc','Syrah',
         'Rose','Merlot','Zinfandel','Malbec', 'White Blend',
        'Pinot Gris','Pinot Grigio','Shiraz','Moscato', 'Muscat']

wine = wine[wine.variety.isin(top15)]

# Take a random sample of 10000 observations from that filtered set
wine_samp = wine.sample(n=10000, random_state=27)

# Format columns as lists
corpus = wine_samp['description'].values.tolist()
labels = wine_samp['variety'].values.tolist()

In [3]:
# Accented char function
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [4]:
# Special char function
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [10]:
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)

# Lemmatization function (version of stemming that maintains English spellings)
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [11]:
# Tokenizer function
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [12]:
# All the functions together now!
def normalize_corpus(corpus,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [13]:
# Applying to the data
norm_corpus = normalize_corpus(corpus)

In [14]:
# Making training and test sets
train_corpus, test_corpus, train_label_names, test_label_names = train_test_split(norm_corpus, labels, test_size=0.3, random_state=27)

In [15]:
# Table of distribution of varities in test, training datasets
trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd], 
             columns=['Target Label', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'],
             ascending=False))

Unnamed: 0,Target Label,Train Count,Test Count
3,Pinot Noir,1267,517
0,Chardonnay,1083,480
8,Cabernet Sauvignon,870,361
1,Red Blend,803,368
6,Riesling,456,198
4,Sauvignon Blanc,440,191
7,Syrah,389,193
2,Rose,329,141
13,Merlot,280,104
5,Zinfandel,273,96


In [17]:
# build Bag of Words features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

# transform test articles into features
cv_test_features = cv.transform(test_corpus)

In [18]:
# Naive Bayes model!
mnb = MultinomialNB(alpha=1)
mnb.fit(cv_train_features, train_label_names)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)

# CV Accuracy (5-fold): [0.6059744  0.60271041 0.6152748  0.62517883 0.60086145]
# Mean CV Accuracy: 0.6099999779715365
# Test Accuracy: 0.624

CV Accuracy (5-fold): [0.6059744  0.60271041 0.6152748  0.62517883 0.60086145]
Mean CV Accuracy: 0.6099999779715365
Test Accuracy: 0.624


In [25]:
# Logistic Regression model!
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=27)
lr.fit(cv_train_features, train_label_names)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

# CV Accuracy (5-fold): [0.67923186 0.68972896 0.70164168 0.68383405 0.70136396]
# Mean CV Accuracy: 0.6911601035790784
# Test Accuracy: 0.704



CV Accuracy (5-fold): [0.67923186 0.68972896 0.70164168 0.68383405 0.70136396]
Mean CV Accuracy: 0.6911601035790784
Test Accuracy: 0.704


In [26]:
# LinearSVC model!
svm = LinearSVC(penalty='l2', C=1, random_state=27)
svm.fit(cv_train_features, train_label_names)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

# CV Accuracy (5-fold): [0.61806543 0.64051355 0.65167737 0.62947067 0.65613783]
# Mean CV Accuracy: 0.639172972726904
# Test Accuracy: 0.6533333333333333

CV Accuracy (5-fold): [0.61806543 0.64051355 0.65167737 0.62947067 0.65613783]
Mean CV Accuracy: 0.639172972726904
Test Accuracy: 0.6533333333333333


In [21]:
# SDGClassifier!
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=27)
svm_sgd.fit(cv_train_features, train_label_names)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)



CV Accuracy (5-fold): [0.61379801 0.64051355 0.6359743  0.6316166  0.64393396]
Mean CV Accuracy: 0.6331672830598999
Test Accuracy: 0.6723333333333333


In [24]:
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=10, random_state=27)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

# CV Accuracy (5-fold): [0.53769559 0.56847361 0.55817273 0.57081545 0.55994257]
# Mean CV Accuracy: 0.5590199907710363
# Test Accuracy: 0.5706666666666667

CV Accuracy (5-fold): [0.53769559 0.56847361 0.55817273 0.57081545 0.55994257]
Mean CV Accuracy: 0.5590199907710363
Test Accuracy: 0.5706666666666667


In [29]:
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(n_estimators=10, random_state=27)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

# CV Accuracy (5-fold): [0.58392603 0.59201141 0.61456103 0.5758226  0.5965542 ]
# Mean CV Accuracy: 0.5925750549377549
# Test Accuracy: 0.5923333333333334

CV Accuracy (5-fold): [0.58392603 0.59201141 0.61456103 0.5758226  0.5965542 ]
Mean CV Accuracy: 0.5925750549377549
Test Accuracy: 0.5923333333333334


In [31]:
# build BOW features on train articles with TFIDF
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

# transform test articles into features
tv_test_features = tv.transform(test_corpus)

# Now I'm going to re-run all of those models with this new set of features!

In [32]:
mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.46443812 0.47289586 0.47965739 0.472103   0.47738693]
Mean CV Accuracy: 0.4732962623862306
Test Accuracy: 0.48933333333333334


In [33]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=27)
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)



CV Accuracy (5-fold): [0.6514936  0.63623395 0.67594575 0.66595136 0.64895908]
Mean CV Accuracy: 0.6557167487195439
Test Accuracy: 0.672


In [34]:
# Second most accurate model
svm = LinearSVC(penalty='l2', C=1, random_state=27)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.67923186 0.68045649 0.70877944 0.69599428 0.68772434]
Mean CV Accuracy: 0.6904372821859247
Test Accuracy: 0.7126666666666667


In [35]:
# Returns the most accurate model
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=27)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)



CV Accuracy (5-fold): [0.68278805 0.68616262 0.71734475 0.69027182 0.68269921]
Mean CV Accuracy: 0.6918532913993541
Test Accuracy: 0.7136666666666667


In [36]:
rfc = RandomForestClassifier(n_estimators=10, random_state=27)
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.54623044 0.54564907 0.57316203 0.5658083  0.55348169]
Mean CV Accuracy: 0.5568663065194284
Test Accuracy: 0.545


In [37]:
gbc = GradientBoostingClassifier(n_estimators=10, random_state=27 )
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.57041252 0.58701854 0.60314061 0.57653791 0.58650395]
Mean CV Accuracy: 0.5847227072357699
Test Accuracy: 0.598


In [38]:
# Making a pretty table of all the results
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score, 
               mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
              ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, 
               lr_tfidf_cv_mean_score, lr_tfidf_test_score],
              ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score, 
               svm_tfidf_cv_mean_score, svm_tfidf_test_score],
              ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, 
               svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
              ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score, 
               rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
              ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, 
               gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
             columns=['Model', 'CV Score (TF)', 'Test Score (TF)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
             ).T

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
CV Score (TF),0.61,0.69116,0.639173,0.633167,0.55902,0.592575
Test Score (TF),0.624,0.704,0.653333,0.672333,0.570667,0.592333
CV Score (TF-IDF),0.473296,0.655717,0.690437,0.691853,0.556866,0.584723
Test Score (TF-IDF),0.489333,0.672,0.712667,0.713667,0.545,0.598


In [41]:
# Fiddling with parameters of SGDCClassifier to optimize model
sgd_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('sgd', SGDClassifier(random_state=27 ))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'sgd__alpha': [1e-7, 1e-6, 1e-5, 1e-4]
}

gs_sgd = GridSearchCV(sgd_pipeline, param_grid, cv=5, verbose=2)
gs_sgd = gs_sgd.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 2), total=   0.5s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 2), total=   0.6s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 2), total=   0.5s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   21.6s finished


In [42]:
# Testing the "best parameters"
gs_sgd.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
          early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
          l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
          n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
          power_t=0.5, random_state=42, shuffle=True, tol=None,
          validation_fraction=0.1, verbose=0, warm_start=False))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_e

In [43]:
# Accuracy of SGDClassifier now
best_sgd_test_score = gs_sgd.score(test_corpus, test_label_names)
print('Test Accuracy :', best_sgd_test_score)

Test Accuracy : 0.7123333333333334


In [44]:
# Repeatinf this process with the LinearSVC model
svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('svm', LinearSVC(random_state=27))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'svm__C': [0.01, 0.1, 1, 5]
}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2)
gs_svm = gs_svm.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.3s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.3s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   0.6s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   0.6s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   0.6s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   29.8s finished


In [45]:
gs_svm.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('svm', LinearSVC(C=5, class_weight=None, dual=True, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
        verbose=0))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 2), norm='l2'

In [46]:
# Now this model is the most accurate
best_svm_test_score = gs_svm.score(test_corpus, test_label_names)
print('Test Accuracy :', best_svm_test_score)

Test Accuracy : 0.7166666666666667


In [50]:
svm_predictions = gs_svm.predict(test_corpus)
unique_classes = list(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=svm_predictions)

Accuracy: 0.7167
Precision: 0.7208
Recall: 0.7167
F1 Score: 0.7064


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [51]:
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=svm_predictions, classes=unique_classes)

                    precision    recall  f1-score   support

        Chardonnay       0.75      0.90      0.82       480
   Sauvignon Blanc       0.77      0.72      0.75       191
        Pinot Gris       0.93      0.47      0.62        60
       White Blend       0.80      0.47      0.59        91
              Rose       0.77      0.71      0.74       141
            Shiraz       0.71      0.38      0.50        26
         Zinfandel       0.55      0.38      0.45        96
            Malbec       0.59      0.45      0.51       113
           Moscato       0.86      0.33      0.48        18
            Merlot       0.80      0.38      0.51       104
         Red Blend       0.79      0.72      0.75       368
        Pinot Noir       0.69      0.88      0.77       517
            Muscat       0.00      0.00      0.00         7
          Riesling       0.79      0.81      0.80       198
      Pinot Grigio       0.78      0.50      0.61        36
Cabernet Sauvignon       0.61      0.72

  'precision', 'predicted', average, warn_for)
