In [1]:
import nltk
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import csv
import pandas as pd
import unicodedata
import spacy
from collections import Counter
import model_evaluation_utils as meu

In [2]:
# Load the pre-filtered review data (only the two columns this analysis needs).
wine = pd.read_csv('winemag_filt.csv')

# Keep only the six most frequent single-variety wines in the dataset (blends excluded).
top6 = [
    'Pinot Noir',
    'Chardonnay',
    'Cabernet Sauvignon',
    'Riesling',
    'Sauvignon Blanc',
    'Syrah',
]
wine = wine[wine['variety'].isin(top6)]

# Optional speed-up (disabled): work on a random sample of 10000 rows instead,
# e.g. wine.sample(n=10000, random_state=27).

# Review texts and their variety labels as two parallel Python lists.
corpus = wine['description'].tolist()
labels = wine['variety'].tolist()

In [3]:
# Accented char function
def remove_accented_chars(text):
    """Return `text` with accented characters reduced to plain ASCII.

    The string is decomposed with Unicode NFKD normalization, so an accented
    letter becomes a base letter plus combining marks; encoding to ASCII with
    errors ignored then drops every non-ASCII code point.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [4]:
# Special char function
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Input string.
    remove_digits : bool, default False
        When True, digits are stripped as well.

    Returns
    -------
    str
        `text` with the unwanted characters deleted (not replaced by spaces).
    """
    # The upper-case range must end at 'Z'. The previous 'A-z' range also
    # covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # were silently kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [5]:
# Load spaCy's small English pipeline once; lemmatize_text reuses it for every document.
# The former `parse=True, tag=True, entity=True` arguments were removed: they are
# spaCy 1.x flags, and spaCy 3's keyword-only `spacy.load` signature does not accept
# them. The pipeline's default components are loaded either way.
nlp = spacy.load('en_core_web_sm')

# Lemmatization function (version of stemming that maintains English spellings)
def lemmatize_text(text):
    """Return `text` with each token replaced by its lemma, joined by single spaces.

    Tokens whose lemma is the placeholder '-PRON-' keep their original surface
    text instead of the placeholder.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [6]:
# Tokenizer and English stopword list shared by remove_stopwords
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    """Return `text` with stopwords removed, tokens re-joined by single spaces.

    Parameters
    ----------
    text : str
        Input string; it is split with the module-level `tokenizer`.
    is_lower_case : bool, default False
        When True, tokens are compared to `stopwords` as-is; otherwise each
        token is lower-cased for the comparison only (its casing is kept in
        the output).
    stopwords : collection of str
        Words to drop; defaults to the module-level `stopword_list`.
    """
    words = [piece.strip() for piece in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [word for word in words if word not in stopwords]
    else:
        kept = [word for word in words if word.lower() not in stopwords]
    return ' '.join(kept)

In [7]:
# All the functions together now!
def normalize_corpus(corpus,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Normalize every document in `corpus` and return the results as a new list.

    Each enabled step is applied per document in this order: accent removal,
    lower-casing, newline collapsing (always), lemmatization, special
    character (and optionally digit) removal, whitespace collapsing (always),
    stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    accented_char_removal : bool
        Apply `remove_accented_chars`.
    text_lower_case : bool
        Lower-case the text; also passed to `remove_stopwords` as `is_lower_case`.
    text_lemmatization : bool
        Apply `lemmatize_text`.
    special_char_removal : bool
        Apply `remove_special_characters`.
    stopword_removal : bool
        Apply `remove_stopwords`.
    remove_digits : bool
        Forwarded to `remove_special_characters`.

    Returns
    -------
    list of str
        One normalized string per input document, in input order.
    """
    # Compile the patterns once rather than on every loop iteration.
    # Runs of CR/LF characters. The previous class '[\r|\n|\r\n]' also
    # contained a literal '|', so pipe characters were turned into spaces by
    # accident; '|' has no alternation meaning inside a character class.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE(review): inside this class '(-)' is a range from '(' to ')', so a
    # literal hyphen is NOT isolated here. Kept as-is to preserve behavior;
    # confirm whether '-' was intended.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    extra_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around selected special characters so that deleting
            # them afterwards does not fuse the neighbouring words
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # collapse runs of spaces
        doc = extra_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [8]:
# Applying to the data
# Every normalization step is enabled here (all keyword flags of normalize_corpus
# default to True); the result is a list with one cleaned string per review.
norm_corpus = normalize_corpus(corpus)

In [9]:
# Hold out 30% of the normalized reviews for testing; the fixed seed keeps the split reproducible.
train_corpus, test_corpus, train_label_names, test_label_names = train_test_split(
    norm_corpus,
    labels,
    test_size=0.3,
    random_state=27,
)

In [None]:
# Table of the distribution of varieties in the training and test datasets
# The Counters are kept as-is (not converted to plain dicts): a Counter returns 0
# for a missing key, so a variety absent from one split shows up with a count of 0
# instead of raising KeyError.
trd = Counter(train_label_names)
tsd = Counter(test_label_names)

# Rows follow the training labels first, then any label that occurs only in the test split.
(pd.DataFrame([[key, trd[key], tsd[key]]
               for key in list(trd) + [k for k in tsd if k not in trd]],
              columns=['Target Label', 'Train Count', 'Test Count'])
 .sort_values(by=['Train Count', 'Test Count'],
              ascending=False))

In [None]:
# Build bag-of-words (raw term count) features on the training reviews.
# The vocabulary is learned from the training split only.
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

# Transform the test reviews with the vocabulary fitted above (no re-fitting).
cv_test_features = cv.transform(test_corpus)

In [None]:
# Multinomial Naive Bayes on the bag-of-words counts
mnb = MultinomialNB(alpha=1).fit(cv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = mnb_bow_cv_scores.mean()
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
print('Test Accuracy:', mnb_bow_test_score)

# CV Accuracy (5-fold): [0.83947908 0.83635831 0.8374817  0.84333821 0.84070278]
# Mean CV Accuracy: 0.8394720166053153
# Test Accuracy: 0.8512876562606736

In [None]:
# L2-regularized logistic regression on the bag-of-words counts
lr = LogisticRegression(
    penalty='l2',
    C=1,
    max_iter=100,
    random_state=27,
).fit(cv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = lr_bow_cv_scores.mean()
lr_bow_test_score = lr.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
print('Test Accuracy:', lr_bow_test_score)

# CV Accuracy (5-fold): [0.86391572 0.86372951 0.86032211 0.86486091 0.86412884]
# Mean CV Accuracy: 0.8633914166360599
# Test Accuracy: 0.8708928205478517

In [None]:
# Linear support vector classifier on the bag-of-words counts
svm = LinearSVC(penalty='l2', C=1, random_state=27).fit(cv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = svm_bow_cv_scores.mean()
svm_bow_test_score = svm.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
print('Test Accuracy:', svm_bow_test_score)

# CV Accuracy (5-fold): [0.84108867 0.84279859 0.83426061 0.83806735 0.84070278]
# Mean CV Accuracy: 0.8393836031658332
# Test Accuracy: 0.8488284718901564

In [None]:
# SGDClassifier with hinge loss (a linear SVM trained by stochastic gradient descent)
# on the bag-of-words counts
svm_sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    max_iter=5,
    random_state=27,
).fit(cv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = svmsgd_bow_cv_scores.mean()
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
print('Test Accuracy:', svmsgd_bow_test_score)

# CV Accuracy (5-fold): [0.8444542  0.84909251 0.83762811 0.84392387 0.84670571]
# Mean CV Accuracy: 0.8443608784243025
# Test Accuracy: 0.8525855591228909

In [None]:
# Random forest (10 trees) on the bag-of-words counts
rfc = RandomForestClassifier(n_estimators=10, random_state=27).fit(cv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = rfc_bow_cv_scores.mean()
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
print('Test Accuracy:', rfc_bow_test_score)

# CV Accuracy (5-fold): [0.79777583 0.79844848 0.79970717 0.79809663 0.79106881]
# Mean CV Accuracy: 0.7970193850581991
# Test Accuracy: 0.8059976774369834

In [None]:
# Gradient boosting (10 boosting stages) on the bag-of-words counts
gbc = GradientBoostingClassifier(n_estimators=10, random_state=27).fit(cv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = gbc_bow_cv_scores.mean()
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
print('Test Accuracy:', gbc_bow_test_score)

# CV Accuracy (5-fold): [0.74422008 0.7441452  0.74114202 0.73923865 0.75065886]
# Mean CV Accuracy: 0.7438809613264279
# Test Accuracy: 0.7405560489104447

In [None]:
# Build TF-IDF weighted bag-of-words features on the training reviews.
# The vocabulary and IDF weights are learned from the training split only.
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

# Transform the test reviews with the vectorizer fitted above (no re-fitting).
tv_test_features = tv.transform(test_corpus)

# Now I'm going to re-run all of those models with this new set of features!

In [None]:
# Multinomial Naive Bayes on the TF-IDF features (rebinds `mnb` from the bag-of-words run)
mnb = MultinomialNB(alpha=1).fit(tv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = mnb_tfidf_cv_scores.mean()
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
print('Test Accuracy:', mnb_tfidf_test_score)

# CV Accuracy (5-fold): [0.74261048 0.74107143 0.73967789 0.74450952 0.74860908]
# Mean CV Accuracy: 0.7432956783377672
# Test Accuracy: 0.7549012910717945

In [None]:
# L2-regularized logistic regression on the TF-IDF features (rebinds `lr`)
lr = LogisticRegression(
    penalty='l2',
    C=1,
    max_iter=100,
    random_state=27,
).fit(tv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = lr_tfidf_cv_scores.mean()
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
print('Test Accuracy:', lr_tfidf_test_score)

# CV Accuracy (5-fold): [0.86289143 0.86080211 0.86325037 0.86325037 0.86661786]
# Mean CV Accuracy: 0.8633624254782909
# Test Accuracy: 0.8665209372224879

In [None]:
# Linear support vector classifier on the TF-IDF features (rebinds `svm`)
svm = LinearSVC(penalty='l2', C=1, random_state=27).fit(tv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = svm_tfidf_cv_scores.mean()
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
print('Test Accuracy:', svm_tfidf_test_score)

# CV Accuracy (5-fold): [0.8627451  0.86299766 0.86105417 0.86032211 0.86661786]
# Mean CV Accuracy: 0.8627473799206935
# Test Accuracy: 0.8703463351321812

In [None]:
# SGDClassifier with hinge loss on the TF-IDF features (rebinds `svm_sgd`)
svm_sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    max_iter=5,
    random_state=27,
).fit(tv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = svmsgd_tfidf_cv_scores.mean()
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
print('Test Accuracy:', svmsgd_tfidf_test_score)

# CV Accuracy (5-fold): [0.85923325 0.8588993  0.86002928 0.86046852 0.86647145]
# Mean CV Accuracy: 0.8610203592510397
# Test Accuracy: 0.8620124325432065

In [None]:
# Random forest (10 trees) on the TF-IDF features (rebinds `rfc`)
rfc = RandomForestClassifier(n_estimators=10, random_state=27).fit(tv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = rfc_tfidf_cv_scores.mean()
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
print('Test Accuracy:', rfc_tfidf_test_score)

# CV Accuracy (5-fold): [0.79323968 0.7876171  0.79399707 0.79165447 0.79414348]
# Mean CV Accuracy: 0.7921303603827885
# Test Accuracy: 0.8066124735296126

In [None]:
# Gradient boosting (10 boosting stages) on the TF-IDF features (rebinds `gbc`)
gbc = GradientBoostingClassifier(n_estimators=10, random_state=27).fit(tv_train_features, train_label_names)

# 5-fold cross-validation on the training split, then accuracy on the held-out test split
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = gbc_tfidf_cv_scores.mean()
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
print('Test Accuracy:', gbc_tfidf_test_score)

# CV Accuracy (5-fold): [0.74539069 0.74399883 0.74040996 0.73923865 0.75021962]
# Mean CV Accuracy: 0.7438515502069564
# Test Accuracy: 0.7415807090648269

In [None]:
# Making a pretty table of all the results
# One row per model with its mean CV and test accuracy for both feature sets:
# "(TF)" columns hold the bag-of-words (CountVectorizer) scores, "(TF-IDF)" columns
# the TfidfVectorizer scores. The frame is transposed so each model becomes a column.
# This cell needs every *_cv_mean_score / *_test_score variable from the model cells above.
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score, 
               mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
              ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, 
               lr_tfidf_cv_mean_score, lr_tfidf_test_score],
              ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score, 
               svm_tfidf_cv_mean_score, svm_tfidf_test_score],
              ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, 
               svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
              ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score, 
               rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
              ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, 
               gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
             columns=['Model', 'CV Score (TF)', 'Test Score (TF)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
             ).T

In [10]:
# Tune the TF-IDF + LinearSVC pipeline: unigrams vs. unigrams+bigrams, and four values of C
svm_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC(random_state=27)),
])

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.01, 0.1, 1, 5],
}

# Exhaustive 5-fold search over the 8 combinations, fitted on the training split only;
# fit() returns the fitted search object itself.
gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2).fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.1s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.0s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.1s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.0s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.0s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   2.7s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   2.7s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   2.9s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.2min finished


In [11]:
# The best model!
# Show the full parameter set of the pipeline that the grid search selected
# (the estimator with the best mean cross-validation score, held in best_estimator_).
gs_svm.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('svm', LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l2', random_state=27, tol=0.0001,
        verbose=0))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 2), norm='l2'

In [12]:
# Predict the held-out reviews with the best pipeline found by the grid search.
svm_predictions = gs_svm.predict(test_corpus)
# Sorted so the class order is the same on every run; iterating a bare set of
# strings can give a different order in each kernel session.
unique_classes = sorted(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=svm_predictions)

# Accuracy: 0.8825
# Precision: 0.8827
# Recall: 0.8825
# F1 Score: 0.8808

Accuracy: 0.8825
Precision: 0.8827
Recall: 0.8825
F1 Score: 0.8808


In [None]:
# Tune the bag-of-words + logistic regression pipeline: unigrams vs. unigrams+bigrams,
# and four values of C
lr_pipeline = Pipeline(steps=[
    ('tf', CountVectorizer()),
    ('lr', LogisticRegression(random_state=27)),
])

param_grid = {
    'tf__ngram_range': [(1, 1), (1, 2)],
    'lr__C': [0.01, 0.1, 1, 5],
}

# Exhaustive 5-fold search over the 8 combinations, fitted on the training split only;
# fit() returns the fitted search object itself.
gs_lr = GridSearchCV(lr_pipeline, param_grid, cv=5, verbose=2).fit(train_corpus, train_label_names)

In [None]:
# Predict the held-out reviews with the best logistic regression pipeline from the grid search.
lr_predictions = gs_lr.predict(test_corpus)
# Sorted so the class order is the same on every run; iterating a bare set of
# strings can give a different order in each kernel session.
unique_classes = sorted(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=lr_predictions)

# Accuracy: 0.8812
# Precision: 0.8806
# Recall: 0.8812
# F1 Score: 0.8797

In [None]:
# Per-variety report for the tuned SVM predictions.
# Uses `unique_classes` as set by whichever earlier cell ran last.
# NOTE(review): the exact metrics shown depend on the project-local
# model_evaluation_utils helper; the recorded output below lists
# precision, recall, f1-score and support.
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=svm_predictions, classes=unique_classes)

#    precision    recall  f1-score   support
#
#        Pinot Noir       0.89      0.94      0.91      3957
#          Riesling       0.93      0.90      0.92      1563
#   Sauvignon Blanc       0.87      0.76      0.81      1495
#        Chardonnay       0.87      0.94      0.91      3496
#             Syrah       0.88      0.70      0.78      1257
# Cabernet Sauvignon       0.86      0.87      0.87      2871

#         micro avg       0.88      0.88      0.88     14639
#         macro avg       0.89      0.85      0.87     14639
#      weighted avg       0.88      0.88      0.88     14639

In [None]:
# Confusion Matrix
from sklearn import metrics

# Sorted so rows and columns come out in the same order on every run.
unique_classes = sorted(set(test_label_names))

# `cm` was never computed in this cell before, so it failed with NameError on a
# fresh kernel. Rows are the actual varieties and columns the predicted ones,
# both ordered as `unique_classes`.
# NOTE(review): built from the tuned SVM predictions, matching the classification
# report and the predictions table in the neighbouring cells; confirm that is intended.
cm = metrics.confusion_matrix(y_true=test_label_names,
                              y_pred=svm_predictions,
                              labels=unique_classes)

cm_frame = pd.DataFrame(data=cm, 
                        columns=unique_classes, 
                        index=unique_classes )
print(cm_frame) 

In [14]:
# Collect each normalized test review with its true and SVM-predicted variety,
# print the frame and save it as a CSV file.
preds = pd.DataFrame(
    {
        'Review': test_corpus,
        'Actual': test_label_names,
        'Predicted': svm_predictions,
    }
)

print(preds)
preds.to_csv('predictions.csv')

                                                  Review              Actual  \
0      big toasty brood first calm become glass pleas...               Syrah   
1      start decently menthol toast aroma new oak lay...          Chardonnay   
2      simple pinot noir little sweet spritzy taste l...          Pinot Noir   
3      ripe wine violet aroma layer ripe jammy berry ...               Syrah   
4      source three acre vineyard plant smooth quite ...          Pinot Noir   
5      touch petrol lend slick mineral sheen luscious...            Riesling   
6      start slightly funky strike match aroma open r...     Sauvignon Blanc   
7      represent nice value refreshing balanced chard...          Chardonnay   
8      preserve lemon lemon curd combine dramatic eff...          Chardonnay   
9      hedonistic wine aroma ember dark coffee earth ...               Syrah   
10     fruity blackberry cherry flavor easy drink goo...  Cabernet Sauvignon   
11     soft ripe perfumed wine fine clea

In [None]:
# Rebuild the TF-IDF features on the training reviews with the vectorizer settings
# reported for the best grid-search pipeline above (notably ngram_range=(1, 2)).
# This rebinds `tv`, `tv_train_features` and `tv_test_features` from the earlier TF-IDF cell.
tv = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                     encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)
tv_train_features = tv.fit_transform(train_corpus)

# Transform the test reviews with the vectorizer fitted above (no re-fitting).
tv_test_features = tv.transform(test_corpus)

In [None]:
# Rank the vocabulary by IDF weight, highest first. A high IDF means the term
# occurs in few training reviews, so this lists the rarest terms.
indices = np.argsort(tv.idf_)[::-1]
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides and keep
# `features` a plain list of str in both cases.
if hasattr(tv, 'get_feature_names_out'):
    features = tv.get_feature_names_out().tolist()
else:
    features = tv.get_feature_names()
top_n = 25
top_features = [features[i] for i in indices[:top_n]]
print(top_features)