In [83]:
import nltk
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import csv
import pandas as pd
import unicodedata
import spacy
from collections import Counter
import model_evaluation_utils as meu

In [84]:
# Import dataset (already have isolated the two columns for this analysis)
wine = pd.read_csv('winemag_filt.csv')

# Filter for 15 most popular wine varieties in the US
top15 = ['Pinot Noir','Chardonnay','Cabernet Sauvignon',
         'Red Blend','Riesling','Sauvignon Blanc','Syrah',
         'Rose','Merlot','Zinfandel','Malbec', 'White Blend',
        'Pinot Gris','Pinot Grigio','Shiraz','Moscato', 'Muscat']

wine = wine[wine.variety.isin(top15)]

# Take a random sample of 10000 observations from that filtered set
wine_samp = wine.sample(n=10000, random_state=27)

# Format columns as lists
corpus = wine_samp['description'].values.tolist()
labels = wine_samp['variety'].values.tolist()

labels = [v.replace('Moscato', 'Mo/uscat(o)') for v in labels]
labels = [v.replace('Muscat', 'Mo/uscat(o)') for v in labels]
labels = [v.replace('Pinot Gris', 'Pinot Gris/Grigio') for v in labels]
labels = [v.replace('Pinot Grigio', 'Pinot Gris/Grigio') for v in labels]

In [85]:
# Accented char function
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [86]:
# Special char function
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [87]:
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)

# Lemmatization function (version of stemming that maintains English spellings)
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [88]:
# Tokenizer function
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [89]:
# All the functions together now!
def normalize_corpus(corpus,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [90]:
# Applying to the data
norm_corpus = normalize_corpus(corpus)

In [91]:
# Making training and test sets
train_corpus, test_corpus, train_label_names, test_label_names = train_test_split(norm_corpus, labels, test_size=0.3, random_state=27)

In [92]:
# Table of distribution of varities in test, training datasets
trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd], 
             columns=['Target Label', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'],
             ascending=False))

Unnamed: 0,Target Label,Train Count,Test Count
3,Pinot Noir,1267,517
0,Chardonnay,1083,480
8,Cabernet Sauvignon,870,361
1,Red Blend,803,368
6,Riesling,456,198
4,Sauvignon Blanc,440,191
7,Syrah,389,193
2,Rose,329,141
13,Merlot,280,104
5,Zinfandel,273,96


In [93]:
# build Bag of Words features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

# transform test articles into features
cv_test_features = cv.transform(test_corpus)

In [94]:
# Naive Bayes model!
mnb = MultinomialNB(alpha=1)
mnb.fit(cv_train_features, train_label_names)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)

# CV Accuracy (5-fold): [0.60953058 0.60271041 0.62027123 0.62848962 0.6025825 ]
# Mean CV Accuracy: 0.6127168697541164
# Test Accuracy: 0.6293333333333333

CV Accuracy (5-fold): [0.60953058 0.60271041 0.62027123 0.62848962 0.6025825 ]
Mean CV Accuracy: 0.6127168697541164
Test Accuracy: 0.6293333333333333


In [95]:
# Logistic Regression model!
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=27)
lr.fit(cv_train_features, train_label_names)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

# CV Accuracy (5-fold): [0.68136558 0.68259629 0.70235546 0.68432355 0.70301291]
# Mean CV Accuracy: 0.6907307580896089
# Test Accuracy: 0.7043333333333334



CV Accuracy (5-fold): [0.68136558 0.68259629 0.70235546 0.68432355 0.70301291]
Mean CV Accuracy: 0.6907307580896089
Test Accuracy: 0.7043333333333334


In [96]:
# LinearSVC model!
svm = LinearSVC(penalty='l2', C=1, random_state=27)
svm.fit(cv_train_features, train_label_names)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

# CCV Accuracy (5-fold): [0.61664296 0.63623395 0.65453248 0.6313529  0.65853659]
# Mean CV Accuracy: 0.639459774296731
# Test Accuracy: 0.6536666666666666

CV Accuracy (5-fold): [0.61664296 0.63623395 0.65453248 0.6313529  0.65853659]
Mean CV Accuracy: 0.639459774296731
Test Accuracy: 0.6536666666666666


In [97]:
# SDGClassifier!
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=27)
svm_sgd.fit(cv_train_features, train_label_names)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)

# CV Accuracy (5-fold): [0.6230441  0.62624822 0.63526053 0.62848962 0.63271162]
# Mean CV Accuracy: 0.6291508167210028
# Test Accuracy: 0.6573333333333333



CV Accuracy (5-fold): [0.6230441  0.62624822 0.63526053 0.62848962 0.63271162]
Mean CV Accuracy: 0.6291508167210028
Test Accuracy: 0.6573333333333333


In [98]:
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=10, random_state=27)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

# CV Accuracy (5-fold): [0.54125178 0.55777461 0.56316916 0.55619184 0.55164993]
# Mean CV Accuracy: 0.5540074637199572
# Test Accuracy: 0.5783333333333334

CV Accuracy (5-fold): [0.54125178 0.55777461 0.56316916 0.55619184 0.55164993]
Mean CV Accuracy: 0.5540074637199572
Test Accuracy: 0.5783333333333334


In [99]:
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(n_estimators=10, random_state=27)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

# CV Accuracy (5-fold): [0.57539118 0.58844508 0.60599572 0.57480315 0.59397418]
# Mean CV Accuracy: 0.5877218602201207
# Test Accuracy: 0.5933333333333334

CV Accuracy (5-fold): [0.57539118 0.58844508 0.60599572 0.57480315 0.59397418]
Mean CV Accuracy: 0.5877218602201207
Test Accuracy: 0.5933333333333334


In [100]:
# build BOW features on train articles with TFIDF
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

# transform test articles into features
tv_test_features = tv.transform(test_corpus)

# Now I'm going to re-run all of those models with this new set of features!

In [101]:
mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

# CV Accuracy (5-fold): [0.46443812 0.47289586 0.47965739 0.47244094 0.47704448]
# Mean CV Accuracy: 0.47329535883498935
# Test Accuracy: 0.48933333333333334

CV Accuracy (5-fold): [0.46443812 0.47289586 0.47965739 0.47244094 0.47704448]
Mean CV Accuracy: 0.47329535883498935
Test Accuracy: 0.48933333333333334


In [102]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=27)
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

# CV Accuracy (5-fold): [0.65362731 0.63766049 0.67880086 0.67000716 0.65208034]
# Mean CV Accuracy: 0.658435231120697
# Test Accuracy: 0.6763333333333333



CV Accuracy (5-fold): [0.65362731 0.63766049 0.67880086 0.67000716 0.65208034]
Mean CV Accuracy: 0.658435231120697
Test Accuracy: 0.6763333333333333


In [103]:
# Most accurate model now
svm = LinearSVC(penalty='l2', C=1, random_state=27)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

# CV Accuracy (5-fold): [0.68349929 0.68045649 0.710207   0.6972083  0.69081779]
# Mean CV Accuracy: 0.692437773706382
# Test Accuracy: 0.713

CV Accuracy (5-fold): [0.68349929 0.68045649 0.710207   0.6972083  0.69081779]
Mean CV Accuracy: 0.692437773706382
Test Accuracy: 0.713


In [104]:
# Second most accurate model now
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=27)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)

# CV Accuracy (5-fold): [0.68634424 0.67831669 0.70663812 0.69219757 0.68794835]
# Mean CV Accuracy: 0.6902889922669571
# Test Accuracy: 0.709



CV Accuracy (5-fold): [0.68634424 0.67831669 0.70663812 0.69219757 0.68794835]
Mean CV Accuracy: 0.6902889922669571
Test Accuracy: 0.709


In [105]:
rfc = RandomForestClassifier(n_estimators=10, random_state=27)
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

# CV Accuracy (5-fold): [0.52987198 0.53851641 0.57744468 0.54903364 0.56169297]
# Mean CV Accuracy: 0.5513119356276726
# Test Accuracy: 0.582

CV Accuracy (5-fold): [0.52987198 0.53851641 0.57744468 0.54903364 0.56169297]
Mean CV Accuracy: 0.5513119356276726
Test Accuracy: 0.582


In [106]:
gbc = GradientBoostingClassifier(n_estimators=10, random_state=27 )
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

# CV Accuracy (5-fold): [0.58534851 0.58202568 0.59814418 0.57766643 0.58106169]
# Mean CV Accuracy: 0.584849297552237
# Test Accuracy: 0.5943333333333334

CV Accuracy (5-fold): [0.58534851 0.58202568 0.59814418 0.57766643 0.58106169]
Mean CV Accuracy: 0.584849297552237
Test Accuracy: 0.5943333333333334


In [107]:
# Making a pretty table of all the results
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score, 
               mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
              ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, 
               lr_tfidf_cv_mean_score, lr_tfidf_test_score],
              ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score, 
               svm_tfidf_cv_mean_score, svm_tfidf_test_score],
              ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, 
               svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
              ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score, 
               rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
              ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, 
               gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
             columns=['Model', 'CV Score (TF)', 'Test Score (TF)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
             ).T

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
CV Score (TF),0.612717,0.690731,0.63946,0.629151,0.554007,0.587722
Test Score (TF),0.629333,0.704333,0.653667,0.657333,0.578333,0.593333
CV Score (TF-IDF),0.473295,0.658435,0.692438,0.690289,0.551312,0.584849
Test Score (TF-IDF),0.489333,0.676333,0.713,0.709,0.582,0.594333


In [108]:
# Fiddling with parameters of LinearSVM
svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('svm', LinearSVC(random_state=27))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'svm__C': [0.01, 0.1, 1, 5]
}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2)
gs_svm = gs_svm.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   0.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   0.6s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   0.6s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=   0.6s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   27.0s finished


In [109]:
gs_svm.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('svm', LinearSVC(C=5, class_weight=None, dual=True, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l2', random_state=27, tol=0.0001,
        verbose=0))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 2), norm='l2'

In [110]:
# Now this model is the most accurate
best_svm_test_score = gs_svm.score(test_corpus, test_label_names)
print('Test Accuracy :', best_svm_test_score)

# Test Accuracy : 0.718, slightly improved by 0.02 with combined categories

Test Accuracy : 0.718


In [111]:
svm_predictions = gs_svm.predict(test_corpus)
unique_classes = list(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=svm_predictions)

# Accuracy: 0.718
# Precision: 0.7248
# Recall: 0.718
# F1 Score: 0.7084

Accuracy: 0.718
Precision: 0.7248
Recall: 0.718
F1 Score: 0.7084


In [112]:
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=svm_predictions, classes=unique_classes)

                    precision    recall  f1-score   support

        Chardonnay       0.75      0.91      0.82       480
   Sauvignon Blanc       0.77      0.72      0.75       191
       Mo/uscat(o)       1.00      0.32      0.48        25
       White Blend       0.81      0.47      0.60        91
              Rose       0.77      0.71      0.74       141
            Shiraz       0.71      0.38      0.50        26
         Zinfandel       0.55      0.38      0.45        96
            Malbec       0.59      0.45      0.51       113
            Merlot       0.80      0.38      0.51       104
         Red Blend       0.79      0.72      0.75       368
        Pinot Noir       0.69      0.88      0.77       517
          Riesling       0.79      0.81      0.80       198
 Pinot Gris/Grigio       0.87      0.49      0.63        96
Cabernet Sauvignon       0.61      0.72      0.66       361
             Syrah       0.69      0.54      0.61       193

         micro avg       0.72      0.7