In [1]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import re

import matplotlib.pyplot as plt

In [3]:
#Read in dataset. Temporarily using csv but will convert to database 
data=pd.read_csv("Resources/joined_db.csv")
data.head()

Unnamed: 0,points,title,description,price,variety,country
0,87,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",15,Portuguese Red,Portugal
1,87,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",14,Pinot Gris,US
2,87,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",13,Riesling,US
3,87,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",65,Pinot Noir,US
4,87,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Blackberry and raspberry aromas show a typical...,15,Tempranillo-Merlot,Spain


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120914 entries, 0 to 120913
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   points       120914 non-null  int64 
 1   title        120914 non-null  object
 2   description  120914 non-null  object
 3   price        120914 non-null  int64 
 4   variety      120914 non-null  object
 5   country      120914 non-null  object
dtypes: int64(2), object(4)
memory usage: 5.5+ MB


In [5]:
#stopwords dictionary
stop_words=set(stopwords.words("english"))
add_stopwords=["wine","flavor",'finish','flavors','like','drink','syrah','cabernet','palate']
stop_words=stop_words.union(add_stopwords)
print(stop_words)

{'drink', "you're", 'won', 'itself', "it's", 'with', 'on', 'him', 'they', 'doing', 'the', 'here', 'up', 'but', 'his', 'wouldn', "couldn't", 'other', 'through', "don't", "you'd", 'has', 'does', 'have', 'both', 'above', 'doesn', 'flavor', "you'll", 'you', "needn't", 'then', 'ain', 'we', 'been', 'haven', 'our', 'needn', 'these', 'll', "wasn't", "she's", 'themselves', 'further', 'as', 'wasn', 'for', 'isn', 'be', "shan't", 'to', 'myself', 'most', 'into', 'at', 'theirs', 'off', 'about', 'shouldn', 'am', 'after', 've', 'did', 'hers', 'weren', 'between', 'why', 'hasn', 'flavors', 'her', 'syrah', 'over', 'mightn', 'are', 'being', 'she', 'its', 'aren', "you've", "won't", 'down', 'those', 'yourself', 'some', "isn't", 'no', "should've", 'should', 'shan', 'same', 'yours', 'from', 'is', 'who', 'nor', 'ma', 'because', 'd', 'couldn', 'was', 'having', 'do', 'once', 'wine', 'only', 'o', 'finish', 'not', 'will', 'own', 'them', "haven't", 'a', 'very', 'm', 'if', 'didn', 'until', 'yourselves', 'ourselves',

In [6]:
## Need to Create Natural Language processing to add taste values for prediction
#loops through descriptions and cleans them
clean_description = []
for w in range(len(data.description)):
    desc = data['description'][w].lower()
    
    #remove punctuation
    desc = re.sub('[^a-zA-Z]', ' ', desc)
    
    #remove tags
    desc=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",desc)
    
    #remove digits and special chars
    desc=re.sub("(\\d|\\W)+"," ",desc)
    
    clean_description.append(desc)
#assign the cleaned descriptions to the data frame
data['clean_description'] = clean_description
#calculate the frequency
word_frequency = pd.Series(' '.join(data['clean_description']).split()).value_counts()[:30]
word_frequency

and        327802
the        206358
a          167544
of         162534
with       112581
this       106600
is          88589
it          80418
wine        72245
flavors     60140
in          59555
to          52335
s           50332
fruit       46261
on          43764
aromas      37466
that        37088
palate      36786
finish      33658
acidity     31528
from        30161
but         29580
tannins     28111
drink       27953
cherry      27830
black       27146
ripe        24594
are         24487
has         22066
for         20459
dtype: int64

In [7]:
#need to remove the words that are not truly descriptive/ filler words
stem_desc=[]
for i in range(len(data['clean_description'])):
    split_text = data['clean_description'][i].split()

    lem = WordNetLemmatizer()
    split_text = [lem.lemmatize(word) for word in split_text if not word in stop_words] 
    split_text = " ".join(split_text)
    stem_desc.append(split_text)
stem_desc

['ripe fruity smooth still structured firm tannin filled juicy red berry fruit freshened acidity already drinkable although certainly better',
 'tart snappy lime flesh rind dominate green pineapple poke crisp acidity underscoring stainless steel fermented',
 'pineapple rind lemon pith orange blossom start aroma bit opulent note honey drizzled guava mango giving way slightly astringent semidry',
 'much regular bottling come across rather rough tannic rustic earthy herbal characteristic nonetheless think pleasantly unfussy country good companion hearty winter stew',
 'blackberry raspberry aroma show typical navarran whiff green herb case horseradish mouth fairly full bodied tomatoey acidity spicy herbal complement dark plum fruit fresh grabby',
 'bright informal red open aroma candied berry white pepper savory herb carry balanced fresh acidity soft tannin',
 'dry restrained offer spice profusion balanced acidity firm texture much food',
 'savory dried thyme note accent sunnier preserved 

In [10]:
data['stem_description']=stem_desc
keywords=pd.Series(' '.join(data['stem_description']).split()).value_counts()[:40]
keywords

fruit         58050
aroma         38831
cherry        31991
acidity       31528
tannin        30095
black         27162
ripe          24594
note          23870
spice         20554
red           20488
berry         17949
oak           17343
nose          16340
dry           16290
fresh         15859
rich          15826
plum          15544
full          14881
blackberry    14460
apple         14423
show          13927
blend         13194
sweet         12592
soft          12488
offer         12477
well          12312
white         12172
light         11823
crisp         11744
dark          11516
texture       11372
bodied        11170
citrus        11016
raspberry     10716
vanilla       10642
herb          10583
hint          10374
bright         9926
pepper         9833
lemon          9488
dtype: int64

In [11]:
data.head()

Unnamed: 0,points,title,description,price,variety,country,clean_description,stem_description
0,87,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",15,Portuguese Red,Portugal,this is ripe and fruity a wine that is smooth ...,ripe fruity smooth still structured firm tanni...
1,87,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",14,Pinot Gris,US,tart and snappy the flavors of lime flesh and ...,tart snappy lime flesh rind dominate green pin...
2,87,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",13,Riesling,US,pineapple rind lemon pith and orange blossom s...,pineapple rind lemon pith orange blossom start...
3,87,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",65,Pinot Noir,US,much like the regular bottling from this comes...,much regular bottling come across rather rough...
4,87,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Blackberry and raspberry aromas show a typical...,15,Tempranillo-Merlot,Spain,blackberry and raspberry aromas show a typical...,blackberry raspberry aroma show typical navarr...


In [12]:
search_terms=[]
for i in range(len(data['stem_description'])):
    split_text = data['stem_description'][i].split()
    lem = WordNetLemmatizer()
    split_text = [lem.lemmatize(word) for word in split_text if word in keywords.index] 
    split_text = " ".join(split_text)
    search_terms.append(split_text)
search_terms=pd.DataFrame(search_terms)
search_terms

Unnamed: 0,0
0,ripe tannin red berry fruit acidity
1,crisp acidity
2,lemon aroma note
3,
4,blackberry raspberry aroma show herb full bodi...
...,...
120909,note light acidity
120910,cherry soft fruit
120911,well crisp dry ripe spice
120912,dry crisp acidity spice apple


In [13]:
data['Keyword_count']=search_terms[0].str.split().str.len()
data.head()

Unnamed: 0,points,title,description,price,variety,country,clean_description,stem_description,Keyword_count
0,87,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",15,Portuguese Red,Portugal,this is ripe and fruity a wine that is smooth ...,ripe fruity smooth still structured firm tanni...,6
1,87,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",14,Pinot Gris,US,tart and snappy the flavors of lime flesh and ...,tart snappy lime flesh rind dominate green pin...,2
2,87,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",13,Riesling,US,pineapple rind lemon pith and orange blossom s...,pineapple rind lemon pith orange blossom start...,3
3,87,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",65,Pinot Noir,US,much like the regular bottling from this comes...,much regular bottling come across rather rough...,0
4,87,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Blackberry and raspberry aromas show a typical...,15,Tempranillo-Merlot,Spain,blackberry and raspberry aromas show a typical...,blackberry raspberry aroma show typical navarr...,12


In [18]:
#Drop columns that are not necessary for machine learning 
ML_dataset=data.drop(columns=["description",'clean_description','stem_description','title'])
ML_dataset.head()

Unnamed: 0,points,price,variety,country,Keyword_count
0,87,15,Portuguese Red,Portugal,6
1,87,14,Pinot Gris,US,2
2,87,13,Riesling,US,3
3,87,65,Pinot Noir,US,0
4,87,15,Tempranillo-Merlot,Spain,12


In [19]:
ML_dataset=ML_dataset.dropna()
ML_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120914 entries, 0 to 120913
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   points         120914 non-null  int64 
 1   price          120914 non-null  int64 
 2   variety        120914 non-null  object
 3   country        120914 non-null  object
 4   Keyword_count  120914 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 5.5+ MB


In [20]:
#Use Get dummies to convert the remaining text columns to integers
X_encoded=pd.get_dummies(ML_dataset)
X_encoded=X_encoded.drop(columns="points")
X_encoded.shape


(120914, 735)

In [21]:
X_encoded.head()
X_encoded.describe()

Unnamed: 0,price,Keyword_count,variety_Abouriou,variety_Agiorgitiko,variety_Aglianico,variety_Aidani,variety_Airen,variety_Albana,variety_Albanello,variety_Albariño,...,country_Serbia,country_Slovakia,country_Slovenia,country_South Africa,country_Spain,country_Switzerland,country_Turkey,country_US,country_Ukraine,country_Uruguay
count,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,...,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0,120914.0
mean,35.368675,5.751551,2.5e-05,0.000521,0.002431,8e-06,2.5e-05,0.000157,8e-06,0.00392,...,9.9e-05,8e-06,0.000662,0.010694,0.054361,5.8e-05,0.000744,0.448782,0.000116,0.000901
std,41.031336,2.570385,0.004981,0.02282,0.04925,0.002876,0.004981,0.012534,0.002876,0.062488,...,0.009962,0.002876,0.025714,0.102856,0.226729,0.007609,0.027272,0.497372,0.01076,0.030011
min,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,42.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,3300.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
data_scaler = StandardScaler()
X_scaled=data_scaler.fit_transform(X_encoded)

## Split Data into Training and Testing

In [23]:
y=ML_dataset["points"]
y.value_counts()


88     16004
87     15761
90     14354
86     11740
89     11306
91     10559
85      8901
92      8865
84      6097
93      5935
94      3448
83      2886
82      1772
95      1406
81       680
96       482
80       395
97       207
98        69
99        28
100       19
Name: points, dtype: int64

In [24]:
# create train and testing data 
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,random_state=1)

In [25]:
# create Balanced Random Forest Classifier model 
brf_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
#Train and fit model 
brf_model= brf_model.fit(X_train,y_train) 
#Run test data 
predictions = brf_model.predict(X_test)

In [26]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test,predictions)

0.15676102603860137

In [27]:
# Display the confusion matrix
cm=confusion_matrix(y_test,predictions)
cm

array([[ 58,  18,  14,   4,   4,   3,   3,   0,   1,   2,   1,   0,   1,
          0,   0,   0,   0,   0,   0,   1,   0],
       [ 61,  29,  21,  12,  10,   7,   2,   0,   0,   1,   1,   0,   1,
          0,   2,   0,   0,   0,   0,   0,   0],
       [120,  86,  68,  28,  37,  28,  15,   8,  10,  23,   1,   4,   4,
          5,   2,   1,   0,   0,   3,   1,   7],
       [171,  81,  99,  48,  83,  51,  34,   9,  24,  51,   5,   7,   9,
          6,   4,   4,   0,   1,   1,   2,   4],
       [207, 148, 187, 120, 227, 132, 126,  43,  67, 134,  23,  25,  23,
         14,  18,   1,   5,   3,   4,   4,  20],
       [230, 151, 240, 161, 298, 215, 193,  93,  88, 296,  34,  28,  64,
         36,  25,   1,   7,   2,  11,  10,  23],
       [207, 124, 329, 181, 350, 303, 331, 139, 197, 460,  55,  76,  95,
         52,  62,  11,   7,   8,  10,  13,  28],
       [226, 142, 329, 208, 324, 297, 363, 203, 298, 822,  90, 132, 153,
         96,  69,  37,   8,  23,  18,  32,  40],
       [152, 109, 307, 1

In [28]:
# Print the imbalanced classification report
report=classification_report_imbalanced(y_test,predictions)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

         80       0.03      0.53      0.94      0.06      0.71      0.48       110
         81       0.03      0.20      0.96      0.05      0.44      0.18       147
         82       0.03      0.15      0.93      0.05      0.37      0.13       451
         83       0.04      0.07      0.96      0.05      0.26      0.06       694
         84       0.11      0.15      0.93      0.13      0.37      0.13      1531
         85       0.12      0.10      0.94      0.11      0.30      0.08      2206
         86       0.16      0.11      0.94      0.13      0.32      0.09      3038
         87       0.20      0.05      0.97      0.08      0.22      0.05      3910
         88       0.18      0.08      0.94      0.11      0.28      0.07      4030
         89       0.13      0.22      0.85      0.16      0.44      0.18      2836
         90       0.18      0.04      0.98      0.07      0.20      0.04      3569
   

In [63]:
# List the features sorted in descending order by feature importance
important_features=brf_model.feature_importances_

cols=X_encoded.columns

features_df = pd.DataFrame({'feature':cols,
                          'importance': important_features})
features_df.sort_values('importance',ascending=False)

Unnamed: 0,feature,importance
0,price,0.642052
1,Keyword_count,0.144595
434,variety_Pinot Noir,0.011500
732,country_US,0.008432
701,country_Chile,0.007711
...,...,...
232,variety_Groppello,0.000000
694,country_Armenia,0.000000
21,variety_Ansonica,0.000000
440,variety_Piquepoul Blanc,0.000000


## Try Logistic regression as a model for the data 

In [64]:
data_scaler = StandardScaler()
X_scaled=data_scaler.fit_transform(X_encoded)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,
   y, random_state=1, stratify=y)

In [66]:
classifier = LogisticRegression(solver='lbfgs',
   max_iter=800,
   random_state=1)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=800, random_state=1)

In [67]:
y_pred = classifier.predict(X_test)

In [68]:
print(accuracy_score(y_test, y_pred))

0.8059810116113666


## Try grouping based on wine quality 

In [69]:
ML_dataset.head()

Unnamed: 0,points,price,variety,country,Keyword_count,wine_class
0,87,15,Portuguese Red,Portugal,6,average
1,87,14,Pinot Gris,US,2,average
2,87,13,Riesling,US,3,average
3,87,65,Pinot Noir,US,0,average
4,87,15,Tempranillo-Merlot,Spain,12,average


In [70]:
Grouped_data=ML_dataset
Grouped_data['wine_class']=3
for i in range(len(ML_dataset['points'])):
    if Grouped_data['points'][i] > 90:
        Grouped_data['wine_class'][i]='excellent'   
    else:
        Grouped_data['wine_class'][i]='average'
        
Grouped_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,points,price,variety,country,Keyword_count,wine_class
0,87,15,Portuguese Red,Portugal,6,average
1,87,14,Pinot Gris,US,2,average
2,87,13,Riesling,US,3,average
3,87,65,Pinot Noir,US,0,average
4,87,15,Tempranillo-Merlot,Spain,12,average


In [71]:
X_grouped=Grouped_data.drop(columns=["wine_class","points"])
X_grouped=pd.get_dummies(X_grouped)
X_grouped.shape


(120914, 735)

In [72]:
X_grouped.head()

Unnamed: 0,price,Keyword_count,variety_Abouriou,variety_Agiorgitiko,variety_Aglianico,variety_Aidani,variety_Airen,variety_Albana,variety_Albanello,variety_Albariño,...,country_Serbia,country_Slovakia,country_Slovenia,country_South Africa,country_Spain,country_Switzerland,country_Turkey,country_US,country_Ukraine,country_Uruguay
0,15,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,14,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,13,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,65,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,15,12,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [73]:
y=Grouped_data['wine_class']
y.value_counts()

average      89896
excellent    31018
Name: wine_class, dtype: int64

In [74]:
X_train,X_test,y_train,y_test=train_test_split(X_grouped,y,random_state=1)

In [75]:
# Resample the training data with the BalancedRandomForestClassifier

#model
brf_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)

#fit
brf_model= brf_model.fit(X_train,y_train) 

#predict
predictions = brf_model.predict(X_test)

In [76]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test,predictions)

0.7681181955075698

In [77]:
# Print the imbalanced classification report
report=classification_report_imbalanced(y_test,predictions)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

    average       0.92      0.73      0.81      0.81      0.77      0.58     22522
  excellent       0.51      0.81      0.73      0.62      0.77      0.59      7707

avg / total       0.81      0.75      0.79      0.76      0.77      0.59     30229



In [78]:
# List the features sorted in descending order by feature importance
important_features=brf_model.feature_importances_

cols=X_encoded.columns

features_df = pd.DataFrame({'feature':cols,
                          'importance': important_features})
features_df.sort_values('importance',ascending=False)

Unnamed: 0,feature,importance
0,price,0.642052
1,Keyword_count,0.144595
434,variety_Pinot Noir,0.011500
732,country_US,0.008432
701,country_Chile,0.007711
...,...,...
232,variety_Groppello,0.000000
694,country_Armenia,0.000000
21,variety_Ansonica,0.000000
440,variety_Piquepoul Blanc,0.000000


In [79]:
#Trying model on data from joined JSON database: 
new_data=pd.read_csv("Resources/joined_db.csv")
new_data.head()

Unnamed: 0,points,title,description,price,variety,country
0,87,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",15,Portuguese Red,Portugal
1,87,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",14,Pinot Gris,US
2,87,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",13,Riesling,US
3,87,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",65,Pinot Noir,US
4,87,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Blackberry and raspberry aromas show a typical...,15,Tempranillo-Merlot,Spain
