In [33]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import re

import matplotlib.pyplot as plt

In [23]:
#Read in dataset. Temporarily using csv but will convert to database 
data=pd.read_csv("Resources/winemag-data_first150k.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   150930 non-null  int64  
 1   country      150925 non-null  object 
 2   description  150930 non-null  object 
 3   designation  105195 non-null  object 
 4   points       150930 non-null  int64  
 5   price        137235 non-null  float64
 6   province     150925 non-null  object 
 7   region_1     125870 non-null  object 
 8   region_2     60953 non-null   object 
 9   variety      150930 non-null  object 
 10  winery       150930 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 12.7+ MB


In [74]:
#stopwords dictionary
stop_words=set(stopwords.words("english"))
add_stopwords=["wine","flavor",'finish','flavors','like','drink','syrah','cabernet','palate']
stop_words=stop_words.union(add_stopwords)
print(stop_words)

{'did', 'yours', 'most', 'when', 'its', 'been', 'mustn', 'don', 'which', 'have', 'the', 'before', 'against', 'out', 'themselves', 'few', 'cabernet', "you'll", 'y', "should've", 'shouldn', 'on', 'wasn', 'to', 'ours', 'm', 'doing', 'needn', 'off', 'or', 'syrah', 'from', 'other', 've', 'had', 'like', 'now', 'their', 'in', 'a', 'as', "isn't", 'should', 'can', 'aren', "hasn't", 'flavors', 'itself', 'once', 'those', 'does', "wouldn't", 'through', "you've", 'him', 'll', 't', "that'll", 'weren', 'own', 'was', 'ma', 'why', "aren't", 'but', 'what', 'doesn', 'are', 'mightn', 'too', 'having', 're', 'until', 'has', 'shan', 'between', 'you', "you're", "didn't", 'after', 'finish', 'where', 'ain', 'o', 'hadn', 'yourselves', 'here', 'into', 'hers', 'only', "don't", 'over', 'nor', 'couldn', 'didn', 'do', 'i', 'at', 'with', "needn't", 'how', 'ourselves', 'palate', 'isn', "she's", 'below', 'again', "it's", 'this', 'some', 'will', 'by', 'won', 'such', "mustn't", 'he', 'herself', 'than', "doesn't", 'yoursel

In [75]:
## Need to Create Natural Language processing to add taste values for prediction
#loops through descriptions and cleans them
clean_description = []
for w in range(len(data.description)):
    desc = data['description'][w].lower()
    
    #remove punctuation
    desc = re.sub('[^a-zA-Z]', ' ', desc)
    
    #remove tags
    desc=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",desc)
    
    #remove digits and special chars
    desc=re.sub("(\\d|\\W)+"," ",desc)
    
    clean_description.append(desc)
#assign the cleaned descriptions to the data frame
data['clean_description'] = clean_description
#calculate the frequency
word_frequency = pd.Series(' '.join(data['clean_description']).split()).value_counts()[:30]
word_frequency

and        405755
the        259066
a          216300
of         184332
with       152830
is         111659
this       109934
it         103448
wine        90785
flavors     77992
in          74904
s           74208
to          64853
fruit       60474
that        50181
but         48655
on          46089
finish      37777
aromas      35861
cherry      32770
acidity     32662
tannins     32240
for         30426
palate      29404
from        29293
has         27307
ripe        27096
are         26987
black       26591
by          26552
dtype: int64

In [76]:
#need to remove the words that are not truly descriptive/ filler words
stem_desc=[]
for i in range(len(data['clean_description'])):
    split_text = data['clean_description'][i].split()

    lem = WordNetLemmatizer()
    split_text = [lem.lemmatize(word) for word in split_text if not word in stop_words] 
    split_text = " ".join(split_text)
    stem_desc.append(split_text)
stem_desc

['tremendous varietal hail oakville aged three year oak juicy red cherry fruit compelling hint caramel greet framed elegant fine tannin subtle minty tone background balanced rewarding start year ahead develop nuance enjoy',
 'ripe aroma fig blackberry cassis softened sweetened slathering oaky chocolate vanilla full layered intense cushioned rich chocolaty black fruit baking spice toasty everlasting heady ideally balanced',
 'mac watson honor memory made mother tremendously delicious balanced complex botrytised white dark gold color layer toasted hazelnut pear compote orange peel reveling succulence g l residual sugar',
 'spent month new french oak incorporates fruit ponzi aurora abetina madrona vineyard among others aromatic dense toasty deftly blend aroma toast cigar box blackberry black cherry coffee graphite tannin polished fine sheen frame loaded dark chocolate espresso',
 'top la b gude named highest point vineyard foot structure density considerable acidity still calming month wo

In [77]:
data['stem_description']=stem_desc
keywords=pd.Series(' '.join(data['stem_description']).split()).value_counts()[:40]
keywords

fruit         72788
cherry        40647
aroma         37106
tannin        33998
acidity       32662
ripe          27096
spice         26666
black         26592
dry           24988
note          24634
sweet         22586
berry         21867
red           21542
rich          21441
oak           21102
blackberry    19569
show          18240
fresh         18099
soft          17870
good          17387
year          16220
nose          15933
plum          15824
blend         14398
full          14376
well          14363
apple         14037
crisp         14030
vanilla       13960
raspberry     12751
dark          12703
white         12441
citrus        12371
chocolate     12291
mouth         11831
green         11592
offer         11559
light         11305
texture       10914
peach         10701
dtype: int64

In [84]:
keywords.index[0]

'fruit'

In [79]:
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,clean_description,stem_description
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,this tremendous varietal wine hails from oakvi...,tremendous varietal hail oakville aged three y...
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,ripe aromas of fig blackberry and cassis are s...,ripe aroma fig blackberry cassis softened swee...
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,mac watson honors the memory of a wine once ma...,mac watson honor memory made mother tremendous...
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,this spent months in new french oak and incorp...,spent month new french oak incorporates fruit ...
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,this is the top wine from la b gude named afte...,top la b gude named highest point vineyard foo...


In [96]:
search_terms=[]
for i in range(len(data['stem_description'])):
    split_text = data['stem_description'][i].split()
    lem = WordNetLemmatizer()
    split_text = [lem.lemmatize(word) for word in split_text if word in keywords.index] 
    split_text = " ".join(split_text)
    search_terms.append(split_text)
search_terms=pd.DataFrame(search_terms)
search_terms

Unnamed: 0,0
0,year oak red cherry fruit tannin year
1,ripe aroma blackberry chocolate vanilla full r...
2,white dark
3,oak fruit blend aroma blackberry black cherry ...
4,acidity
...,...
150925,white citrus white
150926,offer nose light good green apple note
150927,apple peach
150928,peach cherry ripe fresh fruit crisp apple peac...


In [103]:
data['Keyword_count']=search_terms[0].str.split().str.len()
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,clean_description,stem_description,Keyword_count
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,this tremendous varietal wine hails from oakvi...,tremendous varietal hail oakville aged three y...,7
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,ripe aromas of fig blackberry and cassis are s...,ripe aroma fig blackberry cassis softened swee...,10
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,mac watson honors the memory of a wine once ma...,mac watson honor memory made mother tremendous...,2
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,this spent months in new french oak and incorp...,spent month new french oak incorporates fruit ...,10
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,this is the top wine from la b gude named afte...,top la b gude named highest point vineyard foo...,1


In [122]:
#Drop columns that are not necessary for machine learning 
ML_dataset=data.drop(columns=["description",'region_2','winery','clean_description','stem_description',"Unnamed: 0",'designation'])
ML_dataset.head()

Unnamed: 0,country,points,price,province,region_1,variety,Keyword_count
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,7
1,Spain,96,110.0,Northern Spain,Toro,Tinta de Toro,10
2,US,96,90.0,California,Knights Valley,Sauvignon Blanc,2
3,US,96,65.0,Oregon,Willamette Valley,Pinot Noir,10
4,France,95,66.0,Provence,Bandol,Provence red blend,1


In [132]:
ML_dataset=ML_dataset.dropna()
ML_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114393 entries, 0 to 150929
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   country        114393 non-null  object 
 1   points         114393 non-null  int64  
 2   price          114393 non-null  float64
 3   province       114393 non-null  object 
 4   region_1       114393 non-null  object 
 5   variety        114393 non-null  object 
 6   Keyword_count  114393 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 7.0+ MB


In [133]:
#Use Get dummies to convert the remaining text columns to integers
X_encoded=pd.get_dummies(ML_dataset)
X_encoded=X_encoded.drop(columns="points")
X_encoded.shape


(114393, 1729)

In [134]:
X_encoded.head()
X_encoded.describe()

Unnamed: 0,price,Keyword_count,country_Argentina,country_Australia,country_Canada,country_France,country_Italy,country_Spain,country_US,province_Alsace,...,variety_Viura,variety_Viura-Chardonnay,variety_Viura-Sauvignon Blanc,variety_Viura-Verdejo,variety_White Blend,variety_White Riesling,variety_Xarel-lo,variety_Zibibbo,variety_Zinfandel,variety_Zweigelt
count,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,...,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0
mean,34.657916,5.324539,0.048805,0.042782,0.001696,0.12916,0.164206,0.071333,0.542017,0.01188,...,0.000962,9.6e-05,9e-06,9e-06,0.01869,0.00028,0.00021,0.000271,0.033131,2.6e-05
std,37.266912,2.355411,0.215462,0.202367,0.041147,0.335378,0.370464,0.257381,0.498234,0.108347,...,0.030995,0.009806,0.002957,0.002957,0.135428,0.016723,0.014483,0.01646,0.17898,0.005121
min,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,40.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2300.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Split Data into Training and Testing

In [135]:
y=ML_dataset["points"]
y.value_counts()


87     15845
88     13130
90     11914
86     11556
85      9442
89      9034
84      8391
91      7772
92      7224
83      4864
93      4709
82      3401
94      2782
95      1375
81      1222
80       748
96       524
97       284
98       113
99        39
100       24
Name: points, dtype: int64

In [136]:
# create train and testing data 
X_train,X_test,y_train,y_test=train_test_split(X_encoded,y,random_state=1)

In [137]:
# create Balanced Random Forest Classifier model 
brf_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
#Train and fit model 
brf_model= brf_model.fit(X_train,y_train) 
#Run test data 
predictions = brf_model.predict(X_test)

In [138]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test,predictions)

0.22164545323748042

In [139]:
# Display the confusion matrix
cm=confusion_matrix(y_test,predictions)
cm

array([[ 97,  26,  28,   8,   8,   0,   0,   3,   2,   0,   0,   1,   0,
          0,   0,   0,   0,   0,   0,   0,   0],
       [115,  44,  77,  11,  31,   9,  12,   6,   0,   4,   5,   1,   0,
          1,   1,   0,   0,   2,   0,   1,   0],
       [241,  75, 242,  55,  93,  22,  29,  23,  13,   6,  14,   5,   7,
          0,   2,   4,   2,   6,   1,   3,   3],
       [276,  98, 253,  99, 152,  63,  65,  61,  36,  13,  24,  21,  22,
          3,   8,   6,   3,  14,   3,   4,   2],
       [316, 130, 359, 144, 330, 130, 180, 146,  72,  35,  54,  27,  52,
          7,  15,  18,  13,   6,  16,   4,   5],
       [304, 117, 306, 145, 344, 165, 273, 233,  85,  57,  76,  45,  72,
          9,  22,  30,  22,  22,  14,  14,   9],
       [248, 114, 288, 113, 375, 169, 370, 408, 165,  83, 137,  87, 108,
         27,  43,  54,  24,  25,  21,  33,  10],
       [284, 122, 336, 107, 475, 194, 386, 629, 285, 119, 255, 157, 212,
         46,  95,  74,  47,  31,  28,  48,  38],
       [214,  89, 186,  

In [140]:
# Print the imbalanced classification report
report=classification_report_imbalanced(y_test,predictions)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

         80       0.04      0.56      0.92      0.07      0.72      0.50       173
         81       0.05      0.14      0.97      0.07      0.36      0.12       320
         82       0.10      0.29      0.92      0.14      0.51      0.25       846
         83       0.12      0.08      0.97      0.10      0.28      0.07      1226
         84       0.12      0.16      0.91      0.14      0.38      0.13      2059
         85       0.15      0.07      0.97      0.10      0.26      0.06      2364
         86       0.18      0.13      0.93      0.15      0.34      0.11      2902
         87       0.23      0.16      0.91      0.19      0.38      0.13      3968
         88       0.17      0.08      0.95      0.11      0.28      0.07      3274
         89       0.17      0.07      0.97      0.10      0.26      0.06      2321
         90       0.18      0.12      0.94      0.14      0.33      0.10      2970
   

In [141]:
# List the features sorted in descending order by feature importance
important_features=brf_model.feature_importances_

cols=X_encoded.columns

features_df = pd.DataFrame({'feature':cols,
                          'importance': important_features})
features_df.sort_values('importance',ascending=False)

Unnamed: 0,feature,importance
0,price,0.175252
1,Keyword_count,0.136937
1309,variety_Cabernet Sauvignon,0.018125
1343,variety_Chardonnay,0.015416
1567,variety_Pinot Noir,0.013851
...,...,...
878,region_1_Prosecco Treviso,0.000000
208,region_1_Cabardes,0.000000
880,region_1_Prosecco di Conegliano,0.000000
883,region_1_Prosecco di Valdobbiadene Superiore,0.000000
