## Basic setup

In [1]:
## Import packages 
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
# Load the wine-review dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('winemag-data_first150k.csv')
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [3]:
# Copy Paige's DANK char removal 

def remove_weird_char(string):
    """Lower-case ``string`` and strip punctuation from each whitespace-separated token.

    Every character that is neither a regex word character (``\\w``) nor a
    hyphen is removed from each token, and the tokens are re-joined with
    single spaces.  A token made up only of stripped characters becomes an
    empty string and is still joined, so it leaves a doubled space behind.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.  Empty or whitespace-only input
        yields ``''``.
    """
    # Raw string so the ``\w`` escape reaches the regex engine untouched.
    cleaned = [re.sub(r"[^\w-]+", "", token) for token in string.lower().split()]
    # Join once, after the loop: this also covers the no-token case, where the
    # result is simply the empty string.
    return ' '.join(cleaned)

In [4]:
# Clean up the description column.  remove_weird_char lower-cases the text
# itself, so a separate .str.lower() pass over the column is not needed.
df['description'] = df['description'].apply(remove_weird_char)

df['description'].head()

0    this tremendous 100 varietal wine hails from o...
1    ripe aromas of fig blackberry and cassis are s...
2    mac watson honors the memory of a wine once ma...
3    this spent 20 months in 30 new french oak and ...
4    this is the top wine from la bgude named after...
Name: description, dtype: object

In [9]:
## Remove stopwords
from nltk.corpus import stopwords
# NLTK's English stop-word list extended with the domain word 'wine'
# (presumably because it carries no signal in wine reviews -- confirm).
cachedStopWords = stopwords.words("english") + ['wine']
def rmStopWords(text):
    """Return ``text`` with every token found in ``cachedStopWords`` removed.

    ``text`` is split on whitespace, tokens present in the module-level
    ``cachedStopWords`` list are dropped, and the survivors are re-joined
    with single spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated text (already normalised by the caller).

    Returns
    -------
    str
        The text without stop words.
    """
    # Snapshot the list as a set: membership becomes O(1) per token instead of
    # a linear scan, while edits made to ``cachedStopWords`` between calls are
    # still honoured because the snapshot is rebuilt on each call.
    stop_set = frozenset(cachedStopWords)
    return ' '.join(word for word in text.split() if word not in stop_set)

In [10]:
# Strip stop words from every description, overwriting the column.
df['description']=df['description'].apply(rmStopWords)

In [11]:
# Narrow the frame to the variety label and its cleaned description text.
df = df[['variety', 'description']]
df.head()

Unnamed: 0,variety,description
0,Cabernet Sauvignon,tremendous 100 varietal hails oakville aged th...
1,Tinta de Toro,ripe aromas fig blackberry cassis softened swe...
2,Sauvignon Blanc,mac watson honors memory made mother tremendou...
3,Pinot Noir,spent 20 months 30 new french oak incorporates...
4,Provence red blend,top la bgude named highest point vineyard 1200...


## Calculate lift values

In [12]:
# Review counts for the 25 most frequent varieties.
df['variety'].value_counts()[:25]

Chardonnay                       14482
Pinot Noir                       14291
Cabernet Sauvignon               12800
Red Blend                        10062
Bordeaux-style Red Blend          7347
Sauvignon Blanc                   6320
Syrah                             5825
Riesling                          5524
Merlot                            5070
Zinfandel                         3799
Sangiovese                        3345
Malbec                            3208
White Blend                       2824
Rosé                              2817
Tempranillo                       2556
Nebbiolo                          2241
Portuguese Red                    2216
Sparkling Blend                   2004
Shiraz                            1970
Corvina, Rondinella, Molinara     1682
Rhône-style Red Blend             1505
Barbera                           1365
Pinot Gris                        1365
Cabernet Franc                    1363
Sangiovese Grosso                 1346
Name: variety, dtype: int

In [13]:
# Labels of the 25 most frequent varieties, most common first.
top_varieties = df['variety'].value_counts().head(25).index.tolist()

# Restrict the reviews to those top varieties.
mask = df['variety'].isin(top_varieties)
new_df = df.loc[mask]
new_df.head()

Unnamed: 0,variety,description
0,Cabernet Sauvignon,tremendous 100 varietal hails oakville aged th...
2,Sauvignon Blanc,mac watson honors memory made mother tremendou...
3,Pinot Noir,spent 20 months 30 new french oak incorporates...
8,Pinot Noir,renamed vineyard formerly bottled delancellott...
9,Pinot Noir,producer sources two blocks vineyard wineone h...


In [14]:
# Dealing with the top 25 varieties: confirm how many were selected.
len(top_varieties)

25

In [15]:
from nltk import pos_tag, word_tokenize

# Concatenate every top-variety description into one space-separated string.
all_desc = new_df['description'].str.cat(sep=' ')

# Tokenize the combined text and tag each token; the resulting frame has one
# row per token occurrence, with the token in 'word' and its tag in 'POS'.
pos_df = pd.DataFrame(pos_tag(word_tokenize(all_desc)), columns = ['word', 'POS'])
# Occurrence counts of the 20 most frequent tokens.
pos_df['word'].value_counts()[:20]

flavors               61718
fruit                 44575
finish                29191
tannins               27792
cherry                26735
aromas                25914
acidity               24047
palate                21410
ripe                  21178
black                 20975
drink                 18948
dry                   18683
spice                 17910
oak                   17414
rich                  16876
sweet                 16223
red                   15645
notes                 15345
soft                  13963
good                  13865
berry                 13717
shows                 12834
blackberry            12571
nose                  11897
fresh                 11790
blend                 11320
years                 10998
vanilla               10863
cabernet              10654
plum                  10441
                      ...  
chart                     1
gandia                    1
berthiers                 1
benedetto                 1
oakroughly          

In [16]:
# The 20 most frequent tokens, kept as a pandas Index of attribute words.
attribute_list = pos_df['word'].value_counts()[:20].index

In [18]:
#calculate lift values

def calc_lift(a, b):
    """Lift between variety ``a`` and attribute word ``b`` over the global ``df``.

    Lift is computed as ``N * n_ab / (n_a * n_b)`` where ``N`` is the number
    of rows in ``df``, ``n_a`` the rows whose variety equals ``a``, ``n_b``
    the rows whose description contains ``b`` and ``n_ab`` the rows
    satisfying both.

    Parameters
    ----------
    a : str
        Variety label, compared for equality against ``df['variety']``.
    b : str
        Attribute text, matched as a literal substring of
        ``df['description']``.
        NOTE(review): substring matching means e.g. 'black' also matches
        'blackberry'; confirm this is intended.

    Returns
    -------
    float
        The lift value, or ``0.2`` when the variety and attribute never
        co-occur (this floor is a hard-coded choice; its rationale is not
        shown in the notebook).
    """
    total_size = len(df)
    # One literal (non-regex) scan of the descriptions, reused for both
    # counts.  ``na=False`` treats missing descriptions as "does not contain"
    # instead of producing a NaN mask that cannot be used for counting.
    has_b = df['description'].str.contains(b, regex=False, na=False)
    is_a = df['variety'] == a
    num_a = int(is_a.sum())
    num_b = int(has_b.sum())
    num_a_b = int((is_a & has_b).sum())
    # Zero co-occurrence also covers num_a == 0 or num_b == 0, so the
    # division below can never hit a zero denominator.
    if num_a_b == 0:
        return .2
    return total_size * float(num_a_b) / float(num_a * num_b)

In [19]:
# Lift table: one row per attribute word, one column per top variety.
lift_df = pd.DataFrame(columns=top_varieties, index=attribute_list)

for attribute in lift_df.index:
    for variety in lift_df.columns:
        # Single .loc[row, col] assignment: the chained form
        # lift_df[variety].loc[attribute] = ... may write to a temporary copy
        # and leave lift_df unchanged.
        lift_df.loc[attribute, variety] = calc_lift(variety, attribute)

# The frame starts out as object dtype; store the lifts as real floats.
lift_df = lift_df.astype(float)

lift_df

Unnamed: 0,Chardonnay,Pinot Noir,Cabernet Sauvignon,Red Blend,Bordeaux-style Red Blend,Sauvignon Blanc,Syrah,Riesling,Merlot,Zinfandel,...,Nebbiolo,Portuguese Red,Sparkling Blend,Shiraz,"Corvina, Rondinella, Molinara",Rhône-style Red Blend,Barbera,Pinot Gris,Cabernet Franc,Sangiovese Grosso
flavors,1.18142,1.11135,1.19422,0.87089,0.708477,1.26915,1.12957,1.01437,1.1682,1.30949,...,0.203857,0.632153,1.13277,0.905026,0.367751,0.942886,0.419095,1.09439,1.02332,0.2493
fruit,1.05548,0.918733,0.823677,0.958115,1.3923,1.16807,1.00323,0.914565,0.902196,0.646832,...,0.778918,1.70963,0.708292,1.10576,0.998762,0.928142,1.10749,1.13303,0.987216,0.984445
finish,0.992232,0.79859,0.987778,0.960375,0.521282,1.26491,0.974459,1.63065,1.08394,0.835554,...,0.786586,0.261128,1.1016,1.79468,0.538916,1.43573,0.686125,1.22522,1.05278,0.397606
tannins,0.0344001,1.18253,1.88859,1.37286,2.61356,0.0137755,1.41324,0.00875584,1.59412,1.2617,...,2.45182,3.03823,0.0120677,1.36263,0.600996,1.59403,0.687417,0.0106302,1.97301,1.04568
cherry,0.00366616,2.14804,1.36165,1.72354,0.502573,0.00305486,1.22966,0.0463095,2.14677,1.27178,...,2.04611,0.148111,0.284205,0.83303,2.25264,1.75107,1.80337,0.0106081,1.92288,2.41692
aromas,0.694951,0.391973,0.783938,1.45942,0.375707,0.959249,0.705318,0.839519,0.859157,0.23471,...,2.6329,0.177122,1.37953,1.26907,1.97337,0.722867,1.70966,0.456326,0.766875,1.93665
acidity,1.51651,1.21039,0.409767,0.454176,1.23246,1.38418,0.496008,1.46997,0.496309,0.43867,...,0.617247,2.05372,0.968227,0.277987,0.241382,0.555226,1.46645,1.18284,0.8486,0.624321
palate,0.857628,0.611921,0.907788,1.32543,0.400185,0.963888,0.765766,1.62228,0.936402,0.291549,...,1.53664,0.124954,1.3139,1.29824,0.404079,1.03032,0.925762,1.06223,1.00838,0.370296
ripe,1.1629,1.00893,1.15446,0.93124,1.5286,0.981256,1.01474,0.966535,1.01008,1.32506,...,0.836679,1.75564,0.355393,0.58287,0.901588,0.80481,0.72763,0.97254,1.03795,0.795494
black,0.0040928,0.676042,2.32272,1.60697,1.93297,0.00687754,2.04865,0.0150218,1.57435,1.6975,...,1.24839,1.73679,0.00394357,1.5846,1.25921,1.49657,1.73112,0.0231587,1.20022,1.40033


## Vectorize each variety

In [20]:
#calculate vectors

def calc_vect(a, b):
    """Count how often the text ``b`` occurs in the descriptions of variety ``a``.

    The descriptions of every row in the global ``df`` whose variety equals
    ``a`` are joined with single spaces, and the non-overlapping substring
    occurrences of ``b`` in that combined text are counted.
    """
    is_variety = df['variety'] == a
    descriptions = df.loc[is_variety, 'description']
    combined_text = descriptions.str.cat(sep=' ')
    return combined_text.count(b)

# Note: the attribute features are chosen arbitrarily as the 100 most frequent words.

In [48]:
# Count table: one row per feature word (the 100 most frequent tokens),
# one column per top variety.
vector_df = pd.DataFrame(columns=top_varieties, index=pos_df['word'].value_counts()[:100].index)

for attribute in vector_df.index:
    for variety in vector_df.columns:
        # Single .loc[row, col] assignment: the chained form
        # vector_df[variety].loc[attribute] = ... may write to a temporary
        # copy and leave vector_df unchanged.
        vector_df.loc[attribute, variety] = calc_vect(variety, attribute)

# The frame starts out as object dtype; store the counts as real integers.
vector_df = vector_df.astype(int)

vector_df

Unnamed: 0,Chardonnay,Pinot Noir,Cabernet Sauvignon,Red Blend,Bordeaux-style Red Blend,Sauvignon Blanc,Syrah,Riesling,Merlot,Zinfandel,...,Nebbiolo,Portuguese Red,Sparkling Blend,Shiraz,"Corvina, Rondinella, Molinara",Rhône-style Red Blend,Barbera,Pinot Gris,Cabernet Franc,Sangiovese Grosso
flavors,8867,8132,7886,4556,2698,4113,3453,2860,3083,2539,...,234,741,1161,912,315,721,296,764,736,170
fruit,8905,7613,5893,5445,6358,4659,3380,3009,2563,1328,...,939,2448,766,1273,926,759,859,953,822,697
finish,4336,3469,3818,2914,1156,2413,1719,2750,1659,970,...,529,173,666,1062,271,652,281,503,432,160
tannins,104,3612,5144,2908,4264,18,1733,10,1708,1003,...,1168,1484,5,560,209,506,196,3,577,291
cherry,11,6605,3781,3780,804,4,1516,54,2339,1020,...,968,73,125,366,827,561,529,4,593,725
aromas,2384,1326,2376,3487,650,1438,978,1104,1037,210,...,1436,92,657,591,793,256,561,147,245,653
acidity,4750,3726,1128,985,1967,1877,625,1751,536,353,...,293,984,422,118,86,180,424,347,246,178
palate,2527,1773,2359,2696,600,1233,894,1895,979,224,...,696,55,535,522,135,316,259,303,280,99
ripe,3663,3125,3204,2097,2519,1358,1276,1187,1131,1116,...,413,865,152,250,324,262,219,287,315,233
black,15,2752,9303,5286,4043,11,3948,21,2390,1946,...,818,1047,2,941,595,713,742,8,508,537


In [49]:
#calculate cosine similarities 

from scipy import spatial
def calc_cos(series_A, series_B):
    """Cosine similarity of two equal-length numeric vectors, rounded to 3 decimals.

    Parameters
    ----------
    series_A, series_B : array-like of numbers
        The vectors to compare (e.g. two columns of ``vector_df``).

    Returns
    -------
    float
        ``1 - cosine distance`` rounded to three decimals, or ``nan`` when
        the two vectors are element-wise identical.  The ``nan`` blanks out
        self-comparisons so that they cannot win a later "most similar"
        lookup.
    """
    # Work on float arrays so object-dtype columns are handled uniformly.
    vec_a = np.asarray(series_A, dtype=float)
    vec_b = np.asarray(series_B, dtype=float)
    # Only identical vectors are masked.  Testing the rounded similarity
    # against 1 would also discard distinct vectors that merely round to 1.000.
    if np.array_equal(vec_a, vec_b):
        return np.nan
    return round(1 - spatial.distance.cosine(vec_a, vec_b), 3)

In [50]:
# Pairwise cosine-similarity table between the top varieties.
cos_df = pd.DataFrame(columns=top_varieties, index=top_varieties)

for variety_A in cos_df.index:
    for variety_B in cos_df.columns:
        # Single .loc[row, col] assignment: the chained form
        # cos_df[variety_B].loc[variety_A] = ... may write to a temporary copy
        # and leave cos_df unchanged.
        cos_df.loc[variety_A, variety_B] = calc_cos(vector_df[variety_B], vector_df[variety_A])

# The frame starts out as object dtype; cast to float so that numeric
# reductions such as idxmax operate on real numbers.
cos_df = cos_df.astype(float)

cos_df

Unnamed: 0,Chardonnay,Pinot Noir,Cabernet Sauvignon,Red Blend,Bordeaux-style Red Blend,Sauvignon Blanc,Syrah,Riesling,Merlot,Zinfandel,...,Nebbiolo,Portuguese Red,Sparkling Blend,Shiraz,"Corvina, Rondinella, Molinara",Rhône-style Red Blend,Barbera,Pinot Gris,Cabernet Franc,Sangiovese Grosso
Chardonnay,,0.727,0.655,0.614,0.657,0.88,0.652,0.863,0.685,0.649,...,0.507,0.627,0.89,0.683,0.575,0.642,0.614,0.885,0.722,0.521
Pinot Noir,0.727,,0.846,0.808,0.8,0.734,0.843,0.693,0.878,0.868,...,0.723,0.753,0.788,0.844,0.736,0.854,0.78,0.79,0.908,0.726
Cabernet Sauvignon,0.655,0.846,,0.891,0.868,0.666,0.917,0.616,0.919,0.919,...,0.739,0.746,0.685,0.879,0.73,0.866,0.814,0.638,0.941,0.733
Red Blend,0.614,0.808,0.891,,0.845,0.659,0.879,0.602,0.897,0.832,...,0.804,0.727,0.69,0.871,0.807,0.93,0.848,0.618,0.915,0.802
Bordeaux-style Red Blend,0.657,0.8,0.868,0.845,,0.674,0.826,0.612,0.837,0.794,...,0.716,0.957,0.62,0.804,0.684,0.82,0.77,0.658,0.893,0.703
Sauvignon Blanc,0.88,0.734,0.666,0.659,0.674,,0.681,0.905,0.704,0.679,...,0.511,0.631,0.883,0.712,0.582,0.667,0.644,0.921,0.747,0.529
Syrah,0.652,0.843,0.917,0.879,0.826,0.681,,0.629,0.891,0.921,...,0.737,0.754,0.692,0.897,0.745,0.927,0.829,0.663,0.896,0.741
Riesling,0.863,0.693,0.616,0.602,0.612,0.905,0.629,,0.653,0.644,...,0.542,0.579,0.889,0.681,0.614,0.654,0.637,0.93,0.698,0.539
Merlot,0.685,0.878,0.919,0.897,0.837,0.704,0.891,0.653,,0.902,...,0.748,0.73,0.724,0.881,0.759,0.878,0.818,0.68,0.937,0.753
Zinfandel,0.649,0.868,0.919,0.832,0.794,0.679,0.921,0.644,0.902,,...,0.682,0.707,0.72,0.853,0.725,0.86,0.783,0.677,0.894,0.707


In [51]:
# Most similar variety by cosine similarity: for each column, the row label
# holding the largest value in cos_df.
cos_df.idxmax(axis=0)

Chardonnay                                     Sparkling Blend
Pinot Noir                                      Cabernet Franc
Cabernet Sauvignon                              Cabernet Franc
Red Blend                                Rhône-style Red Blend
Bordeaux-style Red Blend                        Portuguese Red
Sauvignon Blanc                                     Pinot Gris
Syrah                                    Rhône-style Red Blend
Riesling                                            Pinot Gris
Merlot                                          Cabernet Franc
Zinfandel                                                Syrah
Sangiovese                                             Barbera
Malbec                                             Tempranillo
White Blend                                    Sparkling Blend
Rosé                                            Cabernet Franc
Tempranillo                                             Malbec
Nebbiolo                                            San