In [3]:
## Import packages 
import pandas as pd
import numpy as np
import nltk
import re

In [59]:
# Load wine dataset 
df = pd.read_csv('winemag-data_first150k.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [5]:
# Copy Paige's DANK char removal 

def remove_weird_char(string):
    words = string.lower().split()
    refined = []
    for i in words:
        refined.append(re.sub("[^\w]+", "", i))
        sentence = ' '.join([word for word in refined])
    return(sentence)

In [60]:
#Clean up that description column
df['description'] = df['description'].str.lower()
df['description'] = df['description'].apply(remove_weird_char)

df['description'].head()

0    this tremendous 100 varietal wine hails from o...
1    ripe aromas of fig blackberry and cassis are s...
2    mac watson honors the memory of a wine once ma...
3    this spent 20 months in 30 new french oak and ...
4    this is the top wine from la bégude named afte...
Name: description, dtype: object

In [7]:
# Make dataframe of just description column 

desc = df['description'].str.cat(sep=' ')

In [8]:
#tokenize, tag, and turn into a dataframe
from nltk import pos_tag, word_tokenize

pos_df = pd.DataFrame(pos_tag(word_tokenize(desc)), columns = ['Word', 'POS'])
pos_df['POS'].value_counts()[:5]

NN    1538436
JJ     919076
IN     690797
DT     642043
CC     469040
Name: POS, dtype: int64

In [9]:
pos_df['Word'].value_counts()[:5]

and     404954
the     258453
a       215624
of      184159
with    152656
Name: Word, dtype: int64

In [10]:
## Remove stopwords
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")
def rmStopWords(text):
        text = ' '.join([word for word in text.split() if word not in cachedStopWords])
        return(text)

In [11]:
# Clean that description 
clean_desc = rmStopWords(desc)

In [12]:
# DO IT AGAIN 
clean_pos_df = pd.DataFrame(pos_tag(word_tokenize(clean_desc)), columns = ['Word', 'POS'])
clean_pos_df['POS'].value_counts()[:5]

NN     1319977
JJ      980764
NNS     398566
RB      222529
VBP     204431
Name: POS, dtype: int64

In [13]:
clean_pos_df['Word'].value_counts()[:100]

wine          87628
flavors       77838
fruit         56503
finish        37723
aromas        35830
acidity       32604
tannins       32186
cherry        30685
palate        28945
ripe          26722
black         24614
drink         23630
dry           22977
spice         22658
sweet         21290
rich          21172
oak           19670
notes         19607
red           19208
soft          17745
fresh         17666
good          17291
berry         17104
nose          15877
shows         15766
blackberry    14974
crisp         13960
blend         13666
vanilla       13319
plum          13286
              ...  
mouthfeel      7390
little         7250
cola           7226
style          7118
toast          7075
one            7069
wood           7000
pear           6950
merlot         6887
made           6881
great          6879
herbal         6868
flavor         6865
tart           6822
creamy         6799
slightly       6785
complex        6640
elegant        6631
balance        6547


In [98]:
#calculate lift values

def calc_lift(a, b):
    total_size = len(test_df)
    filter_a = test_df[test_df['message'].str.contains(a)]
    num_a = len(filter_a)
    num_b = len(test_df[test_df['message'].str.contains(b)])
    num_a_b = len(filter_a['message'][filter_a['message'].str.contains(b)])
    if num_a_b==0:
        return .2
    return total_size*float(num_a_b)/float(num_a*num_b)

In [18]:
variety_names = set(df['variety'])
variety_names

{'Agiorgitiko',
 'Aglianico',
 'Aidani',
 'Airen',
 'Albana',
 'Albariño',
 'Albarossa',
 'Albarín',
 'Aleatico',
 'Alfrocheiro',
 'Alicante',
 'Alicante Bouschet',
 'Aligoté',
 'Alsace white blend',
 'Altesse',
 'Alvarelhão',
 'Alvarinho',
 'Alvarinho-Chardonnay',
 'Angevine',
 'Ansonica',
 'Antão Vaz',
 'Apple',
 'Aragonez',
 'Aragonês',
 'Argaman',
 'Arinto',
 'Arneis',
 'Asprinio',
 'Assyrtico',
 'Assyrtiko',
 'Athiri',
 'Austrian Red Blend',
 'Austrian white blend',
 'Auxerrois',
 'Avesso',
 'Azal',
 'Baco Noir',
 'Baga',
 'Baga-Touriga Nacional',
 'Barbera',
 'Bastardo',
 'Bical',
 'Black Monukka',
 'Black Muscat',
 'Blatina',
 'Blauburgunder',
 'Blauer Portugieser',
 'Blaufränkisch',
 'Bobal',
 'Bombino Bianco',
 'Bonarda',
 'Bordeaux-style Red Blend',
 'Bordeaux-style White Blend',
 'Bovale',
 'Boğazkere',
 'Brachetto',
 'Braucol',
 'Bual',
 'Bukettraube',
 'Cabernet',
 'Cabernet Blend',
 'Cabernet Franc',
 'Cabernet Franc-Cabernet Sauvignon',
 'Cabernet Franc-Carmenère',
 'Cab

In [46]:
grouped = df.groupby(['variety'], as_index=False)
grouped.size()

## Get list of wine varities that have < 1000 reviews 
name_list = []
for name in variety_names:
    var_len = (len(df[df['variety']==str(name)]))
    if var_len < 1000:
        name_list.append(str(name))

name_list = set(name_list)

name_list

{'Agiorgitiko',
 'Aglianico',
 'Aidani',
 'Airen',
 'Albana',
 'Albariño',
 'Albarossa',
 'Albarín',
 'Aleatico',
 'Alfrocheiro',
 'Alicante',
 'Alicante Bouschet',
 'Aligoté',
 'Alsace white blend',
 'Altesse',
 'Alvarelhão',
 'Alvarinho',
 'Alvarinho-Chardonnay',
 'Angevine',
 'Ansonica',
 'Antão Vaz',
 'Apple',
 'Aragonez',
 'Aragonês',
 'Argaman',
 'Arinto',
 'Arneis',
 'Asprinio',
 'Assyrtico',
 'Assyrtiko',
 'Athiri',
 'Austrian Red Blend',
 'Austrian white blend',
 'Auxerrois',
 'Avesso',
 'Azal',
 'Baco Noir',
 'Baga',
 'Baga-Touriga Nacional',
 'Bastardo',
 'Bical',
 'Black Monukka',
 'Black Muscat',
 'Blatina',
 'Blauburgunder',
 'Blauer Portugieser',
 'Blaufränkisch',
 'Bobal',
 'Bombino Bianco',
 'Bonarda',
 'Bovale',
 'Boğazkere',
 'Brachetto',
 'Braucol',
 'Bual',
 'Bukettraube',
 'Cabernet',
 'Cabernet Blend',
 'Cabernet Franc-Cabernet Sauvignon',
 'Cabernet Franc-Carmenère',
 'Cabernet Franc-Malbec',
 'Cabernet Franc-Merlot',
 'Cabernet Franc-Tempranillo',
 'Cabernet Me

In [61]:
df['variety'].value_counts()[:25]

Chardonnay                       14482
Pinot Noir                       14291
Cabernet Sauvignon               12800
Red Blend                        10062
Bordeaux-style Red Blend          7347
Sauvignon Blanc                   6320
Syrah                             5825
Riesling                          5524
Merlot                            5070
Zinfandel                         3799
Sangiovese                        3345
Malbec                            3208
White Blend                       2824
Rosé                              2817
Tempranillo                       2556
Nebbiolo                          2241
Portuguese Red                    2216
Sparkling Blend                   2004
Shiraz                            1970
Corvina, Rondinella, Molinara     1682
Rhône-style Red Blend             1505
Pinot Gris                        1365
Barbera                           1365
Cabernet Franc                    1363
Sangiovese Grosso                 1346
Name: variety, dtype: int

In [62]:
mask = ~df['variety'].isin(name_list)
df[mask].shape

df = df[mask]
## Remove all reviews from the brands with < 1000 reviews 

In [63]:
variety_names = set(df['variety'])
variety_names
len(variety_names)
## dealing with top 31 varietys 

31

In [64]:
grouped = df.groupby(df['variety'])
grouped.size()

variety
Barbera                           1365
Bordeaux-style Red Blend          7347
Bordeaux-style White Blend        1261
Cabernet Franc                    1363
Cabernet Sauvignon               12800
Champagne Blend                   1238
Chardonnay                       14482
Corvina, Rondinella, Molinara     1682
Grüner Veltliner                  1042
Malbec                            3208
Merlot                            5070
Nebbiolo                          2241
Pinot Grigio                      1305
Pinot Gris                        1365
Pinot Noir                       14291
Port                              1058
Portuguese Red                    2216
Red Blend                        10062
Rhône-style Red Blend             1505
Riesling                          5524
Rosé                              2817
Sangiovese                        3345
Sangiovese Grosso                 1346
Sauvignon Blanc                   6320
Shiraz                            1970
Sparkling Blend  

In [88]:
dict = {}
for var in variety_names:
    dict[var] = df[df['variety']== str(var)]['description'].str.cat(sep=' ')

In [92]:
test_df = pd.DataFrame.from_dict(orient='index', data=dict)
test_df.columns = ['message']

In [93]:
test_df

Unnamed: 0,message
Tempranillo,nicely oaked blackberry licorice vanilla and c...
Pinot Noir,this spent 20 months in 30 new french oak and ...
White Blend,this wine is dry and substantial emphasizing e...
Bordeaux-style White Blend,from one of the many recent exceptional years ...
Grüner Veltliner,much like its companion from the revelation s...
Sangiovese Grosso,this fresh rosso di montalcino is redolent of ...
Viognier,juicy kiwi lime blossom and sour apple candy a...
Sauvignon Blanc,mac watson honors the memory of a wine once ma...
Champagne Blend,fresh whiffs of strawberry and blossom go hand...
Red Blend,this bright savory wine delivers aromas and fl...


In [96]:
cleaner_desc = test_df['message'].str.cat(sep=' ')
cleaner_desc = rmStopWords(cleaner_desc)

cleaner_pos_df = pd.DataFrame(pos_tag(word_tokenize(cleaner_desc)), columns = ['Word', 'POS'])
cleaner_pos_df['POS'].value_counts()[:5]

NN     1086390
JJ      806852
NNS     332982
RB      185721
VBP     168885
Name: POS, dtype: int64

In [97]:
cleaner_pos_df['Word'].value_counts()[:20]

wine       71420
flavors    65069
fruit      46930
finish     30462
tannins    28184
aromas     26916
cherry     26827
acidity    26634
ripe       22744
palate     22285
black      21157
dry        19787
drink      19770
spice      18777
rich       18221
oak        17600
sweet      17260
notes      15920
red        15901
soft       14833
Name: Word, dtype: int64

In [100]:
aset = list(variety_names)
new_test_df = pd.DataFrame(columns=aset, index=aset)

for brand1, series in list(new_test_df.iterrows()):
    for brand2 in series.index:
        new_test_df[brand2].loc[brand1] = calc_lift(brand1, brand2)

new_test_df

ValueError: cannot index with vector containing NA / NaN values

In [101]:
new_test_df

Unnamed: 0,Tempranillo,Pinot Noir,White Blend,Bordeaux-style White Blend,Grüner Veltliner,Sangiovese Grosso,Viognier,Sauvignon Blanc,Champagne Blend,Red Blend,...,Chardonnay,Malbec,Sparkling Blend,Bordeaux-style Red Blend,Pinot Grigio,Pinot Gris,Zinfandel,Portuguese Red,Cabernet Franc,Port
Tempranillo,,,,,,,,,,,...,,,,,,,,,,
Pinot Noir,,,,,,,,,,,...,,,,,,,,,,
White Blend,,,,,,,,,,,...,,,,,,,,,,
Bordeaux-style White Blend,,,,,,,,,,,...,,,,,,,,,,
Grüner Veltliner,,,,,,,,,,,...,,,,,,,,,,
Sangiovese Grosso,,,,,,,,,,,...,,,,,,,,,,
Viognier,,,,,,,,,,,...,,,,,,,,,,
Sauvignon Blanc,,,,,,,,,,,...,,,,,,,,,,
Champagne Blend,,,,,,,,,,,...,,,,,,,,,,
Red Blend,,,,,,,,,,,...,,,,,,,,,,
