In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook', font_scale=1.5)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

wine_reviews = pd.read_csv("../data/winemag-data-130k-v2.csv", index_col=0)
print("Before removing duplicates:", len(wine_reviews))
wine_reviews.tail()

Before removing duplicates: 129971


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss
129970,France,"Big, rich and off-dry, this is powered by inte...",Lieu-dit Harth Cuvée Caroline,90,21.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car...,Gewürztraminer,Domaine Schoffit


In [2]:
wine_reviews = wine_reviews.drop_duplicates()
print("Removing duplicates based on all columns:", len(wine_reviews))

Removing duplicates based on all columns: 119988


In [3]:
wine_reviews_ddp = wine_reviews.drop_duplicates('description')
print("Removing duplicates based on description:", len(wine_reviews_ddp))

Removing duplicates based on description: 119955


In [4]:
# full join two dedupped data and find the rows only in the first with '_merge' flag
wine_reviews_all = wine_reviews.merge(wine_reviews_ddp, how='outer', indicator=True)
dup_wine_desc = wine_reviews_all[wine_reviews_all['_merge']=='left_only'].description

wine_reviews_all[wine_reviews_all['description'].isin(dup_wine_desc)]

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,_merge
2159,Italy,"Easygoing and enjoyable, this has aromas of ma...",,84,15.0,Central Italy,Montepulciano d'Abruzzo,,Kerin O’Keefe,@kerinokeefe,Valori 2014 Montepulciano d'Abruzzo,Montepulciano,Valori,both
4216,Italy,This zesty red has pretty aromas that suggest ...,,88,30.0,Tuscany,Rosso di Montalcino,,Kerin O’Keefe,@kerinokeefe,Ridolfi 2014 Rosso di Montalcino,Sangiovese,Ridolfi,both
8859,US,"Hugely delicious, just a joy to drink. So soft...",Vintner Select Cuvée,93,130.0,California,Napa Valley,Napa,,,Pride Mountain 2008 Vintner Select Cuvée Caber...,Cabernet Sauvignon,Pride Mountain,both
12090,US,"Seductively tart in lemon pith, cranberry and ...",Woods Vineyard Rosé of,91,29.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,Ousterhout 2014 Woods Vineyard Rosé of Pinot N...,Pinot Noir,Ousterhout,both
13063,US,"Cigar box, café au lait, and dried tobacco aro...",Estate Grown,88,30.0,Washington,Red Mountain,Columbia Valley,Sean P. Sullivan,@wawinereport,Ambassador Vineyard 2013 Estate Grown Syrah (R...,Syrah,Ambassador Vineyard,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109793,US,The tannins are so fierce on this baby Caberne...,,95,55.0,California,Howell Mountain,Napa,,,La Jota Vineyard 2009 Merlot (Howell Mountain),Merlot,La Jota Vineyard,left_only
115505,Italy,"Sleek by Amarone standards, this opens with ar...",,90,40.0,Veneto,Amarone della Valpolicella,,Kerin O’Keefe,@kerinokeefe,Villa Annaberta 2012 Amarone della Valpolicella,Red Blend,Villa Annaberta,left_only
115743,Greece,"Aromas of citrus, melon and orange blossom sta...",,89,22.0,Cephalonia,,,Susan Kostrzewa,@suskostrzewa,Gentilini 2015 Robola (Cephalonia),Robola,Gentilini,left_only
117429,Italy,"Ripe plum, game, truffle, leather and menthol ...",,87,40.0,Tuscany,Brunello di Montalcino,,Kerin O’Keefe,@kerinokeefe,Poggiarellino 2011 Brunello di Montalcino,Sangiovese,Poggiarellino,left_only


In [5]:
wine_reviews.reset_index().tail()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
119983,129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
119984,129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
119985,129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
119986,129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss
119987,129970,France,"Big, rich and off-dry, this is powered by inte...",Lieu-dit Harth Cuvée Caroline,90,21.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car...,Gewürztraminer,Domaine Schoffit


In [6]:
wine_reviews = wine_reviews.reset_index(drop = True)

In [7]:
wine_reviews.describe()

Unnamed: 0,points,price
count,119988.0,111593.0
mean,88.442236,35.620747
std,3.092915,42.103728
min,80.0,4.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,42.0
max,100.0,3300.0


One Hot Encoder


In [8]:
enc_cols = wine_reviews.drop(['description', 'points', 'price', 'designation', 'winery'], axis = 1)
dummies = pd.get_dummies(enc_cols, prefix = ['country', 'province', 'region_1', 'variety'])

# combined with log transformed 'price'
X_encoded = pd.concat([np.log(wine_reviews['price']), dummies], axis = 1)
X_encoded.shape


ValueError: Length of 'prefix' (4) did not match the length of the columns being encoded (8).