# Importing data

In [1]:
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
init_notebook_mode(connected=True)

In [2]:
reviews = pd.read_csv('winemag-data-130k-v2.csv.zip', index_col=0)
reviews.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


# Country based analysis

In [3]:
countries = reviews.groupby('country').agg({'points':'mean','price':'mean','variety':'count'})
countries.head(3)

Unnamed: 0_level_0,points,price,variety
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,86.710263,24.510117,3800
Armenia,87.5,14.5,2
Australia,88.580507,35.437663,2329


In [4]:
px.choropleth(countries, locations=countries.index,locationmode='country names', color='points')

# Analysing points and prices distribution

In [5]:
reviews[['price','points']].fillna(value=reviews[['price','points']].mean()).values

array([[35.36338913, 87.        ],
       [15.        , 87.        ],
       [14.        , 87.        ],
       ...,
       [30.        , 90.        ],
       [32.        , 90.        ],
       [21.        , 90.        ]])

In [6]:
x = reviews[['price','points']].fillna(value=reviews[['price','points']].mean()).values 

In [7]:
x.shape

(129971, 2)

In [8]:
cov = np.cov(x,rowvar=False)
cov

array([[1566.34415673,   48.37851588],
       [  48.37851588,    9.23995971]])

In [9]:
mean = np.mean(x, axis=0,keepdims=True)
mean

array([[35.36338913, 88.44713821]])

In [10]:
def multivariate_gaussian_nonnormalized(x, d=2, mean=mean, covariance=cov):
    """pdf of the multivariate normal distribution."""
    x_m = x - mean
    return np.exp(np.diag(-(np.dot(np.dot(x_m,np.linalg.inv(cov)),x_m.T))))

In [11]:
X = np.arange(mean[0,0]-2*cov[0,0]**0.5,mean[0,0]+2*cov[0,0]**0.5, cov[0,0]**0.5/20)
Y = np.arange(mean[0,1]-2*cov[1,1]**0.5, mean[0,1]+2*cov[1,1]**0.5, cov[1,1]**0.5/20)
Xgrid, Ygrid = np.meshgrid(X,Y)
mapped = list(map(list,zip(Xgrid.ravel(),Ygrid.ravel())))
Z = multivariate_gaussian_nonnormalized(mapped).reshape(Ygrid.shape)

In [12]:
Y.shape

(81,)

In [13]:
X.shape

(80,)

In [14]:
Z.shape

(81, 80)

In [15]:
fig = go.Figure(data=[go.Surface(z=Z,x=X,y=Y)])

In [16]:
fig.show()

# Plotting average price vs average points of different wine

In [17]:
reviews[['winery','variety','points','price']].isna().sum()

winery        0
variety       1
points        0
price      8996
dtype: int64

In [18]:
avg_points_price = reviews[['winery','variety','points','price']].dropna().groupby('variety')\
                        .mean('points').sort_values(by='points', ascending=False)

In [19]:
avg_points_price

Unnamed: 0_level_0,points,price
variety,Unnamed: 1_level_1,Unnamed: 2_level_1
Terrantez,95.000000,236.0
Tinta del Pais,95.000000,47.5
Gelber Traminer,95.000000,35.0
Bual,94.333333,100.0
Riesling-Chardonnay,94.000000,40.0
...,...,...
Shiraz-Tempranillo,82.000000,8.5
Picapoll,82.000000,21.0
Aidani,82.000000,27.0
Airen,81.666667,9.0


In [20]:
scatter = go.Scatter(x=avg_points_price['points'],y=avg_points_price['price'], mode='markers')
layout = go.Layout(title = "price vs points", yaxis_title='price', xaxis_title='points', title_x=0.5)

In [21]:
iplot({"data": scatter,"layout": layout})

# Common wineries and wine variety heatmap

In [22]:
common_winery10 = reviews.winery.value_counts().index.tolist()[:10]
common_winery10

['Wines & Winemakers',
 'Testarossa',
 'DFJ Vinhos',
 'Williams Selyem',
 'Louis Latour',
 'Georges Duboeuf',
 'Chateau Ste. Michelle',
 'Concha y Toro',
 'Columbia Crest',
 'Kendall-Jackson']

In [23]:
common_wine10 = reviews.variety.value_counts().index.tolist()[:10]
common_wine10

['Pinot Noir',
 'Chardonnay',
 'Cabernet Sauvignon',
 'Red Blend',
 'Bordeaux-style Red Blend',
 'Riesling',
 'Sauvignon Blanc',
 'Syrah',
 'Rosé',
 'Merlot']

In [24]:
heatmapper = reviews.query('winery in @common_winery10 and \
                           variety in @common_wine10').groupby(['winery','variety'])['points'].count().reset_index()

In [29]:
heatmap = go.Heatmap(z=heatmapper['points'],
                    x=heatmapper['winery'],
                   y=heatmapper['variety'],
                    connectgaps=True )

In [30]:
iplot({"data": heatmap})