In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.figure_factory as ff

from plotly.offline import iplot
from glob import glob
from collections import Counter
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell

# Notebook display configuration.
# 'all' asks IPython to display the value of every top-level expression
# statement in a cell instead of only the last one, so any bare expression
# (including method calls that return a value) will be echoed as output.
InteractiveShell.ast_node_interactivity = 'all'
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

In [4]:
# Collect every CSV under ./data.
# glob() returns matches in arbitrary (filesystem-dependent) order, but the
# notebook later selects a file by position (paths[1]); sorting makes that
# position deterministic across machines and re-runs.
paths = sorted(glob('./data/*.csv'))
paths

['./data/winemag-data-130k-v2.csv', './data/winemag-data_first150k.csv']

In [18]:
# Load the second CSV found under ./data and discard its first column
# (positional slice: keep every row, columns 1 onward).
df = pd.read_csv(paths[1]).iloc[:, 1:]
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [37]:
# (country, number of rows) pairs, most frequent first.
# Counter.most_common() with no argument sorts all items by count in
# descending order, which is what the hand-written sort did.
count_countries = Counter(df['country']).most_common()

# Keep the five most frequent countries, re-sorted ascending by count
# (presumably so the largest bar ends up at the top of the horizontal chart).
top5_countries = sorted(count_countries[:5], key=lambda pair: pair[1])

# Bars are highlighted by country name rather than by bar position, so the
# colours stay attached to the right countries if the ranking changes or if
# fewer than five countries are present.
# NOTE(review): the previous version hard-coded the highlight colour at bar
# positions 2 and 4; these are assumed to be France and US (the two countries
# compared in the following cells) - confirm.
HIGHLIGHTED_COUNTRIES = {'US', 'France'}
HIGHLIGHT_COLOR = 'rgba(222,45,38,0.8)'
DEFAULT_COLOR = 'rgba(204,204,204,1)'

top5_names = [name for name, _ in top5_countries]
top5_counts = [count for _, count in top5_countries]
bar_colors = [
    HIGHLIGHT_COLOR if name in HIGHLIGHTED_COUNTRIES else DEFAULT_COLOR
    for name in top5_names
]

data = [
    go.Bar(
        y=top5_names,
        x=top5_counts,
        marker=dict(color=bar_colors),
        orientation='h'
    )
]

layout = go.Layout(title='Top 5 countries that has > 80 score wines')

fig = go.Figure(layout=layout, data=data)

iplot(fig)


In [42]:
# Pull the 'points' column for the two countries being compared.
is_us = df['country'] == 'US'
is_france = df['country'] == 'France'
US_sc = df.loc[is_us, 'points']
FR_sc = df.loc[is_france, 'points']

# One box trace per country: (trace name, values, marker colour).
box_specs = [
    ('US', US_sc, 'Red'),
    ('France', FR_sc, 'Blue'),
]

data = [
    go.Box(y=scores, name=label, marker=dict(color=shade))
    for label, scores, shade in box_specs
]

layout = go.Layout(title='Wines score')

fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [50]:
# Per-group inputs for the distribution plot: (label, values, colour).
# US_sc and FR_sc come from the box-plot cell above.
dist_specs = [
    ('US', US_sc, "Red"),
    ('France', FR_sc, "Blue"),
]

# Unzip the specs into the three parallel lists the plot call expects.
labels = [label for label, _, _ in dist_specs]
data = [scores for _, scores, _ in dist_specs]
colors = [shade for _, _, shade in dist_specs]

# Histogram with a bin width of 1 plus a fitted normal curve per group;
# the rug plot underneath is disabled.
fig = ff.create_distplot(
    hist_data=data,
    group_labels=labels,
    colors=colors,
    bin_size=1,
    show_rug=False,
    curve_type='normal',
)

fig['layout'].update(title='Distribution of wines score')

iplot(fig)