In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.figure_factory as ff

from plotly.offline import iplot
from glob import glob
from collections import Counter
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell

from statsmodels.stats.weightstats import ztest

# Echo the value of every bare expression in a cell, not only the last one.
# Later cells rely on this to display two results from a single cell.
InteractiveShell.ast_node_interactivity = 'all'
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Collect the CSV files under ./data.
# glob() yields matches in arbitrary, filesystem-dependent order, and the
# dataset is later chosen by position (paths[1]); sorting makes that choice
# reproducible across machines and runs.
paths = sorted(glob('./data/*.csv'))
paths

['./data/winemag-data-130k-v2.csv', './data/winemag-data_first150k.csv']

In [3]:
# Read the second discovered CSV and keep every column except the first one
# (presumably a row index that was written out with the file -- confirm).
df = pd.read_csv(paths[1]).iloc[:, 1:]
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [4]:
# Split the review scores ('points') into one sample per country.
is_us = df['country'].eq('US')
is_france = df['country'].eq('France')

US_sc = df.loc[is_us, 'points']
FR_sc = df.loc[is_france, 'points']

# Descriptive statistics

### Sample sizes (number of observations per country)

In [5]:
# Number of scored reviews in each sample (US first, then France).
US_sc.size
FR_sc.size

62397

21098

### Mean of score

In [6]:
# Mean score of each sample (US first, then France).
US_sc.mean()
FR_sc.mean()

87.81878936487331

88.92586975068727

### Variance of score

In [7]:
# Variance of each sample, obtained by squaring the standard deviation.
# ddof=0 (divide by n) is spelled out because that is what np.std uses by
# default, whereas pandas' own default for .std() is ddof=1.
US_sc.std(ddof=0) ** 2
FR_sc.std(ddof=0) ** 2

11.6299189279713

10.237560919995248

# Hypothesis testing

$$H_0:  \mu_A \geq \mu_F$$
$$H_1: \mu_A < \mu_F$$

$$\alpha = 0.05$$

<hr>

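$$Z_{cal} = \frac{(\bar{x}_A - \bar{x}_F) - d}{\sqrt{\frac{S_A^2}{n_A} + \frac{S_F^2}{n_F}}}$$

where $d$ is the hypothesized difference between the two means (here $d = 0$), $S^2$ is the sample variance, and $n$ is the sample size of each group.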

In [8]:
# One-sided two-sample z-test: the alternative is mean(US) - mean(France) < 0.
# Returns (z statistic, p-value). NOTE: statsmodels' ztest estimates the
# standard error from a pooled variance by default (usevar='pooled').
ztest(US_sc, FR_sc, value=0, alternative='smaller')

(-41.39314684814284, 0.0)

P-value: 0.0

$$\text{Reject } H_0$$
$$\text{when}$$
$$p\text{-value} < \alpha$$
$$0.0 < 0.05$$

$$\therefore \text{Accept } H_{1}$$