**Are ratings and continent dependent on each other ?**

In [25]:
import pandas as pd 
import numpy as np
from scipy import stats as st
from scipy.stats import chi2_contingency
from scipy.stats import chi2

**Prepare Data that is rating and continent**

In [26]:
all_rating_processed = pd.read_csv('all_rating_processed.csv', sep = '\t', encoding='utf-8')

In [27]:
all_rating_processed['Country'].unique()

array(['Undefined', 'UnitedStates', 'France', 'Poland', 'Belgium',
       'UnitedKingdom', 'Sweden', 'Finland', 'Austria', 'Norway',
       'Germany', 'Netherlands', 'Canada', 'Switzerland', 'Italy',
       'Russia', 'Spain', 'Portugal', 'Lithuania', 'Denmark', 'England',
       'Scotland', 'Ireland', 'Hungary', 'Greece', 'Bulgaria', 'Croatia',
       'Belarus', 'Serbia', 'Slovakia', 'Ukraine', 'Estonia',
       'Luxembourg', 'CzechRepublic', 'Turkey', 'Latvia',
       'BosniaandHerzegovina', 'FaroeIslands'], dtype=object)

In [28]:
all_rating_processed['Continent'].unique()

array(['Undefined', 'NorthAmerica', 'Europe'], dtype=object)

In [29]:
all_rating_processed.head(5)

Unnamed: 0.1,Unnamed: 0,Game_name,Gamer,Date,Rating,Location,file,Country,Continent
0,0,Gloomhaven,guile678,Mar2020,8.7,Undefined,0_174430_20_pages.csv,Undefined,Undefined
1,1,Gloomhaven,purplephoenixgames,Mar2020,-1e-05,"Milan,Illinois·UnitedStates",0_174430_20_pages.csv,UnitedStates,NorthAmerica
2,2,Gloomhaven,oneover,Mar2020,8.0,Undefined,0_174430_20_pages.csv,Undefined,Undefined
3,3,Gloomhaven,MrKek,Mar2020,-1e-05,"Honolulu,Hawaii·UnitedStates",0_174430_20_pages.csv,UnitedStates,NorthAmerica
4,4,Gloomhaven,queerchameleon,Mar2020,4.0,"GroveCity,Ohio·UnitedStates",0_174430_20_pages.csv,UnitedStates,NorthAmerica


In [30]:
all_rating_processed.drop(['Unnamed: 0','Location','file'], axis=1, inplace = True)
all_rating_processed.describe()

Unnamed: 0,Rating
count,908577.0
mean,6.970318
std,2.325704
min,-1e-05
25%,6.5
50%,7.5
75%,8.0
max,10.0


In [31]:
NA_data = all_rating_processed[all_rating_processed['Continent'] =='NorthAmerica']
Undef_data = all_rating_processed[all_rating_processed['Continent'] =='Undefined']
EU_data = all_rating_processed[all_rating_processed['Continent'] =='Europe']

In [32]:
all_rating_processed.dtypes

Game_name     object
Gamer         object
Date          object
Rating       float64
Country       object
Continent     object
dtype: object

**Contingency Table showing correlation between Rating and Continent**

In [33]:
contigency_data = pd.crosstab(all_rating_processed['Rating'],
                            all_rating_processed['Continent'], 
                               margins = False)
contigency_data

Continent,Europe,NorthAmerica,Undefined
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.00001,19108,34183,8982
1.00000,665,755,709
1.10000,1,5,12
1.20000,0,1,0
1.30000,1,2,1
1.40000,1,1,2
1.50000,11,11,8
1.60000,2,0,0
1.70000,0,0,2
1.80000,2,2,1


**Get chi-square value , p-value, degrees of freedom, expected frequencies using the function chi2_contingency**

In [34]:
stat, p, dof, expected = chi2_contingency(contigency_data)
print("Chi-Square Test Statistic",stat)
print("P-value", p)
print("degree of freedom", dof)
print("expected",expected)

Chi-Square Test Statistic 7414.277773903627
P-value 0.0
degree of freedom 182
expected [[2.34815454e+04 2.69564750e+04 1.18349796e+04]
 [8.02791100e+02 9.21592588e+02 4.04616312e+02]
 [6.78733668e+00 7.79176448e+00 3.42089883e+00]
 [3.77074260e-01 4.32875805e-01 1.90049935e-01]
 [1.50829704e+00 1.73150322e+00 7.60199741e-01]
 [1.50829704e+00 1.73150322e+00 7.60199741e-01]
 [1.13122278e+01 1.29862741e+01 5.70149806e+00]
 [7.54148520e-01 8.65751609e-01 3.80099870e-01]
 [7.54148520e-01 8.65751609e-01 3.80099870e-01]
 [1.88537130e+00 2.16437902e+00 9.50249676e-01]
 [1.13122278e+00 1.29862741e+00 5.70149806e-01]
 [1.04524985e+03 1.19993173e+03 5.26818420e+02]
 [3.39366834e+00 3.89588224e+00 1.71044942e+00]
 [4.52489112e+00 5.19450966e+00 2.28059922e+00]
 [2.63951982e+00 3.03013063e+00 1.33034955e+00]
 [1.50829704e+00 1.73150322e+00 7.60199741e-01]
 [4.26093914e+01 4.89149659e+01 2.14756427e+01]
 [3.01659408e+00 3.46300644e+00 1.52039948e+00]
 [7.54148520e-01 8.65751609e-01 3.80099870e-01]
 

**select significance value as alpha = 0.05 and determine whether to reject or keep your null hypothesis**

In [35]:
alpha = 0.05
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Variables are associated (reject H0)')
else:
    print('Variables are not associated(fail to reject H0)')

significance=0.050, p=0.000
Variables are associated (reject H0)


**chi-squared test with similar proportions**

**interpret test-statistic**

In [36]:
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

probability=0.950, critical=214.477, stat=7414.278
Dependent (reject H0)


**interpret p-value**

In [37]:
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

significance=0.050, p=0.000
Dependent (reject H0)


**Chi-squared test statistic, sample size, and minimum of rows and columns**

In [38]:
X2 = chi2_contingency(contigency_data, correction=False)[0]
n = np.sum(contigency_data)
minDim = min(contigency_data.shape)-1

**calculate Cramer's V**

In [39]:
V = np.sqrt((X2/n) / minDim)
print(V)

Continent
Europe          0.104022
NorthAmerica    0.097086
Undefined       0.146523
dtype: float64
