In [190]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from IPython.display import display
import scipy
from scipy.stats import chi2

In [192]:
# Chi-squared probability density evaluated at x=6 with df=0.05
# (scipy's signature is pdf(x, df)).
# NOTE(review): if the 5%-level critical value for 6 degrees of freedom was
# intended, that would be chi2.ppf(0.95, df=6) — confirm.
chi2.pdf(6, 0.05)

0.00021621200880850682

In [93]:
# Load the dataset and discard the leftover index column written by a previous to_csv.
data = pd.read_csv('data.csv').drop(columns='Unnamed: 0')

In [94]:
# Number of distinct values per column, to see which features are low-cardinality.
data.nunique()

LotArea         1073
GarageArea       441
MSZoning           5
LotFrontage      110
Condition1         9
HouseStyle         8
SalePrice        663
YearBuilt        112
YrSold             5
Neighborhood      25
dtype: int64

In [95]:
# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,LotArea,GarageArea,MSZoning,LotFrontage,Condition1,HouseStyle,SalePrice,YearBuilt,YrSold,Neighborhood
0,5925,672,RM,50.0,Norm,1Story,89471,1937,2007,OldTown
1,9158,474,RL,76.0,Norm,1Story,225000,2007,2007,Somerst
2,12400,297,RL,66.0,Feedr,1Story,155000,1958,2009,NAmes
3,8520,720,RL,71.0,Artery,1.5Fin,166000,1952,2010,NAmes
4,17217,0,RL,90.0,Norm,1Story,84500,2006,2006,Mitchel


In [96]:
# Bucket sale prices into four ordered bands. pd.cut intervals are right-closed by
# default, so a price of exactly 100000 lands in "<100k". The top band is open-ended
# (np.inf) so prices above 1,000,000 are labelled "300k+" instead of silently
# becoming NaN, which the previous hard-coded 1000*1000 upper edge would have caused.
data['SalePriceCat'] = pd.cut(data.SalePrice,
                              bins=[0, 100 * 1000, 200 * 1000, 300 * 1000, np.inf],
                              labels=["<100k", "100-200k", "200-300k", "300k+"])

In [97]:
# Rows per price band (value_counts skips NaN, so unbinned prices would not appear here).
data.SalePriceCat.value_counts()

100-200k    910
200-300k    312
<100k       123
300k+       115
Name: SalePriceCat, dtype: int64

In [98]:
# Split LotArea into three bands at its 33rd and 66th percentiles. The 100th
# percentile (the maximum) closes the top band; pd.cut's right-closed intervals
# keep that maximum inside "high".
data['LotAreaCat'] = pd.cut(
    data.LotArea,
    bins=[0, *np.percentile(data.LotArea, [33, 66, 100])],
    labels=["small", "mid", "high"],
)
data.LotAreaCat.value_counts()

high     497
small    482
mid      481
Name: LotAreaCat, dtype: int64

In [99]:
# Keep only the three categorical columns used in the association analysis.
work_data = data.loc[:, ['YrSold', 'LotAreaCat', 'SalePriceCat']]

In [100]:
# Show the working frame (the notebook renders a truncated head/tail view).
work_data

Unnamed: 0,YrSold,LotAreaCat,SalePriceCat
0,2007,small,<100k
1,2007,mid,200-300k
2,2009,high,100-200k
3,2010,mid,100-200k
4,2006,high,<100k
...,...,...,...
1455,2006,small,100-200k
1456,2006,small,100-200k
1457,2009,mid,200-300k
1458,2010,high,200-300k


In [210]:
def calculate_all(first_col, second_col):
    """Report the association between two categorical columns.

    Prints/displays, in order: the contingency table with margins, the joint
    probabilities p(a, b), the row-conditional probabilities p(b | a), the
    Quetelet relative index table q(b | a) = (p(b | a) - p(b)) / p(b), the
    summary Quetelet index Q = sum over cells of p(a, b) * q(b | a), Pearson's
    chi-squared statistic with its degrees of freedom, and the sample sizes
    at which the observed Q would reach the 95% and 99% critical values.

    Parameters
    ----------
    first_col : pandas.Series
        Variable ``a``; its categories form the rows of every table.
    second_col : pandas.Series
        Variable ``b``; its categories form the columns of every table.

    Returns
    -------
    None
        All results are emitted through ``print`` and ``display``.

    Notes
    -----
    Q equals Pearson's chi-squared divided by the number of observations N,
    which is why ``N * Q`` is printed next to the chi-squared value. That
    identity only holds for the uncorrected statistic, so Yates' continuity
    correction (which scipy applies to 2x2 tables when ``correction=True``)
    is switched off.
    """
    cont = pd.crosstab(first_col, second_col, margins=True)
    print("Contingency table")
    display(cont)

    probs = pd.crosstab(first_col, second_col, normalize=True)
    print("Probabilities p(a, b)")
    display(probs)

    # normalize='index' divides every row by its own total, i.e. it conditions
    # on the row variable a and yields p(b | a).
    cond_probs = pd.crosstab(first_col, second_col, normalize='index')
    print("Conditional probabilities p(b | a)")
    display(cond_probs)

    # Take p(b) and N from the cross-tabulated data rather than from the raw
    # columns: crosstab drops pairs with a missing value, so the raw column
    # size / value_counts would disagree with the tables when NaNs are present.
    marginal_probs = probs.sum(axis=0)
    observed = cont.iloc[:-1, :-1].to_numpy()  # strip the "All" margins
    n_obs = int(observed.sum())

    quetelet = (cond_probs
                .sub(marginal_probs, axis='columns')
                .div(marginal_probs, axis='columns'))
    print("Quetelet relative index table")
    display(quetelet)

    sum_quetelet = (quetelet * probs).to_numpy().sum()
    print("Summary Quetelet association index", sum_quetelet)
    print("Quetelet association index means how strongly the data is correlated to each other")

    chi2_val, p, dof, expected = scipy.stats.chi2_contingency(
        observed, correction=False)
    print("Chi-squared", chi2_val)
    print("%s*%s=%s" % (n_obs, sum_quetelet, chi2_val))
    print("Degrees of freedom:", dof)
    print()

    # chi2 = N * Q, so crit / Q is the smallest N at which an association of
    # this strength reaches the critical value for the given confidence.
    for percent in (95, 99):
        crit = chi2.ppf(percent / 100, df=dof)
        print("chi-squared with %d%% confidence is" % percent, crit)
        print("There should be at least %s samples " % (crit / sum_quetelet))

In [211]:
# Association between price band (rows) and lot-size band (columns).
calculate_all(work_data['SalePriceCat'], work_data['LotAreaCat'])

Contingency table


LotAreaCat,small,mid,high,All
SalePriceCat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<100k,74,34,15,123
100-200k,357,321,232,910
200-300k,46,106,160,312
300k+,5,20,90,115
All,482,481,497,1460


Probabilities p(a, b)


LotAreaCat,small,mid,high
SalePriceCat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<100k,0.050685,0.023288,0.010274
100-200k,0.244521,0.219863,0.158904
200-300k,0.031507,0.072603,0.109589
300k+,0.003425,0.013699,0.061644


Conditional probabilities p(a | b)


LotAreaCat,small,mid,high
SalePriceCat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<100k,0.601626,0.276423,0.121951
100-200k,0.392308,0.352747,0.254945
200-300k,0.147436,0.339744,0.512821
300k+,0.043478,0.173913,0.782609


Quetelet relative index table


Unnamed: 0_level_0,small,mid,high
SalePriceCat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<100k,0.822353,-0.160962,-0.641753
100-200k,0.188318,0.070709,-0.251067
200-300k,-0.55341,0.031238,0.506475
300k+,-0.868302,-0.472114,1.299011


Summary Quetelet association index 0.16400838369793413
Quetelet association index means how strongly the data is correlated to each other
Chi-squared 239.45224019898382
1460*0.16400838369793413=239.45224019898382
Deegress of freedom: 6

chi-squared with 95% confidence is  12.591587243743977
There should be at least 76.7740463007964 samples 
chi-squared with 99% confidence is 16.811893829770927
There should be at least 102.50630760885116 samples 


# YrSold vs LotAreaCat

In [199]:
# Association between year of sale (rows) and lot-size band (columns).
calculate_all(work_data['YrSold'], work_data['LotAreaCat'])

Contingency table


LotAreaCat,small,mid,high,All
YrSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006,107,86,121,314
2007,109,112,108,329
2008,99,101,104,304
2009,113,114,111,338
2010,54,68,53,175
All,482,481,497,1460


Probabilities p(a, b)


LotAreaCat,small,mid,high
YrSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006,0.073288,0.058904,0.082877
2007,0.074658,0.076712,0.073973
2008,0.067808,0.069178,0.071233
2009,0.077397,0.078082,0.076027
2010,0.036986,0.046575,0.036301


Conditional probabilities p(a | b)


LotAreaCat,small,mid,high
YrSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006,0.340764,0.273885,0.38535
2007,0.331307,0.340426,0.328267
2008,0.325658,0.332237,0.342105
2009,0.33432,0.337278,0.328402
2010,0.308571,0.388571,0.302857


Quetelet relative index table


Unnamed: 0_level_0,small,mid,high
YrSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006,0.032191,-0.168664,0.132015
2007,0.003544,0.033308,-0.035673
2008,-0.013567,0.008453,0.004977
2009,0.012669,0.023755,-0.035277
2010,-0.065323,0.179448,-0.110319


Summary Quetelet association index 0.005655795301277622
Quetelet association index means how strongly the data is correlated to each other
Chi-squared 8.25746113986533
1460*0.005655795301277622=8.25746113986533
Deegress of freedom: 8
chi-squared with 95% confidence is 12.592
There should be at least 2226.388921316781 samples 
chi-squared with 99.9% confidence is 22.458
There should be at least 3970.7943452138065 samples 
