# Libraries

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import pylab as plt
import scipy.stats

from IPython.display import display

from subprocess import check_output
# Show which files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Explore the Data

In [None]:
# Load the training data from the .csv file.
house_train = pd.read_csv("../input/train.csv")
# Preview the first five records, transposed so that each column of the data
# becomes a row; only the first 30 of those rows are shown.
house_train.head().transpose().iloc[:30]

In [None]:
# Data type pandas assigned to the HeatingQC column
house_train.dtypes['HeatingQC']

In [None]:
# Distinct values that occur in the HeatingQC column
heating_qc_values = house_train['HeatingQC'].values
set(heating_qc_values)

In [None]:
# Number of missing entries in the HeatingQC column
house_train['HeatingQC'].isna().sum()

In [None]:
# Data type pandas assigned to the BldgType column
house_train.dtypes['BldgType']

In [None]:
# Distinct values that occur in the BldgType column
bldg_type_values = house_train['BldgType'].values
set(bldg_type_values)

In [None]:
# Number of missing entries in the BldgType column
house_train['BldgType'].isna().sum()

# Chi-Square Tests

In [None]:
# Draw a reproducible random sample of (at most) 300 houses.
# A fixed random_state makes every run use the same rows, so the test results
# and plots below can be reproduced. Capping n at the number of rows avoids the
# ValueError that sample() raises when asked for more rows than exist
# (sampling is without replacement by default).
sample = house_train.sample(n=min(300, len(house_train)), random_state=42)

In [None]:
# By default, the null hypothesis of the one-variable chi-square test is
# that all categories are equally likely.

In [None]:
# A chi-square goodness-of-fit test for BldgType.
# chisquare() expects the observed frequency of each category, so the
# categories are counted first; passing the per-row integer codes from
# pd.factorize would treat every row's code as if it were a frequency.
# Note: only categories that actually occur in the sample are counted.
bldg_type_counts = sample['BldgType'].value_counts()
scipy.stats.chisquare(bldg_type_counts)

In [None]:
# If pvalue < 0.05, we reject the null hypothesis

In [None]:
# A chi-square goodness-of-fit test for HeatingQC.
# chisquare() expects the observed frequency of each category, so the
# categories are counted first; passing the per-row integer codes from
# pd.factorize would treat every row's code as if it were a frequency.
# Note: only categories that actually occur in the sample are counted.
heating_qc_counts = sample['HeatingQC'].value_counts()
scipy.stats.chisquare(heating_qc_counts)

In [None]:
# If pvalue > 0.05, we fail to reject the null hypothesis (a test never lets
# us "accept" it); otherwise we reject it

In [None]:
# The null hypothesis for two variables is 
# "the variable HeatingQC and the variable BldgType are independent".

In [None]:
# Contingency table: one row per HeatingQC category, one column per BldgType
# category, each cell holding the number of sampled houses with that pair.
cross_table = pd.crosstab(index=sample['HeatingQC'],
                          columns=sample['BldgType'])
cross_table

In [None]:
# Chi-square test of independence on the contingency table.
# The call yields the test statistic, the p-value, the degrees of freedom
# and the table of expected frequencies (kept in `ex`).
chi2, p, dof, ex = scipy.stats.chi2_contingency(observed=cross_table)
# Display everything except the expected-frequency table.
(chi2, p, dof)

In [None]:
# We can't trust the test results, because the contingency table contains
# many small counts (<5)

# Plots

In [None]:
# Count plot of the BldgType categories in the sample, drawn as hollow bars
# (fully transparent face) with thick outlines coloured from the Set1 palette.
bldg_outline_colors = sns.color_palette("Set1", 7)
plt.figure(figsize=(15, 10))
sns.countplot(
    data=sample,
    x="BldgType",
    facecolor=(0, 0, 0, 0),
    linewidth=7,
    edgecolor=bldg_outline_colors,
)
plt.title('Sample Distribution of "BldgType" Categories', fontsize=20);

In [None]:
# Count plot of the HeatingQC categories in the sample, drawn as hollow bars
# (fully transparent face) with thick outlines coloured from the Set1 palette.
heating_outline_colors = sns.color_palette("Set1", 7)
plt.figure(figsize=(15, 10))
sns.countplot(
    data=sample,
    x="HeatingQC",
    facecolor=(0, 0, 0, 0),
    linewidth=7,
    edgecolor=heating_outline_colors,
)
plt.title('Sample Distribution of "HeatingQC" Categories', fontsize=20);

In [None]:
# Horizontal count plot of BldgType in the sample, with one bar per
# HeatingQC category inside each building type.
plt.figure(figsize=(15, 10))
sns.countplot(data=sample, y="BldgType", hue="HeatingQC", palette='Set1')
# Location code 4 is the lower-right corner of the axes.
plt.legend(loc='lower right')
grouped_title = 'Sample Distribution of "BldgType" Categories Grouped by "HeatingQC"'
plt.title(grouped_title, fontsize=20);