# Statistics Lab2

## Imports

In [342]:
import numpy as np
import pandas as pd
from sklearn import datasets
from scipy.stats import chi2_contingency, chi2
import math

## Data Loading

In [343]:
def load_iris():
    """
    Fetches the Iris dataset bundled with scikit-learn
    
    Returns:
    the object returned by sklearn's datasets.load_iris(), unchanged
    """
    # `datasets` is the sklearn.datasets module imported at the top of the file
    return datasets.load_iris()

## Data Representation

In [344]:
def represent_data(iris):
    """
    Represents the data in a tabular form with column names as specified
    
    Parameters:
    iris: the Iris dataset in raw format. Must expose `data` (2-D feature
          values, one column per feature), `target` (integer class codes)
          and `target_names` (class names indexed by those codes)
    
    Returns:
    data (pandas dataframe): the new form of Iris as customized table; the
          four feature columns followed by a categorical 'species' column
    """
    #The column names of the new table (features first, target label last)
    col_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'species']
    feature_names, target_name = col_names[:-1], col_names[-1]
    
    #Constructing the new table from the single list of names above
    data = pd.DataFrame(iris.data, columns=feature_names)
    #Map the integer class codes to their readable names
    data[target_name] = pd.Categorical.from_codes(iris.target, iris.target_names)
    
    #`display` only exists inside IPython/Jupyter; fall back to print elsewhere
    try:
        show = display
    except NameError:
        show = print
    
    print('Samples from the Data:')
    show(data.head())
    return data

## Contingency Table

In [345]:
def calculate_chi2_value(freq_table, expected_table):
    """
    Computes the chi square statistic: the sum over all cells of
    (observed - expected)^2 / expected
    
    Parameters:
    freq_table (pandas dataframe): The table of observed count of each combination
    expected_table (pandas dataframe): The table of expected value for each cell 
    
    Returns:
    chi2_value (float): The value resulted from the summation of chi square independence test
    """
    #Per-cell deviation of the observed counts from the expected ones
    residuals = freq_table - expected_table
    #Per-cell contribution to the statistic
    cell_terms = residuals ** 2 / expected_table
    #Reduce twice: the first sum collapses one axis, the second gives the scalar
    return cell_terms.sum().sum()

In [346]:
def calculate_p_value(chi2_value, df):
    """
    Gives the upper-tail probability of a chi square distribution,
    i.e. p-value = P{X > chi2_value} for X following chi square with df degrees of freedom
    
    Parameters:
    chi2_value (float): The value resulted from the summation of chi square independence test
    df (int): The degrees of freedom of the resulted distribution
    
    Returns:
    p_value (float): The probability of the resulted chi2_value
    """
    #sf is the survival function (1 - cdf) of scipy's chi2 distribution
    return chi2.sf(chi2_value, df)

In [347]:
def tabulate(data, label1, label2):
    """
    Constructs the contingency table of the dataset with respect to the given labels
    and runs the chi square independence test on it
    
    Parameters:
    data (pandas dataframe): Raw data to extract the two columns from
    label1 (string): The name of the 1st column (rows of the contingency table)
    label2 (string): The name of the 2nd column (columns of the contingency table)
    
    Returns:
    freq_table (pandas dataframe): The table of observed count of each combination
    chi2_value (float): The summed value from the independence test
    p_value (float): The probability of the resulted chi2_value
    df (int): The degrees of freedom of the resulted distribution
    expected_table (numpy array): The corresponding table of the observed table
    """
    freq_table = pd.crosstab(index=data[label1], columns=data[label2])
    
    rows, cols = freq_table.shape
    df = (rows - 1) * (cols - 1)
    
    #Compute the margins once instead of re-summing a row and a column per cell
    row_totals = freq_table.sum(axis=1).to_numpy()
    col_totals = freq_table.sum(axis=0).to_numpy()
    grand_total = row_totals.sum()
    
    #expected[i][j] = (row i total * column j total) / grand total
    expected_table = np.outer(row_totals, col_totals) / grand_total
    
    chi2_value = calculate_chi2_value(freq_table, expected_table)
    
    p_value = calculate_p_value(chi2_value, df)
    
    #`display` only exists inside IPython/Jupyter; fall back to print elsewhere
    try:
        show = display
    except NameError:
        show = print
    
    print('The observed tabel:')
    show(freq_table)
    
    return freq_table, chi2_value, p_value, df, expected_table

## Work Assertion

In [348]:
def assert_results(freq_table, chi2_value, p_value, df, expected_table):
    """
    Checks if the above functions are working well according to the built-in function results
    
    Parameters:
    freq_table (pandas dataframe): The table of observed count of each combination
    chi2_value (float): The summed value from the independence test
    p_value (float): The probability of the resulted chi2_value
    df (int): The degrees of freedom of the resulted distribution
    expected_table: The corresponding table of the observed table
    
    Returns:
    good (boolean): true if and only if the 4 tests passed.
    """
    #Absolute tolerance shared by all floating point comparisons
    tol = 1e-10
    
    #correction=False: the reference must be the plain Pearson statistic.
    #The default applies Yates' continuity correction when dof == 1, which
    #would make a correct hand-computed result for a 2x2 table look wrong.
    stat, p, dof, expected = chi2_contingency(freq_table, correction=False)
    
    #Check the expected values (tolerant compare; exact float equality is fragile)
    ours = np.asarray(expected_table, dtype=float)
    if ours.shape != expected.shape:
        return False
    expected_ok = np.allclose(ours, expected, rtol=0.0, atol=tol)
    
    #Check the statistic value; written as `<=` so that a NaN fails the check
    stat_ok = abs(chi2_value - stat) <= tol
    
    #Check the degrees of freedom
    dof_ok = df == dof
    
    #Check the p-value; written as `<=` so that a NaN fails the check
    p_ok = abs(p_value - p) <= tol
    
    return bool(expected_ok and stat_ok and dof_ok and p_ok)

## Driver Code

In [349]:
#Load the raw dataset and put it in tabular form
iris = load_iris()
data = represent_data(iris)

#Level of significance the p-value is compared against
alpha = 0.1

col_names = data.columns
n = len(col_names)
#The last column is the target label, so only the first n-1 columns are tested
for j in range(n - 1):
    feature = col_names[j]
    print('Working on the target label(species) vs. ', feature, sep='')
    
    #Run the independence test of the target label against this feature
    freq_table, chi2_value, p_value, df, expected_table = tabulate(data, 'species', feature)
    print('\t-chi square sum: ', chi2_value, sep='')
    print('\t-p-value: ', p_value, sep='')
    print('\t-degrees of freedom: ', df, sep='')
    
    #Compare the hand-computed results with scipy's reference values
    good = assert_results(freq_table, chi2_value, p_value, df, expected_table)
    print('\t-Are these results correct? --- ', good, sep='')
    
    #Report the verdict at significance level alpha
    if p_value < alpha:
        verdict = '\t-The target label is dependent on the feature: '
    else:
        verdict = '\t-The target label: is independent of the label: '
    print(verdict + feature)
    print('\n')

Samples from the Data:


Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


Working on the target label(species) vs. sepal length
The observed tabel:


sepal length,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5.0,5.1,5.2,...,6.8,6.9,7.0,7.1,7.2,7.3,7.4,7.6,7.7,7.9
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
setosa,1,3,1,4,2,5,4,8,8,3,...,0,0,0,0,0,0,0,0,0,0
versicolor,0,0,0,0,0,0,1,2,1,1,...,1,1,1,0,0,0,0,0,0,0
virginica,0,0,0,0,0,0,1,0,0,0,...,2,3,0,1,3,1,1,1,4,1


	-chi square sum: 156.26666666666665
	-p-value: 6.665987344005566e-09
	-degrees of freedom: 68
	-Are these results correct? --- True
	-The target label is dependent on the feature: sepal length


Working on the target label(species) vs. sepal width
The observed tabel:


sepal width,2.0,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,3.0,...,3.4,3.5,3.6,3.7,3.8,3.9,4.0,4.1,4.2,4.4
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
setosa,0,0,1,0,0,0,0,0,1,6,...,9,6,3,3,4,2,1,1,1,1
versicolor,1,2,3,3,4,3,5,6,7,8,...,1,0,0,0,0,0,0,0,0,0
virginica,0,1,0,0,4,2,4,8,2,12,...,2,0,1,0,2,0,0,0,0,0


	-chi square sum: 89.54628704628705
	-p-value: 6.0160314822070716e-05
	-degrees of freedom: 44
	-Are these results correct? --- True
	-The target label is dependent on the feature: sepal width


Working on the target label(species) vs. petal length
The observed tabel:


petal length,1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.9,3.0,...,5.7,5.8,5.9,6.0,6.1,6.3,6.4,6.6,6.7,6.9
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
setosa,1,1,2,7,13,13,7,4,2,0,...,0,0,0,0,0,0,0,0,0,0
versicolor,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
virginica,0,0,0,0,0,0,0,0,0,0,...,3,3,2,2,3,1,1,1,2,1


	-chi square sum: 271.8
	-p-value: 1.1775672327529825e-21
	-degrees of freedom: 84
	-Are these results correct? --- True
	-The target label is dependent on the feature: petal length


Working on the target label(species) vs. petal width
The observed tabel:


petal width,0.1,0.2,0.3,0.4,0.5,0.6,1.0,1.1,1.2,1.3,...,1.6,1.7,1.8,1.9,2.0,2.1,2.2,2.3,2.4,2.5
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
setosa,5,29,7,7,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
versicolor,0,0,0,0,0,0,7,3,5,13,...,3,1,1,0,0,0,0,0,0,0
virginica,0,0,0,0,0,0,0,0,0,0,...,1,1,11,5,6,6,3,8,3,3


	-chi square sum: 271.75
	-p-value: 2.1648103108815957e-35
	-degrees of freedom: 42
	-Are these results correct? --- True
	-The target label is dependent on the feature: petal width


