In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from tabulate import tabulate

In [3]:
# Load the sample CSV; index_col=0 uses the first CSV column as the row index
# instead of keeping it as a regular data column.
data = pd.read_csv('non_empty_sample.csv', index_col=0)
# Show the first rows as a quick check of what was loaded.
data.head()

Unnamed: 0,ID,CNTYFIPS,Ori,State,Agency,Agentype,Source,Solved,Year,StateName,...,OffSex,OffRace,Weapon,Relationship,Circumstance,VicCount,OffCount,FileDate,fstate,MSA
0,197601001AKASP00,"Juneau, AK",AKASP00,Alaska,State Troopers,Primary state LE,FBI,Yes,1976,ALASKA,...,Female,American Indian or Alaskan Native,Knife or cutting instrument,Husband,Other arguments,0,0,30180.0,Alaska,Rural Alaska
1,197601001AL00102,"Jefferson, AL",AL00102,Alabama,Birmingham,Municipal police,FBI,Yes,1976,ALA,...,Male,Black,Shotgun,Acquaintance,Felon killed by private citizen,0,0,30180.0,Alabama,"Birmingham-Hoover, AL"
2,197601001AL00104,"Jefferson, AL",AL00104,Alabama,Fairfield,Municipal police,FBI,Yes,1976,ALA,...,Male,Black,Shotgun,Wife,Other,0,0,30180.0,Alabama,"Birmingham-Hoover, AL"
3,197601001AL00106,"Jefferson, AL",AL00106,Alabama,Leeds,Municipal police,FBI,Yes,1976,ALA,...,Female,Black,Knife or cutting instrument,Brother,Other arguments,0,0,30180.0,Alabama,"Birmingham-Hoover, AL"
5,197601001AL00202,"Mobile, AL",AL00202,Alabama,Prichard,Municipal police,FBI,Yes,1976,ALA,...,Male,Black,Rifle,Friend,Other arguments,0,2,30180.0,Alabama,"Mobile, AL"


In [4]:
# List the available columns before picking features to test.
print(data.columns)

# Contingency table of victim race (rows) against offender sex (columns).
crosstab = pd.crosstab(data['VicRace'], data['OffSex'])
print(crosstab)

# Element [1] of the chi2_contingency result is the p-value. It used to be a
# bare expression in the middle of the cell, so it was neither displayed nor
# stored; keep it in a variable and print it explicitly.
vicrace_offsex_p_value = chi2_contingency(crosstab)[1]
print(f'Chi-squared p-value (VicRace vs. OffSex): {vicrace_offsex_p_value}')

# Work on a copy without the ID column (presumably a per-record identifier --
# verify) so that it is not included in the pairwise tests further down.
data_mod = data.drop(columns=['ID'])

Index(['ID', 'CNTYFIPS', 'Ori', 'State', 'Agency', 'Agentype', 'Source',
       'Solved', 'Year', 'StateName', 'Month', 'Incident', 'ActionType',
       'Homicide', 'Situation', 'VicAge', 'VicSex', 'VicRace', 'OffAge',
       'OffSex', 'OffRace', 'Weapon', 'Relationship', 'Circumstance',
       'VicCount', 'OffCount', 'FileDate', 'fstate', 'MSA'],
      dtype='object')
OffSex                             Female    Male
VicRace                                          
American Indian or Alaskan Native     472    3228
Asian or Pacific Islander             508    5858
Black                               26918  163684
White                               21725  203493


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from typing import List

def p_of_chi_squared(feature_1: pd.Series, feature_2: pd.Series) -> float:
    """Return the p-value of a chi-squared independence test for two features.

    The two series are cross-tabulated into a contingency table of observed
    counts, which is then handed to ``scipy.stats.chi2_contingency``.

    Args:
        feature_1 (pd.Series): Observations of the first feature.
        feature_2 (pd.Series): Observations of the second feature.

    Returns:
        float: p-value reported by the chi-squared test.
    """
    observed_counts = pd.crosstab(feature_1, feature_2)
    test_result = stats.chi2_contingency(observed_counts)
    # Position 1 of the result holds the p-value (position 0 is the statistic).
    return test_result[1]

def find_dependent_chi(data: pd.DataFrame, significance_level: float) -> pd.DataFrame:
    """Run the chi-squared test of independence on every pair of columns and keep the significant pairs.

    Each unordered pair of columns is tested exactly once, with the column
    that appears first in ``data`` listed as the first feature.

    Args:
        data (pd.DataFrame): DataFrame that only contains feature variables
        significance_level (float): threshold for the p-value, must lie in [0,1]

    Returns:
        pd.DataFrame: DataFrame with 3 columns: 'Feature 1' and 'Feature 2' are the names of the
        column pair, 'p-value' is the result of the chi-squared test for that pair.
        Pairs whose p-value is greater than significance_level are removed; the remaining rows
        keep their original row labels and are sorted by ascending p-value.
    """
    assert 0 <= significance_level <= 1, 'Invalid significance level, must be in range [0,1]'

    column_names = data.columns
    n_columns = len(column_names)

    # Positional index pairs (left, right) with left < right, in row-major order.
    index_pairs = [
        (left, right)
        for left in range(n_columns)
        for right in range(left + 1, n_columns)
    ]

    results = pd.DataFrame({
        'Feature 1': [column_names[left] for left, _ in index_pairs],
        'Feature 2': [column_names[right] for _, right in index_pairs],
        'p-value': [
            p_of_chi_squared(data.iloc[:, left], data.iloc[:, right])
            for left, right in index_pairs
        ],
    })

    # Remove every pair whose p-value is above the threshold.
    rows_above_threshold = results[results['p-value'] > significance_level].index
    significant_pairs = results.drop(rows_above_threshold)
    return significant_pairs.sort_values(by='p-value')

# Faster trial run on the first 100 rows only (disabled):
# dep = find_dependent_chi(data=data_mod.iloc[:100], significance_level=0.05)

# Test every column pair of the full frame at the 5% level and print the
# surviving pairs as a psql-style text table.
dep = find_dependent_chi(data=data_mod, significance_level=0.05)
print(tabulate(tabular_data=dep, headers='keys', tablefmt='psql'))

+-----+--------------+--------------+--------------+
|     | Feature 1    | Feature 2    |      p-value |
|-----+--------------+--------------+--------------|
|   0 | CNTYFIPS     | Ori          | 0            |
| 272 | Homicide     | MSA          | 0            |
| 271 | Homicide     | fstate       | 0            |
| 270 | Homicide     | FileDate     | 0            |
| 267 | Homicide     | Circumstance | 0            |
| 266 | Homicide     | Relationship | 0            |
| 265 | Homicide     | Weapon       | 0            |
| 262 | Homicide     | OffAge       | 0            |
| 259 | Homicide     | VicAge       | 0            |
| 257 | ActionType   | MSA          | 0            |
| 256 | ActionType   | fstate       | 0            |
| 255 | ActionType   | FileDate     | 0            |
| 254 | ActionType   | OffCount     | 0            |
| 253 | ActionType   | VicCount     | 0            |
| 252 | ActionType   | Circumstance | 0            |
| 251 | ActionType   | Relationship | 0       