In [1]:
#Imports
import numpy as np
import pandas as pd
pd.options.display.max_rows = 600
pd.options.display.max_columns=21

from IPython.display import display

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()
sns.set_context('paper')
#%matplotlib inline

import random as rand

from pandas.api.types import CategoricalDtype

#from IPython.display import display
#display(df)  # OR
#print df.to_html()

In [2]:
#Datafile to use
#Full: NCDB_1999_to_2016.csv
#Final: NCDB_FINAL_Cleaned.csv
#FullClean: NCDB_FULL_Removed_All_Missing_Values.csv
dataSet = "FullCleanBinary"
if (dataSet == "FullCleanBinary"):
    datafile = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class.csv'
elif (dataSet == "FullCleanMulti"):
    datafile = 'NCDB_FULL_Removed_All_Missing_Values_Multi_Class.csv'
elif(datafile == "FinalCleanedBinary"):
    datafile = 'NCDB_FINAL_CleanedBinary.csv'
elif(datafile == "FinalCleanedMulti"):
    datafile = 'NCDB_FINAL_CleanedMulti.csv'
else:
    datafile = 'NCDB_1999_to_2016.csv'


In [3]:
# read data
df = pd.read_csv(datafile, engine = 'python')

### 1. Look at the Data

In [4]:
# randomly pick 10 rows from the data set to view
rand.seed(101)
df_rows = list(range(1, len(df.index)))
random_10_rows = rand.sample(df_rows, 10)

print(df.loc[random_10_rows])

         C_YEAR  C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  \
2437529    2010       4       3      11       4      21       1       1   
3553069    2016       6       5       9       2      33       2       1   
816860     2002       7       5      19       2      21       2       1   
2261683    2009       4       7      13       3      21       1       1   
1504391    2005       7       7      22       2      21       1       1   
1959322    2007       9       5      19       2      31       2       3   
203512     1999      11       7       7       1       6       2       1   
2782798    2012       1       7       5       2      41       1       1   
2111918    2008       7       2       7       2      21       2       1   
899775     2002      11       5      23       1       6       3       4   

         C_RSUR  C_RALN  C_TRAF  V_ID  V_TYPE  V_YEAR  P_ID  P_SEX  P_AGE  \
2437529       1       1      18     2       1    2003     2      0     24   
3553069       1     

### 2. Dimensions of Your Data

In [5]:
print('Number of Rows: {}'.format(df.shape[0]))
print('Number of Columns: {}'.format(df.shape[1]))
#check for nun numeric values
print("Number of Null values: {}".format(df.isnull().sum().sum()))
print("Number of NaN: {0}".format(df.isna().sum().sum()))
print("Number of Non Numeric: {}".format(df[df.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum()))

Number of Rows: 3655334
Number of Columns: 21
Number of Null values: 0
Number of NaN: 0
Number of Non Numeric: 0


### 3. Data Type For Each Attribute

In [6]:
df = df.astype('category')
df['C_YEAR'] = df['C_YEAR'].astype(CategoricalDtype(ordered=True))
df['C_MNTH'] = df['C_MNTH'].astype(CategoricalDtype(ordered=True))
df['C_WDAY'] = df['C_WDAY'].astype(CategoricalDtype(ordered=True))
df['C_HOUR'] = df['C_HOUR'].astype(CategoricalDtype(ordered=True))
df['V_YEAR'] = df['V_YEAR'].astype(CategoricalDtype(ordered=True))
df['P_AGE'] = df['P_AGE'].astype('int')
#print(df.dtypes)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3655334 entries, 0 to 3655333
Data columns (total 21 columns):
C_YEAR    category
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_ID      category
V_TYPE    category
V_YEAR    category
P_ID      category
P_SEX     category
P_AGE     int32
P_PSN     category
P_SAFE    category
P_USER    category
P_ISEV    category
dtypes: category(20), int32(1)
memory usage: 83.7 MB
None


### 4. Descriptive Statistics

In [7]:
pd.set_option('display.width', 100)
pd.set_option('precision', 3)
print('Descriptive Summary of Categorical Variable(s)')
print(df.describe(exclude=[int]))
print()
print('Descriptive Summary of Numeric Variable(s)')
print(df.describe())

Descriptive Summary of Categorical Variable(s)
         C_YEAR   C_MNTH   C_WDAY   C_HOUR   C_VEHS   C_CONF   C_RCFG   C_WTHR   C_RSUR   C_RALN  \
count   3655334  3655334  3655334  3655334  3655334  3655334  3655334  3655334  3655334  3655334   
unique       18       12        7       24       47       18       10        7        9        6   
top        2002        8        5       16        2       21        2        1        1        1   
freq     237313   341307   622867   330358  2333009  1271029  1932020  2611663  2492364  2851829   

         C_TRAF     V_ID   V_TYPE   V_YEAR     P_ID    P_SEX    P_PSN   P_SAFE   P_USER   P_ISEV  
count   3655334  3655334  3655334  3655334  3655334  3655334  3655334  3655334  3655334  3655334  
unique       17       77       13      111       93        2       12        6        4        2  
top          18        1        1     2000        1        1       11        2        1        2  
freq    1965540  1990947  3302981   214334  2468055  196

### 5. Class Distribution

In [8]:
print("Class Distribution")
print(df.groupby('P_ISEV').size())

Class Distribution
P_ISEV
1    1570775
2    2084559
dtype: int64


### 6. Correlation Between Attributes

In [None]:
print('Spearman rank correlation')
corr_col = df.columns[0:len(df.columns)-1]
print(df[corr_col].astype('int').corr(method='spearman'))

Spearman rank correlation


In [None]:
# we are really only interested in the ordinal independent variable
corr_columns = ['C_YEAR','C_MNTH','C_WDAY','C_HOUR','V_YEAR','P_AGE']
print(df[corr_columns].astype('int').corr(method='spearman'))

### 7. Skewness of independent Variables

In [None]:
print(df[corr_col].skew())

In [None]:
print(df['C_YEAR'].value_counts())
t1 = df['C_YEAR'].value_counts().sort_index()
print(t1.index[0])
print(t1.iloc[0])

In [None]:
df_levels = pd.DataFrame(columns=['Column', 'Level', 'Count'])
df_var = df.columns
for col in df_var:
    c1 = list()
    c2 = list()
    c3 = list()
    tmp = df[col].value_counts().sort_index()
    for i in list(range(0,df[col].nunique())):
        c1.append(col)
        c2.append(tmp.index[i])
        c3.append(tmp.iloc[i])
    
    df_t2 = pd.DataFrame()
    df_t2['Column'] = c1
    df_t2['Level'] = c2
    df_t2['Count'] = c3
    df_levels = pd.concat([df_levels, df_t2])
    

df_levels['Count'] = df_levels['Count'].astype('int')
#display(df_levels)
display(pd.pivot_table(df_levels,index=['Column', 'Level']))


#>>> data = pd.DataFrame({"A": range(3)})
#>>> df.append(data)

### Chi Square Test - Independent vs Dependent

In [None]:
# modified code from https://machinelearningmastery.com/chi-squared-test-for-machine-learning/

# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2

cat_1 = df['P_ISEV']

df_chi_dep_vs_indep = pd.DataFrame(columns=[df.columns[0: len(df.columns) -1]], index = ['P_ISEV'])
display(df_chi_dep_vs_indep)

for col in df.columns[0: len(df.columns) -1]:
    print(col)
    
    # contingency table
    table = np.array([df[col], cat_1])
    print(table)
    #table_array = np.array(table_series, dtype=pd.Series)
    #print(type(table))

    stat, p, dof, expected = chi2_contingency(table)
    print('dof=%d' % dof)
    print(expected)

    # interpret test-statistic
    prob = 0.95
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
    if abs(stat) >= critical:
        print('Dependent (reject H0)')
        df_chi_dep_vs_indep.loc['P_ISEV',col] = 1  
    else:
        print('Independent (fail to reject H0)')
        df_chi_dep_vs_indep.loc['P_ISEV',col] = 0

    # interpret p-value
    #alpha = 1.0 - prob
    #print('significance=%.3f, p=%.3f' % (alpha, p))
    #if p <= alpha:
    #    print('Dependent (reject H0)')
    #else:
    #    print('Independent (fail to reject H0)')
display(df_chi_dep_vs_indep)

### Chi Square Test - compare correlation between independent variables

In [None]:
# modified code from https://machinelearningmastery.com/chi-squared-test-for-machine-learning/

# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2

df_chi = pd.DataFrame(columns=[df.columns[0: len(df.columns) -1]], index = [df.columns[0: len(df.columns) -1]])

i=0
for c1 in df.columns[0: len(df.columns) -1]:
    i = i + 1;
    for c2 in df.columns[i: len(df.columns) -1]:
        print('c1: {0}, c2: {1}'.format(c1, c2))
        # contingency table
        table = np.array([df[col], cat_1])

        
        stat, p, dof, expected = chi2_contingency(table)
        print('dof=%d' % dof)
        print(expected)

        # interpret test-statistic
        prob = 0.95
        critical = chi2.ppf(prob, dof)
        print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
        if abs(stat) >= critical:
            df_chi.loc[c1,c2] = 1
            print('Dependent (reject H0)')
        else:
            print('Independent (fail to reject H0)')
            df_chi.loc[c1,c2] = 0

display(df_chi)
            


## 8. Data Visualization

### Histogram

In [None]:
for col in df.columns:
    g = sns.catplot(col, data=df, kind="count", legend = True, height = 5, aspect = 4)
    g.set_xticklabels(step=1)
    
    h = sns.catplot(col, data=df, kind="count", legend = True, height = 5, aspect = 4, hue = 'P_ISEV')
    h.set_xticklabels(step=1)

### Box Plot

In [None]:
df['P_ISEV'].dtypes

In [None]:

sns.boxplot("P_ISEV", "C_WDAY", data=df, orient="v")
#h = sns.catplot(x=col, data=df, kind="box", orient="v", hue = "P_ISEV", legend = True, height = 10, aspect = 0.2)
#h.set_xticklabels(step=1)

In [None]:
g = sns.catplot(data=df, kind="box", orient="v", height = 10, aspect = 2)
g.set_xticklabels(step=1)

plt.figure(figsize=(5,10))
for col in df.columns:
    g = sns.catplot(col, data=df, kind="box", orient="v", height = 10, aspect = 0.25)
    g.set_xticklabels(step=1)
    
    h = sns.catplot(x = 'P_ISEV', y = col, data=df, kind="box", orient="v", height = 15, aspect = 0.6, legend = True)
    h.set_xticklabels(step=1)

### Heat Map and Corr Plot

In [None]:
corr_col = df.columns[0:len(df.columns)-15]
sns.heatmap(df[corr_col].astype('int').corr(method='spearman'),cmap='coolwarm',annot=True, height = 20, linecolor='white',linewidths=1)

In [None]:
#sns.pairplot(df,hue='P_ISEV',palette='coolwarm', height = 20, linecolor='white',linewidths=1)

In [None]:
k = sns.FacetGrid(df, col="C_WDAY",  row="P_ISEV")
k = k.map(plt.hist)