# Simple feature selection - Pandas and Scipy
***

## Imports

In [2]:
import pandas as pd              # pandas for handling mixed data sets 
import numpy as np               # numpy for basic math and matrix operations

# scipy for stats and more advanced calculations
from scipy.stats import chi2_contingency

## Perform simple feature selection

#### Create a sample data set

In [3]:
# Seed NumPy's global random generator so the noise column `x2` -- and every
# statistic derived from it further down -- is identical on each
# Restart & Run All.
np.random.seed(12345)

# Toy data set for the feature-selection demo:
#   x1 - the integers 0..9
#   x2 - standard-normal noise
#   x3 - two-level categorical ('A' for the first five rows, 'B' for the rest)
#   x4 - categorical with a distinct level on every row
#   y  - binary target (0 for the first five rows, 1 for the rest)
scratch_df = pd.DataFrame({'x1': pd.Series(np.arange(0, 10)),
                           'x2': pd.Series(np.random.randn(10)),
                           'x3': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
                           'x4': ['C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'],
                           'y' : [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]})

scratch_df

Unnamed: 0,x1,x2,x3,x4,y
0,0,-0.965284,A,C,0
1,1,0.278069,A,D,0
2,2,0.223738,A,E,0
3,3,0.158793,A,F,0
4,4,-0.43319,A,G,0
5,5,-0.368828,B,H,1
6,6,-0.947958,B,I,1
7,7,-0.873526,B,J,1
8,8,0.820806,B,K,1
9,9,-0.755244,B,L,1


#### Calculate Pearson correlation for numeric variables
The `pandas.DataFrame.corr()` function shows that `x1` is much more strongly correlated with `y` than `x2` is.

In [4]:
# Pearson correlation is only defined for numeric data, so restrict the frame
# to its numeric columns first. Recent pandas releases no longer drop the
# string columns (`x3`, `x4`) silently and would raise on them instead.
scratch_df.select_dtypes(include='number').corr()

Unnamed: 0,x1,x2,y
x1,1.0,-0.089896,0.870388
x2,-0.089896,1.0,-0.234613
y,0.870388,-0.234613,1.0


#### Calculate Chi-Square statistic for categorical variables
* `pandas.crosstab()` creates frequency tables
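* Applying `scipy.stats.chi2_contingency()` to the contingency tables shows that the frequency of values in `x3` is more strongly related to `y` than the frequency of values in `x4` is. Compare the p-values rather than the raw chi-square statistics, because the two tables have different degrees of freedom.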

In [5]:
# Frequency table: levels of x3 (rows) against values of the target y (columns).
pd.crosstab(index=scratch_df['x3'], columns=scratch_df['y'])

y,0,1
x3,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,0
B,0,5


In [6]:
# Chi-square test of independence between x3 and y.
# The x3-by-y table is 2x2 (one degree of freedom), so chi2_contingency
# applies Yates' continuity correction by default.
chi2, p, dof, ex = chi2_contingency(
    pd.crosstab(index=scratch_df['x3'], columns=scratch_df['y'])
)
print('chi2 =', chi2)
print('p-value =', p)

chi2 = 6.4
p-value = 0.011412036386


In [7]:
# Frequency table: levels of x4 (rows) against values of the target y (columns).
pd.crosstab(index=scratch_df['x4'], columns=scratch_df['y'])

y,0,1
x4,Unnamed: 1_level_1,Unnamed: 2_level_1
C,1,0
D,1,0
E,1,0
F,1,0
G,1,0
H,0,1
I,0,1
J,0,1
K,0,1
L,0,1


In [8]:
# Chi-square test of independence between x4 and y.
# x4 has a distinct level on every row, so this table has ten rows and the
# test has more than one degree of freedom (no continuity correction).
chi2, p, dof, ex = chi2_contingency(
    pd.crosstab(index=scratch_df['x4'], columns=scratch_df['y'])
)
print('chi2 =', chi2)
print('p-value =', p)

chi2 = 10.0
p-value = 0.350485212323
