In [44]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

# Load the bundled iris dataset and pull out the feature matrix (X)
# and the class labels (y) in one step.
iris = load_iris()
X, y = iris.data, iris.target

In [45]:
# Only the last expression of a cell is echoed, so a bare `iris.feature_names`
# line would be evaluated and thrown away. Print it explicitly; the class names
# remain the cell's displayed result.
print(iris.feature_names)
iris.target_names

array(['setosa', 'versicolor', 'virginica'], 
      dtype='|S10')

In [46]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the iris
# feature names, then preview the first five rows.
df = pd.DataFrame(data=X, columns=iris.feature_names)
df.head(n=5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [47]:
# Append three pure-noise columns that are unrelated to the class label, to
# check that the feature-selection techniques below discard them.
#
# A dedicated, seeded generator keeps the noise (and every number derived from
# it further down) identical on each Restart & Run All; the row count is taken
# from the frame instead of being hard-coded.
rng = np.random.RandomState(0)
n_rows = len(df)

df["badfeature1"] = 5 * rng.randn(n_rows) + 3      # normal noise, scaled by 5 and shifted by +3
df["badfeature2"] = rng.rand(n_rows)               # uniform noise on [0, 1)
df["badfeature3"] = rng.randn(n_rows) * -1 - 0.5   # normal noise, negated and shifted by -0.5

In [48]:
# Preview the frame now that the three noise columns have been appended.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),badfeature1,badfeature2,badfeature3
0,5.1,3.5,1.4,0.2,6.932508,0.972141,-1.479758
1,4.9,3.0,1.4,0.2,4.47964,0.354399,0.419466
2,4.7,3.2,1.3,0.2,1.65647,0.970451,-1.23969
3,4.6,3.1,1.5,0.2,-8.371294,0.513462,-1.864537
4,5.0,3.6,1.4,0.2,8.268674,0.38397,-1.113499


In [50]:
# Attach the class labels as the last column so the correlation tables below
# also report how each feature relates to the target.
df["class"] = y

In [53]:
# Preview the first five rows, now including the `class` column.
df.iloc[:5]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),badfeature1,badfeature2,badfeature3,class
0,5.1,3.5,1.4,0.2,6.932508,0.972141,-1.479758,0
1,4.9,3.0,1.4,0.2,4.47964,0.354399,0.419466,0
2,4.7,3.2,1.3,0.2,1.65647,0.970451,-1.23969,0
3,4.6,3.1,1.5,0.2,-8.371294,0.513462,-1.864537,0
4,5.0,3.6,1.4,0.2,8.268674,0.38397,-1.113499,0


## Using the built-in correlation function of a pandas DataFrame

In [57]:
# Pairwise Pearson correlation between every pair of columns (features and class).
df.corr(method="pearson")

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),badfeature1,badfeature2,badfeature3,class
sepal length (cm),1.0,-0.109369,0.871754,0.817954,-0.134881,-0.020976,-0.080945,0.782561
sepal width (cm),-0.109369,1.0,-0.420516,-0.356544,-0.013059,0.001926,-0.065253,-0.419446
petal length (cm),0.871754,-0.420516,1.0,0.962757,-0.08535,-0.070662,-0.043296,0.949043
petal width (cm),0.817954,-0.356544,0.962757,1.0,-0.099245,-0.064935,-0.059599,0.956464
badfeature1,-0.134881,-0.013059,-0.08535,-0.099245,1.0,0.007628,0.183636,-0.04433
badfeature2,-0.020976,0.001926,-0.070662,-0.064935,0.007628,1.0,0.066909,-0.0633
badfeature3,-0.080945,-0.065253,-0.043296,-0.059599,0.183636,0.066909,1.0,-0.036653
class,0.782561,-0.419446,0.949043,0.956464,-0.04433,-0.0633,-0.036653,1.0


In [59]:
# Same pairwise table, but using Kendall's rank correlation instead of the
# default method, for comparison with the table above.
df.corr(method="kendall")

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),badfeature1,badfeature2,badfeature3,class
sepal length (cm),1.0,-0.072112,0.717624,0.65496,-0.113236,-0.01238,-0.058803,0.670444
sepal width (cm),-0.072112,1.0,-0.182391,-0.146988,-0.023785,0.004646,-0.057977,-0.333435
petal length (cm),0.717624,-0.182391,1.0,0.803014,-0.055947,-0.051217,-0.033022,0.822949
petal width (cm),0.65496,-0.146988,0.803014,1.0,-0.067395,-0.039732,-0.03936,0.838757
badfeature1,-0.113236,-0.023785,-0.055947,-0.067395,1.0,-0.004027,0.10604,-0.029929
badfeature2,-0.01238,0.004646,-0.051217,-0.039732,-0.004027,1.0,0.058076,-0.049591
badfeature3,-0.058803,-0.057977,-0.033022,-0.03936,0.10604,0.058076,1.0,-0.028182
class,0.670444,-0.333435,0.822949,0.838757,-0.029929,-0.049591,-0.028182,1.0


## Using SelectKBest and Chi2 from scikit-learn

In [35]:
from sklearn.feature_selection import SelectKBest, chi2

In [62]:
# Every column except the last one (`class`) is a candidate feature;
# the `class` column is the target.
X = df.loc[:, df.columns[:-1]]
y = df["class"]

# Keep the 3 highest-scoring features. No score_func is passed, so SelectKBest
# uses its default scorer rather than chi2.
skb = SelectKBest(k=3).fit(X, y)

# Boolean mask with one entry per column of X: True marks a selected feature.
skb.get_support()

array([ True, False,  True,  True, False, False, False], dtype=bool)

In [63]:
# The chi2 statistical test does not work on features with negative values.
# X contains negative entries (badfeature1 / badfeature3), so this call raises.
# The error is the point of the demonstration, so catch it and show the message
# instead of letting it abort a "Run All".
try:
    chi2(X, y)
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: Input X must be non-negative.

In [64]:
# Restrict X to columns 0-3 and 5 (column 4 and column 6 are left out) so that
# chi2 can be evaluated on this subset.
Xpos = X[df.columns[[0, 1, 2, 3, 5]].tolist()]

# Returns a pair of arrays: (chi2 statistics, p-values), one entry per column.
chi2(Xpos, y)

(array([  10.81782088,    3.59449902,  116.16984746,   67.24482759,
           0.15394315]),
 array([  4.47651499e-03,   1.65754167e-01,   5.94344354e-26,
          2.50017968e-15,   9.25916173e-01]))

## Using Regularization (Model Building)

In [65]:
from sklearn.linear_model import LogisticRegression, LassoCV


In [74]:
# Cross-validated Lasso (L1-regularised linear model) with the intercept term
# disabled.
# NOTE(review): `y` holds integer class codes, so this treats the labels as a
# numeric regression target — confirm that is intended.
lasso = LassoCV(fit_intercept=False)

In [75]:
# Sanity-check both model inputs. Only the last expression of a cell is echoed,
# so a bare `X.head()` would be evaluated and discarded; print it explicitly
# and leave the target preview as the cell's displayed result.
print(X.head())
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

In [76]:
# Fit the Lasso model and show the learned weights, one per column of X.
# `fit` returns the estimator itself, so the coefficients can be read off
# directly from the call.
lasso.fit(X, y).coef_

array([-0.09178327, -0.02396113,  0.30825078,  0.37297614,  0.00440578,
       -0.        ,  0.        ])

In [73]:
# Show a short preview of the feature matrix rather than echoing the whole
# frame, to line the columns up with the coefficient vector above.
X.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),badfeature1,badfeature2,badfeature3
0,5.1,3.5,1.4,0.2,6.932508,0.972141,-1.479758
1,4.9,3.0,1.4,0.2,4.479640,0.354399,0.419466
2,4.7,3.2,1.3,0.2,1.656470,0.970451,-1.239690
3,4.6,3.1,1.5,0.2,-8.371294,0.513462,-1.864537
4,5.0,3.6,1.4,0.2,8.268674,0.383970,-1.113499
5,5.4,3.9,1.7,0.4,-2.119431,0.888399,-0.411573
6,4.6,3.4,1.4,0.3,2.275794,0.810779,-1.021834
7,5.0,3.4,1.5,0.2,-0.371692,0.338248,-0.649358
8,4.4,2.9,1.4,0.2,-1.411964,0.165891,-0.318262
9,4.9,3.1,1.5,0.1,4.170493,0.189293,1.033530
