In [3]:
# With standardized inputs, the coefficients of a logistic regression can be read
# as a rough measure of feature importance. We demonstrate this on the Iris dataset.

# Dependencies for the Iris walkthrough.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the dataset object; later cells read its data, target and name attributes.
iris = load_iris()

In [4]:
# Feature matrix and class labels, unpacked in one step.
X, y = iris.data, iris.target

# Show the feature names (column order of X).
iris.feature_names
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
# Fit a logistic regression with default settings on the raw (unscaled) features.
# `fit` returns the estimator itself, so the trailing `model` expression displays
# the same repr that a bare `model.fit(X, y)` would.

model = LogisticRegression().fit(X, y)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# The fitted LogisticRegression exposes its learned weights through the `coef_`
# attribute. Evaluating it as the last expression of the cell displays the raw array.

model.coef_

array([[ 0.41498833,  1.46129739, -2.26214118, -1.0290951 ],
       [ 0.41663969, -1.60083319,  0.57765763, -1.38553843],
       [-1.70752515, -1.53426834,  2.47097168,  2.55538211]])

In [7]:
# From the scikit-learn documentation:
#   coef_ : array, shape (n_classes, n_features)
#       Coefficient of the features in the decision function.
#
# Wrap the array in a DataFrame so that rows are labelled by class name and
# columns by feature name.

coeffs = pd.DataFrame(
    data=model.coef_,
    index=iris.target_names,
    columns=iris.feature_names,
)
coeffs

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
setosa,0.414988,1.461297,-2.262141,-1.029095
versicolor,0.41664,-1.600833,0.577658,-1.385538
virginica,-1.707525,-1.534268,2.470972,2.555382


Check: Can we conclude that `petal length (cm)` is the most significant feature for identifying setosa?

<details> <summary>Answer?</summary>
Answer: No! Since we have not normalized the data, the magnitude of the coefficients does not necessarily reflect their importance.
</details>

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature (StandardScaler: zero mean, unit variance) so that
# coefficient magnitudes become comparable across features.
X_norm = StandardScaler().fit_transform(X)

# Refit the same estimator on the scaled features; this overwrites the
# coefficients learned from the unscaled data.
model.fit(X_norm, y)

# Same labelled view of the coefficients as before, now for the scaled fit.
coeffs = pd.DataFrame(
    data=model.coef_,
    index=iris.target_names,
    columns=iris.feature_names,
)
coeffs

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
setosa,-0.810166,1.393699,-1.687386,-1.518991
versicolor,0.13038,-1.246338,0.789195,-0.88944
virginica,0.01299,-0.144535,1.863173,2.698873


Notice that normalization changed both the sign and the magnitude of some of the logistic regression coefficients. Also notice that the `LogisticRegression` class has a `penalty` parameter that lets us choose between l1 and l2 regularization; note, however, that some of the solvers only support l2 regularization.



In [9]:
# Display the estimator's repr to inspect its current settings (e.g. `penalty` and `solver`).
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Read the training data from `train.csv` (path is relative to the working directory).
# NOTE(review): the columns shown below (Survived, Pclass, Embarked, ...) suggest the
# Kaggle Titanic training set; confirm the source.
df = pd.read_csv('train.csv')

In [11]:
# Count missing values per column (`isna` is the alias of `isnull`).
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
# Remove the columns that are not used as features, in a single in-place call.
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

In [13]:
# Impute missing ages with the median age.
# The result is assigned back to the column rather than calling
# `df.Age.fillna(..., inplace=True)`: that form mutates an intermediate Series
# (chained assignment), which pandas warns about and which leaves `df` unchanged
# when copy-on-write is enabled, so the NaNs would survive into the model cells.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [14]:
# One-hot encode Embarked, append the dummy columns, then drop the 'S' dummy
# (used as the baseline level) together with the original Embarked column.
# Rows with a missing Embarked value get 0 in every dummy column, because
# get_dummies does not create a NaN indicator by default.
# Note: get_dummies(..., drop_first=True) would drop the first level ('C'), not 'S',
# so the baseline is chosen explicitly here.
df = (
    pd.concat([df, pd.get_dummies(df.Embarked)], axis=1)
    .drop(columns=['S', 'Embarked'])
)

In [15]:
# Replace Sex with a boolean 'Male' column.
# The test is "does not contain 'female'" rather than "contains 'male'",
# because 'male' is itself a substring of 'female'.
df['Male'] = [
    'female' not in str(sex)
    for sex in df.Sex
]

# The original Sex column is no longer needed.
df.drop(columns=['Sex'], inplace=True)

# We haven't used a true boolean feature yet. Let's see what happens!


In [16]:
# Preview the first rows of the data after cleaning and encoding.
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,Male
0,0,3,22.0,1,0,7.25,0,0,True
1,1,1,38.0,1,0,71.2833,1,0,False
2,1,3,26.0,0,0,7.925,0,0,False
3,1,1,35.0,1,0,53.1,0,0,False
4,0,3,35.0,0,0,8.05,0,0,True


### Feature selection

In [18]:
# Preview the feature columns (everything except the target).
# This is built from `df` directly: `X` is only rebound to the Titanic features in
# the following cell, so on a fresh top-to-bottom run `X` would still be the Iris
# feature array from earlier, which has no `.head()`.
df.drop(columns=['Survived']).head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,C,Q,Male
0,3,22.0,1,0,7.25,0,0,True
1,1,38.0,1,0,71.2833,1,0,False
2,3,26.0,0,0,7.925,0,0,False
3,1,35.0,1,0,53.1,0,0,False
4,3,35.0,0,0,8.05,0,0,True


In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Feature column names (every column except the target); reused later on.
cols = df.columns.drop('Survived').tolist()

# Feature frame and target array.
X = df[cols]
y = df['Survived'].to_numpy()

In [19]:
from sklearn.feature_selection import (
    SelectKBest,
    SelectFpr,
    chi2,
    SelectPercentile,
    GenericUnivariateSelect,
    f_classif,
)
knn = KNeighborsClassifier()

# One selector per scoring function, each keeping the 5 best-scoring features.
# `fit` returns the selector itself, so construction and fitting are chained.
skb_f = SelectKBest(f_classif, k=5).fit(X, y)
skb_chi2 = SelectKBest(chi2, k=5).fit(X, y)

# Tabulate both scores per feature, highest F-score first.
# NOTE(review): chi2 is meant for non-negative, count-like features; its score for a
# continuous column such as Fare depends on the column's scale, so compare with care.
(
    pd.DataFrame(
        [cols, list(skb_f.scores_), list(skb_chi2.scores_)],
        index=['feature', 'f_classif', 'chi2 score'],
    )
    .T
    .sort_values(by='f_classif', ascending=False)
)

Unnamed: 0,feature,f_classif,chi2 score
7,Male,372.406,92.7024
0,Pclass,115.031,30.8737
4,Fare,63.0308,4518.32
5,C,25.896,20.4644
3,Parch,5.96346,10.0975
1,Age,3.76153,21.6492
2,SibSp,1.11057,2.58187
6,Q,0.0118463,0.0108468


### Independent practice: Do this with RFECV and Logistic Regression

In [18]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression