In [80]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [81]:
# Load the Titanic training set. The DATA_PATH environment variable must name
# the directory that contains titanic/train.csv (KeyError if it is unset).
# os.path.join avoids hand-built separators in the path.
df = pd.read_csv(os.path.join(os.environ['DATA_PATH'], 'titanic', 'train.csv'))

In [82]:
# Peek at three randomly drawn rows; no seed is passed, so the rows differ per run.
df.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
110,111,0,1,"Porter, Mr. Walter Chamberlain",male,47.0,0,0,110465,52.0,C110,S
838,839,1,3,"Chip, Mr. Chang",male,32.0,0,0,1601,56.4958,,S
480,481,0,3,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S


In [83]:
# Summary statistics of the numeric columns, flipped so each column is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [84]:
# Storage dtype of every column.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [85]:
# Per-column non-null counts and dtypes; shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [86]:
# Model input columns, grouped by the kind of preprocessing each group receives.
feature_type_map = dict(
    categorical=['Sex', 'Pclass'],
    numerical=['Age', 'Fare'],
)

In [73]:
# Preprocessing for the model inputs: each feature group listed in
# feature_type_map gets its own impute-then-transform pipeline, and every other
# column of the input frame is discarded (remainder='drop').
ct = ColumnTransformer(
    transformers=[
        # Categorical columns: fill gaps with the most frequent value, then
        # one-hot encode (handle_unknown='ignore' tolerates unseen categories).
        ('cat', Pipeline([
            ('impute', SimpleImputer(strategy='most_frequent')), 
            ('trans', OneHotEncoder(handle_unknown='ignore'))]),
         feature_type_map['categorical']),
        # Numerical columns: fill gaps with the most frequent value, then standardize.
        # NOTE(review): 'most_frequent' on continuous columns (Age, Fare) looks like
        # a copy of the categorical branch; 'median' is the usual choice — confirm.
        ('num', Pipeline([
            ('impute', SimpleImputer(strategy='most_frequent')), 
            ('trans', StandardScaler())]),
         feature_type_map['numerical'])],
    remainder='drop')

In [95]:
def print_accuracy(model, X, y, partition='train'):
    """Print the classification accuracy of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature matrix handed to ``model.predict``.
    y : array-like
        True labels compared against the predictions.
    partition : str, default 'train'
        Label for the data split, used only in the printed message.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Predict with the estimator that was passed in, not a notebook-global
    # model, so the function scores whatever the caller hands it.
    accuracy = accuracy_score(y, model.predict(X))
    print(f"{partition} accuracy is {accuracy:.2f}.")

In [100]:
# Hold out a test partition (train_test_split's default size). The fixed
# random_state makes the split, and every score derived from it, reproducible
# across kernel restarts.
train, test = train_test_split(df, random_state=42)

In [101]:
# Fit the preprocessing on the training partition only, then transform it.
X_train = ct.fit_transform(train)
# Target labels. 'Survived' is not listed in feature_type_map, so with
# remainder='drop' it is not part of X_train.
y_train = train['Survived']

In [102]:
# Baseline model: logistic regression trained on all preprocessed features.
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
print_accuracy(lr, X_train, y_train, 'train')

train accuracy is 0.80.


In [103]:
# Apply the already-fitted transformer to the held-out rows (transform only,
# no refit), then score the model on them.
X_test = ct.transform(test)
y_test = test['Survived']
print_accuracy(lr, X_test, y_test, 'test')

test accuracy is 0.78.


In [120]:
from sklearn.feature_selection import chi2, f_classif

In [122]:
# Chi-squared statistics and p-values for the first five columns of X_train.
# NOTE(review): presumably these are the one-hot columns for Sex and Pclass,
# since the 'cat' transformer comes first in ct — confirm the column count.
chi2(X_train[:,:5], y_train)

(array([126.59946113,  68.70871447,  37.30925708,  10.17777613,
         35.76227477]),
 array([2.27318732e-29, 1.14135451e-16, 1.00804975e-09, 1.42143519e-03,
        2.22922998e-09]))

In [121]:
# ANOVA F-statistics and p-values for the columns from index 5 onward.
# NOTE(review): presumably the scaled Age and Fare columns, in that order — confirm.
f_classif(X_train[:,5:], y_train)

(array([ 2.43732235, 45.53931028]), array([1.18953341e-01, 3.25531692e-11]))

In [131]:
# Column indices to keep: 0-4 and 6, i.e. everything except column 5.
# NOTE(review): column 5 is presumably Age, the first column scored by
# f_classif above (p ~ 0.12) — confirm. `rng` holds indices, not a random generator.
rng = [0,1,2,3,4,6]
# Refit on the reduced feature set; this rebinds the notebook-global `lr`.
lr = LogisticRegression(solver='lbfgs').fit(X_train[:, rng], y_train)
print_accuracy(lr, X_train[:, rng], y_train, 'train')
print_accuracy(lr, X_test[:, rng], y_test, 'test')

train accuracy is 0.79.
test accuracy is 0.78.


In [132]:
from sklearn.feature_selection import SelectFromModel

In [139]:
# Model-based feature selection: wrap an unfitted logistic regression in a
# selector capped at three features.
# NOTE(review): threshold is left at its default, so fewer than three features
# may be kept; scikit-learn's docs suggest threshold=-np.inf to select purely
# by max_features — confirm the intent.
clf = LogisticRegression(solver='lbfgs')
sfm = SelectFromModel(estimator=clf, max_features=3)

In [140]:
# Fit the selector, which trains the wrapped estimator, on the full training matrix.
sfm.fit(X_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='warn',
                                             n_jobs=None, penalty='l2',
                                             random_state=None, solver='lbfgs',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=3, norm_order=1, prefit=False, threshold=None)