In [137]:
from sklearn.ensemble import RandomForestRegressor
import sklearn.preprocessing as preprocessing

class data_util:
    """Preprocessing helpers for the passenger DataFrames used in this notebook.

    Every method takes a pandas DataFrame and returns the processed frame
    (``set_missing_ages`` additionally returns the regressor it used).  The
    methods that fill in values (``set_missing_fare``, ``set_missing_ages``,
    ``change_cabin_to_enum``, ``scale_fields``) modify the frame they are
    given in place.
    """

    def set_missing_fare(self, df):
        """Replace missing ``Fare`` values with 0.

        Args:
            df: DataFrame with a ``Fare`` column; modified in place.

        Returns:
            The same DataFrame, for consistency with the other helpers.
        """
        df.loc[df['Fare'].isnull(), 'Fare'] = 0
        return df

    def set_missing_ages(self, df, rfr=None):
        """Fill missing ``Age`` values with random-forest predictions.

        Args:
            df: DataFrame with ``Age``, ``Fare``, ``Parch``, ``SibSp`` and
                ``Pclass`` columns; modified in place.
            rfr: Optional already-fitted regressor exposing ``predict``.
                When given it is used as-is (e.g. to impute a test set with
                the model fitted on the training set).  When omitted, a new
                ``RandomForestRegressor`` is fitted on the rows of ``df``
                whose age is known.

        Returns:
            Tuple ``(df, rfr)``: the filled frame and the regressor used.
        """
        # numerical fields; 'Age' (the regression target) must stay first
        age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_missing = age_df['Age'].isnull()

        if rfr is None:
            known_ages = age_df[~age_missing].values
            rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
            # column 0 is the target, the remaining columns are the features
            rfr.fit(known_ages[:, 1:], known_ages[:, 0])

        # Only predict when something is missing: predicting on an empty
        # feature matrix is an error rather than a no-op.
        if age_missing.any():
            unknown_ages = age_df[age_missing].values
            df.loc[age_missing, 'Age'] = rfr.predict(unknown_ages[:, 1:])

        return df, rfr

    def change_cabin_to_enum(self, df):
        """Collapse ``Cabin`` into a two-level flag: 'Yes' if present, else 'No'.

        Args:
            df: DataFrame with a ``Cabin`` column; modified in place.

        Returns:
            The same DataFrame.
        """
        # Build the flag from a single null mask so the result does not
        # depend on the order in which the two values are written.
        df['Cabin'] = df['Cabin'].notnull().map({True: 'Yes', False: 'No'})
        return df

    def replace_with_dummies(self, df):
        """One-hot encode ``Cabin``, ``Pclass``, ``Sex`` and ``Embarked``.

        Args:
            df: DataFrame containing those four columns plus ``Ticket``.

        Returns:
            A new DataFrame with the indicator columns appended and the four
            source columns and ``Ticket`` removed.
        """
        categorical = ['Cabin', 'Pclass', 'Sex', 'Embarked']
        # Cast to int so the indicators are 0/1 numbers on every pandas
        # version; boolean indicators would make ``.values`` an object array.
        dummies = [
            pd.get_dummies(df[column], prefix=column).astype(int)
            for column in categorical
        ]

        data_to_train = pd.concat([df] + dummies, axis=1)
        # remove Ticket as well, because it is not used as a feature
        return data_to_train.drop(categorical + ['Ticket'], axis=1)

    def scale_fields(self, df, scalers=None):
        """Add standardized ``Age_scaled`` and ``Fare_scaled`` columns.

        Args:
            df: DataFrame with ``Age`` and ``Fare`` columns; modified in place.
            scalers: Optional dict mapping ``'Age'`` / ``'Fare'`` to fitted
                scalers exposing ``transform``.  A column found in the dict
                is transformed with that scaler; a column not found gets a
                new ``StandardScaler`` fitted on ``df``, which is then stored
                in the dict.  Passing the same dict for the training and the
                test frame therefore scales both with the training statistics.

        Returns:
            The same DataFrame with the two scaled columns added.
        """
        if scalers is None:
            scalers = {}

        for column in ('Age', 'Fare'):
            # scalers expect a 2-D (n_samples, 1) array
            values = df[column].values.reshape(-1, 1)
            scaler = scalers.get(column)
            if scaler is None:
                scaler = preprocessing.StandardScaler().fit(values)
                scalers[column] = scaler
            # flatten back to 1-D before storing as a column
            df[column + '_scaled'] = scaler.transform(values).ravel()

        return df

    def get_features(self, df):
        """Select the label and model-feature columns by name pattern.

        Args:
            df: Fully preprocessed DataFrame.

        Returns:
            DataFrame restricted to the columns matching the regex below, in
            their original order.
        """
        return df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

In [138]:
import pandas as pd

# Load the training passengers from the CSV under data/.
TRAIN_CSV = 'data/train.csv'
data_train = pd.read_csv(TRAIN_CSV)

In [139]:
# Preprocess the training frame: impute ages first (this also hands back the
# fitted regressor), then run the remaining single-frame steps in order.
util = data_util()
data_train, rfr = util.set_missing_ages(data_train)

for step in (util.change_cabin_to_enum, util.replace_with_dummies, util.scale_fields):
    data_train = step(data_train)

# Keep only the label and the model features.
train_df = util.get_features(data_train)

train_df

Unnamed: 0,Survived,SibSp,Parch,Cabin_No,Cabin_Yes,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age_scaled,Fare_scaled
0,0,1,0,1,0,0,0,1,0,1,0,0,1,-0.561380,-0.502445
1,1,1,0,0,1,1,0,0,1,0,1,0,0,0.613171,0.786845
2,1,0,0,1,0,0,0,1,1,0,0,0,1,-0.267742,-0.488854
3,1,1,0,0,1,1,0,0,1,0,0,0,1,0.392942,0.420730
4,0,0,0,1,0,0,0,1,0,1,0,0,1,0.392942,-0.486337
5,0,0,0,1,0,0,0,1,0,1,0,1,0,-0.426384,-0.478116
6,0,0,0,0,1,1,0,0,0,1,0,0,1,1.787722,0.395814
7,0,3,1,1,0,0,0,1,0,1,0,0,1,-2.029569,-0.224083
8,1,0,2,1,0,0,0,1,1,0,0,0,1,-0.194333,-0.424256
9,1,1,0,1,0,0,1,0,1,0,1,0,0,-1.148655,-0.042956


In [140]:
## Define the linear model

from sklearn import linear_model

# Convert to a plain float matrix.  `.values` replaces `as_matrix()`, which
# is no longer available in current pandas releases.
train_np = train_df.values.astype(float)

# Column 0 is the Survived label; every other column is a feature.
y = train_np[:, 0]
X = train_np[:, 1:]

# Name the solver explicitly: the l1 penalty is only supported by some
# solvers, and liblinear is the one the recorded output shows was used.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(X, y)

clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [141]:
# Load the unlabelled passengers and fill the fare gap before age imputation,
# because Fare is one of the inputs to the age regressor.
data_test = pd.read_csv('data/test.csv')
util.set_missing_fare(data_test)
data_test, rfr2 = util.set_missing_ages(data_test)

# Remaining single-frame steps, applied in order.
for step in (util.change_cabin_to_enum, util.replace_with_dummies, util.scale_fields):
    data_test = step(data_test)

test_df = util.get_features(data_test)


In [142]:
# The model was fitted on a bare matrix, so features are matched purely by
# position: make sure the test columns line up with the training features
# (training column 0 is the label) before predicting.
assert list(test_df.columns) == list(train_df.columns)[1:], \
    "test features do not match the columns the model was trained on"

# Predict on the same kind of input the model was fitted with (a float matrix).
predictions = clf.predict(test_df.values.astype(float))

In [143]:
import numpy as np 

# Pair each passenger id with its predicted label.  `.values` replaces
# `as_matrix()`, which is no longer available in current pandas releases.
result = pd.DataFrame({
    'PassengerId': data_test['PassengerId'].values,
    'Survived': predictions.astype(np.int32),
})

# Write the predictions without the index column.
result.to_csv("lr_predict_result.csv", index=False)

In [144]:
# Pair every feature name (the label in column 0 is skipped) with its fitted
# coefficient; each coefficient entry is a one-element row of clf.coef_.T.
feature_names = list(train_df.columns)[1:]
coefficients = list(clf.coef_.T)
pd.DataFrame({'columns': feature_names, 'coef': coefficients})

Unnamed: 0,coef,columns
0,[-0.344235347189],SibSp
1,[-0.104915327247],Parch
2,[0.0],Cabin_No
3,[0.90210545334],Cabin_Yes
4,[0.341169454021],Pclass_1
5,[0.0],Pclass_2
6,[-1.19412332133],Pclass_3
7,[1.95655994588],Sex_female
8,[-0.677432441339],Sex_male
9,[0.0],Embarked_C
