In [12]:
## with cross validation

from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
import sklearn.preprocessing as preprocessing

class data_util:
    """Feature-engineering helpers for the Titanic passenger DataFrame.

    Every method takes a DataFrame and hands a DataFrame back (alone, or as
    the first element of a tuple), so the steps can be applied one after
    another. Several methods modify the frame they are given in place.
    """

    def set_missing_fare(self, df):
        """Replace missing ``Fare`` values with 0, in place.

        Returns the same DataFrame so the call composes like its siblings.
        """
        df.loc[df.Fare.isnull(), 'Fare'] = 0
        return df

    def set_missing_ages(self, df):
        """Fill missing ``Age`` values with random-forest predictions, in place.

        A regressor is fitted on the rows whose age is known, using Fare,
        Parch, SibSp and Pclass as inputs, and then used to predict the ages
        that are missing.

        Returns:
            (df, rfr): the modified DataFrame and the fitted regressor.
        """
        # numerical fields; Age comes first so it can be split off as the target
        age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

        age_missing = age_df.Age.isnull()
        # .values instead of as_matrix(), which was removed in pandas 1.0
        known_ages = age_df[~age_missing].values
        unknown_ages = age_df[age_missing].values

        y = known_ages[:, 0]
        x = known_ages[:, 1:]

        rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
        rfr.fit(x, y)

        # predict() rejects an empty array, so skip it when nothing is missing
        if len(unknown_ages) > 0:
            predictedAges = rfr.predict(unknown_ages[:, 1:])
            df.loc[age_missing, 'Age'] = predictedAges

        return df, rfr

    def change_cabin_to_enum(self, df):
        """Collapse ``Cabin`` to 'Yes' (a value is present) or 'No' (missing), in place."""
        # Take the mask once, before any write, so the result does not depend
        # on the order of the two assignments below.
        has_cabin = df.Cabin.notnull()
        df.loc[has_cabin, 'Cabin'] = 'Yes'
        df.loc[~has_cabin, 'Cabin'] = 'No'
        return df

    def replace_with_dummies(self, df):
        """One-hot encode Cabin, Pclass, Sex and Embarked.

        Returns a new DataFrame with the indicator columns appended and the
        four source columns, plus Ticket, removed. ``df`` itself is untouched.
        """
        # pandas >= 2.0 emits bool indicators by default; cast to uint8 (the
        # older default) so a later .values call yields a numeric matrix
        # rather than an object array.
        dummies_Cabin = pd.get_dummies(df['Cabin'], prefix='Cabin').astype('uint8')
        dummies_Pclass = pd.get_dummies(df['Pclass'], prefix='Pclass').astype('uint8')
        dummies_Sex = pd.get_dummies(df['Sex'], prefix='Sex').astype('uint8')
        dummies_Embarked = pd.get_dummies(df['Embarked'], prefix='Embarked').astype('uint8')

        data_to_train = pd.concat(
            [df, dummies_Cabin, dummies_Pclass, dummies_Sex, dummies_Embarked], axis=1)
        # remove Ticket as well, because it is useless
        return data_to_train.drop(['Cabin', 'Pclass', 'Sex', 'Embarked', 'Ticket'], axis=1)

    def scale_fields(self, df):
        """Add standardised ``Age_scaled`` and ``Fare_scaled`` columns, in place.

        Each column gets its own StandardScaler, fitted on the values of the
        frame that was passed in.
        """
        for field in ('Age', 'Fare'):
            scaler = preprocessing.StandardScaler()
            column = df[field].values.reshape(-1, 1)
            # a single fit_transform is enough; ravel() stores a 1-D column
            df[field + '_scaled'] = scaler.fit_transform(column).ravel()

        return df

    def get_features(self, df):
        """Return only the label and model-input columns, selected by name pattern."""
        return df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

In [13]:
import pandas as pd

# Load the training set from a path relative to the notebook's working directory.
data_train = pd.read_csv('data/train.csv')

In [14]:
util = data_util()

# Impute ages first: this step also returns the fitted regressor.
data_train, rfr = util.set_missing_ages(data_train)

# The remaining steps each map a DataFrame to a DataFrame, so chain them.
data_train = (
    data_train
    .pipe(util.change_cabin_to_enum)
    .pipe(util.replace_with_dummies)
    .pipe(util.scale_fields)
)

In [15]:
## Define the linear model

from sklearn import linear_model

# sklearn.cross_validation was removed in scikit-learn 0.20; use its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
split_train, split_cv = train_test_split(data_train, test_size=0.3, random_state=0)
train_df = util.get_features(split_train)
# .values instead of as_matrix(), which was removed in pandas 1.0
train_np = train_df.values
y = train_np[:, 0]   # first selected column is the 'Survived' label
X = train_np[:, 1:]  # every remaining selected column is a model input

# An L1 penalty needs a solver that supports it. Name liblinear explicitly
# (the solver shown in the recorded repr) instead of relying on the library
# default, which has changed between scikit-learn releases.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
clf.fit(X, y)

clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [16]:
# Score the held-out split and pull up the original rows the model got wrong.
cv_df = util.get_features(split_cv)
# Build the matrix once; .values replaces as_matrix(), which was removed in pandas 1.0.
cv_np = cv_df.values
cv_predictions = clf.predict(cv_np[:, 1:])

# Re-read the untouched CSV so the misclassified passengers are shown with
# their original (un-encoded, un-imputed) fields.
origin_data_train = pd.read_csv('data/train.csv')
not_matched_passengerids = split_cv[cv_predictions != cv_np[:, 0]]['PassengerId'].values
bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(not_matched_passengerids)]
bad_cases

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S
55,56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S
65,66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
68,69,1,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,S
85,86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu...",female,33.0,3,0,3101278,15.85,,S
113,114,0,3,"Jussila, Miss. Katriina",female,20.0,1,0,4136,9.825,,S
140,141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C
204,205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18.0,0,0,A/5 3540,8.05,,S
240,241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C


In [17]:
# Fraction of held-out rows that were misclassified. len(cv_df) is the row
# count, so no matrix conversion (as_matrix() is gone in pandas 1.0) is needed.
float(len(not_matched_passengerids)) / float(len(cv_df))

0.1865671641791045

In [18]:
data_test = pd.read_csv('data/test.csv')

# Fare is filled before age imputation because Fare is one of the regressor's inputs.
util.set_missing_fare(data_test)

# NOTE: this fits a fresh regressor (rfr2) on the test rows, and scale_fields
# below fits fresh scalers on them too; nothing fitted on the training set is reused.
data_test, rfr2 = util.set_missing_ages(data_test)

data_test = (
    data_test
    .pipe(util.change_cabin_to_enum)
    .pipe(util.replace_with_dummies)
    .pipe(util.scale_fields)
)
test_df = util.get_features(data_test)


In [19]:
# The model was fitted on a bare array whose columns follow train_df (minus
# the leading label column), so the test features must line up one-to-one.
assert list(test_df.columns) == list(train_df.columns)[1:], \
    "test features do not match the training feature columns"
# Pass a plain array, the same way the model was fitted and validated.
predictions = clf.predict(test_df.values)

In [20]:
import numpy as np 

# Build the submission table: one row per test passenger, prediction as int32.
# .values replaces Series.as_matrix(), which was removed in pandas 1.0.
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})

result.to_csv("lr_2_predict_result.csv", index=False)

In [21]:
# Pair each feature name with its fitted coefficient. The first train_df column
# is skipped because it was used as the label y. Each row of clf.coef_.T is a
# one-element array, which is why the 'coef' cells display in brackets.
pd.DataFrame({'columns': list(train_df.columns)[1:], 'coef': list(clf.coef_.T)})

Unnamed: 0,coef,columns
0,[-0.415535468778],SibSp
1,[-0.108981136163],Parch
2,[0.0],Cabin_No
3,[0.718766815703],Cabin_Yes
4,[0.373722077374],Pclass_1
5,[0.0],Pclass_2
6,[-1.18662702642],Pclass_3
7,[2.0833997602],Sex_female
8,[-0.555831204325],Sex_male
9,[0.0],Embarked_C
