In [196]:
## with cross validation

from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
import sklearn.preprocessing as preprocessing
import pandas as pd

class data_util:
    def set_missing_fare(self, df):
        df.loc[(df.Fare.isnull()), 'Fare'] = 0
        
    def set_missing_ages(self, df):
        # get numerial fields
        age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        
        known_ages = age_df[age_df.Age.notnull()].as_matrix()
        unknown_ages = age_df[age_df.Age.isnull()].as_matrix()
        
        y = known_ages[:, 0]
        x = known_ages[:, 1:]
        
        rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
        rfr.fit(x, y)
        
        predictedAges = rfr.predict(unknown_ages[:, 1:])
        df.loc[(df.Age.isnull()), 'Age'] = predictedAges
        
        return df, rfr
    
    def change_cabin_to_enum(self, df):
        # the order matters here, MUST check NOT null, then check is null!!!
        df.loc[(df.Cabin.notnull()), 'Cabin'] = 'Yes'
        df.loc[(df.Cabin.isnull()), 'Cabin'] = 'No'
        return df
    
    def replace_with_dummies(self, df):
        dummies_Age = pd.get_dummies(df['AgeCat'], prefix = 'AgeCat')
        dummies_Cabin = pd.get_dummies(df['Cabin'], prefix = 'Cabin')
        dummies_Pclass = pd.get_dummies(df['Pclass'], prefix = 'Pclass')
        dummies_Sex = pd.get_dummies(df['Sex'], prefix = 'Sex')
        dummies_Embarked = pd.get_dummies(df['Embarked'], prefix = 'Embarked')
        
        data_to_train = pd.concat([df, dummies_Age, dummies_Cabin, dummies_Pclass, dummies_Sex, dummies_Embarked], axis=1)
        # remove Ticket as well, because it is useless
        data_to_train.drop(['Cabin', 'Pclass', 'Sex', 'Embarked', 'Ticket'], axis=1, inplace=True)
        
        return data_to_train
    
    def scale_fields(self, df):
        scaler = preprocessing.StandardScaler()
        age_scale_param = scaler.fit(df['Age'].values.reshape(-1,1))
        df['Age_scaled'] = scaler.fit_transform(df['Age'].values.reshape(-1,1), age_scale_param)

        fare_scale_param = scaler.fit(df['Fare'].values.reshape(-1,1))
        df['Fare_scaled'] = scaler.fit_transform(df['Fare'].values.reshape(-1,1), fare_scale_param)
        
        return df
    
    def get_features(self, df):
        return df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    
    def change_age_to_enum(self, df):
        df['AgeCat'] = df['Age']
        df.loc[ (df.Age <= 10) ,'AgeCat'] = 'child'
        df.loc[ (df.Age > 60),'AgeCat'] = 'old'
        df.loc[ (df.Age > 10) & (df.Age <=35) ,'AgeCat'] = 'youth'
        df.loc[ (df.Age > 35) & (df.Age <=60) ,'AgeCat'] = 'senior'
        return df
    
    # error rate are calculated on train data, which will has a column 'Survived' as the first column
    def get_error_rate(self, full_data_train, data_train, data_df, train_data_predictions):
        not_matched_passengerids = data_train[train_data_predictions != data_df.as_matrix()[:,0]]['PassengerId'].values
        bad_cases = full_data_train.loc[full_data_train['PassengerId'].isin(not_matched_passengerids)]
        return float(len(bad_cases))/float(len(data_train))
    

data_train = pd.read_csv('data/train.csv')
util = data_util()
data_train, rfr = util.set_missing_ages(data_train)
data_train = util.change_age_to_enum(data_train)
data_train = util.change_cabin_to_enum(data_train)
data_train = util.replace_with_dummies(data_train)
data_train = util.scale_fields(data_train)

## Define the linear model

from sklearn import linear_model

split_train, split_cv = cross_validation.train_test_split(data_train, test_size=0.3, random_state=0)
train_df = util.get_features(split_train)
train_df.columns

Index([u'Survived', u'SibSp', u'Parch', u'Cabin_No', u'Cabin_Yes', u'Pclass_1',
       u'Pclass_2', u'Pclass_3', u'Sex_female', u'Sex_male', u'Embarked_C',
       u'Embarked_Q', u'Embarked_S', u'Age_scaled', u'Fare_scaled'],
      dtype='object')

In [198]:
train_np = train_df.as_matrix()
y = train_np[:, 0]
X = train_np[:, 1:]

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf.fit(X, y)

# predict on validation
cv_df = util.get_features(split_cv)
cv_predictions = clf.predict(cv_df.as_matrix()[:, 1:])
clf.score(X, y)
#print(util.get_error_rate(data_train, split_cv,cv_df, cv_predictions))

0.81380417335473521

In [199]:
data_test = pd.read_csv('data/test.csv')
util.set_missing_fare(data_test)
data_test, rfr2 = util.set_missing_ages(data_test)
data_test = util.change_age_to_enum(data_test)
data_test = util.change_cabin_to_enum(data_test)

data_test = util.replace_with_dummies(data_test)
data_test = util.scale_fields(data_test)
test_df = util.get_features(data_test)

print(test_df.columns)
predictions = clf.predict(test_df.as_matrix())

import numpy as np 

result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(), 'Survived': predictions.astype(np.int32)})
result.to_csv("lr_2_predict_result.csv", index=False)
print(pd.DataFrame({'columns': list(train_df.columns)[1:], 'coef': list(clf.coef_.T)}))

Index([u'SibSp', u'Parch', u'Cabin_No', u'Cabin_Yes', u'Pclass_1', u'Pclass_2',
       u'Pclass_3', u'Sex_female', u'Sex_male', u'Embarked_C', u'Embarked_Q',
       u'Embarked_S', u'Age_scaled', u'Fare_scaled'],
      dtype='object')
                 coef      columns
0    [-0.41553532724]        SibSp
1   [-0.108981344433]        Parch
2               [0.0]     Cabin_No
3    [0.718766623275]    Cabin_Yes
4    [0.373723189817]     Pclass_1
5               [0.0]     Pclass_2
6    [-1.18662592747]     Pclass_3
7     [2.08339799372]   Sex_female
8   [-0.555832812359]     Sex_male
9               [0.0]   Embarked_C
10              [0.0]   Embarked_Q
11  [-0.452048639152]   Embarked_S
12   [-0.54429156105]   Age_scaled
13  [0.0598811392978]  Fare_scaled


In [204]:
from sklearn.ensemble import BaggingRegressor
train_df = util.get_features(data_train)
train_np = train_df.as_matrix()

y = train_np[:, 0]
x = train_np[:, 1:]

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(x, y)
bagging_clf.score(x, y)

0.26881794650560809