In [1]:
import os
import pandas as pd
import numpy as np

In [98]:
# Load the processed train and test sets, both indexed by passenger id
train_df = pd.read_csv('../processed/train.csv', index_col='PassengerId')
# The processed test file still has a 'Survived' column; drop it right after reading
test_df = pd.read_csv('../processed/test.csv', index_col='PassengerId').drop(['Survived'], axis=1)


In [128]:
# Summary statistics of the training frame (sanity check on the processed features)
train_df.describe()

Unnamed: 0,Survived,Age,Fare,FamilySize,IsMale,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Sir,Embarked_C,Embarked_Q,Embarked_S,AgeState_Adult,AgeState_Child
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,29.311639,32.204208,1.904602,0.647587,0.016835,0.05275,0.066218,0.037037,0.035915,...,0.209877,0.580247,0.141414,0.005612,0.010101,0.190797,0.08642,0.722783,0.868687,0.131313
std,0.486592,13.243668,49.693429,1.613459,0.47799,0.128725,0.223659,0.248802,0.188959,0.186182,...,0.407449,0.493796,0.348644,0.074743,0.100051,0.39315,0.281141,0.447876,0.337932,0.337932
min,0.0,0.42,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,22.0,7.9104,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,29.0,14.4542,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,35.5,31.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,1.0,80.0,512.3292,11.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Data preparation: build the feature matrix and the target vector.
# Features are every column from 'Age' through the last column, cast to float
# (.values is the replacement for the removed DataFrame.as_matrix()).
x = train_df.loc[:, 'Age':].values.astype('float')
# Series.ravel() is deprecated in recent pandas releases; .values returns the
# same 1-D array of labels.
y = train_df['Survived'].values


In [12]:
# Confirm the feature matrix and the target vector have the same number of rows
print (x.shape, y.shape)

(891, 27) (891,)


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Shapes of the feature matrices, then of the label vectors
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(712, 27) (179, 27)
(712,) (179,)


In [16]:
# Share of positive labels in each split (train first, then test)
for labels in (y_train, y_test):
    print(np.mean(labels))

0.38342696629213485
0.3854748603351955


In [19]:
#Build a baseline model
import sklearn
from sklearn.dummy import DummyClassifier

In [20]:
# Baseline model: always predicts the most frequent class seen during fit
dummy_model = DummyClassifier(strategy='most_frequent', random_state=0)

In [22]:
# Fit the baseline on the training split
dummy_model.fit(x_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [23]:
# Evaluate performance of the baseline on the held-out split
dummy_model.score(x_test, y_test)

0.6145251396648045

In [24]:
# Convert the Kaggle test dataframe into a float matrix for prediction

test_x = test_df.values.astype('float')

In [25]:
# Baseline predictions for the Kaggle test set
prediction = dummy_model.predict(test_x)

In [26]:
# Submission frame for Kaggle: one row per test passenger with its predicted label
submission_columns = {
    'PassengerId': test_df.index,
    'Survived': prediction,
}
sub_df = pd.DataFrame(submission_columns)

In [78]:
# NOTE(review): the recorded output shows 28 columns here while the training matrix has 27
# features; the execution counts are out of order, so this may be stale. Confirm they match.
test_x.shape

(418, 28)

In [29]:
# Preview the first rows of the submission frame
sub_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [31]:
# Store the baseline predictions in a CSV, without the dataframe index column
sub_df.to_csv('../external/01_dummy.csv', index=False)

In [32]:
# function to create the submission file
def get_submission_file(model, filename, output_dir='../external/'):
    """Predict on the notebook-level ``test_df`` and write a Kaggle submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``; it receives ``test_df``
        converted to a float matrix.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    output_dir : str, optional
        Directory the file is written to. Defaults to ``'../external/'``.

    Returns
    -------
    None
        The CSV is written with two columns, ``PassengerId`` (the index of
        ``test_df``) and ``Survived`` (the predictions of ``model``), and no
        index column.
    """
    # convert test dataframe to matrix
    # NOTE(review): the columns of test_df must line up with the features the
    # model was trained on; recorded outputs show 27 training features but 28
    # test columns, so confirm before trusting a submission.
    test_x = test_df.values.astype('float')
    # make predictions with the model that was passed in; the earlier version
    # misspelled this variable and silently used the notebook-level
    # `prediction` (the dummy baseline output) instead
    predictions = model.predict(test_x)
    # generate submission dataframe
    sub_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})

    # saving dataframe to csv
    path = os.path.join(output_dir, filename)
    sub_df.to_csv(path, index=False)

In [33]:
# Use the helper to write the submission file for the baseline model
get_submission_file(dummy_model, '01_dummy.csv')


### Logistic Regression Model

In [64]:
from sklearn.linear_model import LogisticRegression

In [71]:
# Create the logistic regression model (liblinear solver, fixed seed)
model_lr = LogisticRegression(random_state=0, solver='liblinear')

In [75]:
# Train the logistic regression model on the training split
model_lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [76]:
# Score the logistic regression model on the held-out split
model_lr.score(x_test, y_test)

0.8324022346368715

In [97]:
# Write the logistic regression submission; named '02_lr.csv' (was '02_dummy.csv')
# so the file name reflects the model that produced it
get_submission_file(model_lr, '02_lr.csv')

### Hyperparameter optimization

In [113]:
# Base estimator whose hyperparameters the grid search will tune
model_hyp = LogisticRegression(random_state=0, solver='liblinear') 

In [114]:
from sklearn.model_selection import GridSearchCV

In [118]:
# Candidate values for the grid search: C and the penalty type
parameters = {
    'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
    'penalty': ['l1', 'l2'],
}
# Try every combination with 3-fold cross-validation
clf = GridSearchCV(estimator=model_hyp, param_grid=parameters, cv=3)

In [119]:
# Run the grid search on the training split
clf.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [121]:
# Parameter combination selected by the grid search
clf.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [122]:
# Cross-validated score of the selected parameter combination
clf.best_score_

0.8356741573033708

In [123]:
# Score of the tuned model on the held-out split
clf.score(x_test, y_test)

0.8324022346368715

In [124]:
# Write the submission file for the grid-searched logistic regression
get_submission_file(clf, '03_lr.csv')