# Build Predictive Model

In [8]:
import pandas as pd
import os
import numpy as np

## Import Data

In [2]:
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_data_path = os.path.join(processed_data_path, 'train.csv')
test_data_path = os.path.join(processed_data_path, 'test.csv')

In [5]:
train_df = pd.read_csv(train_data_path, index_col='PassengerId')
test_df = pd.read_csv(test_data_path, index_col='PassengerId')

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 32 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Title_lady            891 non-null int64
Title_master          891 non-null int64
Title_miss            891 non-null int64
Title_mr              891 non-null int64
Title_mrs             891 non-null int64
Title_sir             891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-

In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 31 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Title_lady            418 non-null int64
Title_master          418 non-null int64
Title_miss            418 non-null int64
Title_mr              418 non-null int64
Title_mrs             418 non-null int64
Title_sir             418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 n

## Prepare Data

In [20]:
x = train_df.loc[:,'Age':].as_matrix().astype(float)
y = train_df['Survived'].ravel()

  """Entry point for launching an IPython kernel.


In [21]:
print x.shape, y.shape

(891L, 31L) (891L,)


In [24]:
# Train, test, split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print x_train.shape, x_test.shape, y_train.shape, y_test.shape

(712L, 31L) (179L, 31L) (712L,) (179L,)


In [27]:
print 'mean survival from train {0:.3f}'.format(np.mean(y_train))
print 'mean survival from train {0:.3f}'.format(np.mean(y_test))

mean survival from train 0.383
mean survival from train 0.385


In [32]:
# Check Sklean version
import sklearn
sklearn.__version__

'0.20.3'

## Build baseline model

In [34]:
from sklearn.dummy import DummyClassifier

In [37]:
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [38]:
model_dummy.fit(x_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [39]:
print 'score of the baseline model {0:.3f}'.format(model_dummy.score(x_test, y_test))

score of the baseline model 0.615


In [42]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [43]:
print 'accuracy score {0:.3f}'.format(accuracy_score(y_test, model_dummy.predict(x_test)))

accuracy score 0.615


In [47]:
print 'accuracy score \n{0}'.format(confusion_matrix(y_test, model_dummy.predict(x_test)))

accuracy score 
[[110   0]
 [ 69   0]]


## First Kaggle Submission

In [49]:
test_x = test_df.as_matrix().astype(float)

  """Entry point for launching an IPython kernel.


In [50]:
predictions = model_dummy.predict(test_x)

In [59]:
df_submissions = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})

In [64]:
df_submissions.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [63]:
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')

In [66]:
df_submissions.to_csv(submission_file_path, index=False)

In [67]:
# Function
def get_submission_file(model, file_name) :
    test_x = test_df.as_matrix().astype(float)
    predictions = model.predict(test_x)
    df_submissions = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, file_name)
    df_submissions.to_csv(submission_file_path, index=False)
    

In [68]:
get_submission_file(model_dummy, '01_dummy.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


## Logistic Regression Model

In [69]:
from sklearn.linear_model import LogisticRegression

In [70]:
model_lr_1 = LogisticRegression(random_state=0)

In [71]:
model_lr_1.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [72]:
print 'score {0:.3f}'.format(model_lr_1.score(x_test, y_test))

score 0.832


In [73]:
get_submission_file(model_lr_1, '02_lr.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


## Hyperparameter Optimization

In [75]:
model_lr = LogisticRegression(random_state=0)

In [76]:
from sklearn.model_selection import GridSearchCV

In [84]:
parameter = {'C': [0.5, 1.0, 1.5, 100.0, 1000.0], 'penalty': ['l1', 'l2']}
clf = GridSearchCV(model_lr, parameter)

In [85]:
clf.fit(x_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.5, 1.0, 1.5, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [86]:
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [88]:
clf.best_score_

0.8328651685393258

In [89]:
get_submission_file(clf, '03_lr.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


## Feature normalization

In [91]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [92]:
scaler = MinMaxScaler()

In [93]:
x_train_scaled = scaler.fit_transform(x_train)

In [96]:
x_train_scaled[:,0].min(), x_train_scaled[:,0].max()

(0.0, 1.0)

In [97]:
x_test_scaled = scaler.fit_transform(x_test)

In [98]:
clf.fit(x_train_scaled, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.5, 1.0, 1.5, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [99]:
clf.best_score_

0.8300561797752809

In [100]:
get_submission_file(clf, '04_lr.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


## Model Presistance

In [115]:
import pickle

In [116]:
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

In [117]:
model_file_pickle = open(model_file_path, 'wb')
scaler_file_pickle = open(scaler_file_path, 'wb')

In [118]:
pickle.dump(clf, model_file_pickle)
pickle.dump(scaler, scaler_file_pickle)

In [119]:
model_file_pickle.close()
scaler_file_pickle.close()

### Read pickle files

In [120]:
model_file_pickle = open(model_file_path, 'r')
scaler_file_pickle = open(scaler_file_path, 'r')

In [121]:
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)

In [122]:
scaler_loaded

MinMaxScaler(copy=True, feature_range=(0, 1))

In [125]:
x_test_scaled = scaler_loaded.transform(x_test)

In [126]:
clf_loaded.score(x_test_scaled, y_test)

0.8268156424581006