# building predictive models 

In [1]:
import pandas as pd 
import os
import numpy as np 


# import data 


In [5]:
# set the path of the data 
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed') # processed data folder
train_file_path = os.path.join(processed_data_path, 'train.csv') # path to train.csv
test_file_path = os.path.join(processed_data_path, 'test.csv') # path to test.csv

In [6]:
train_df = pd.read_csv(train_file_path, index_col='PassengerId')
test_df = pd.read_csv(test_file_path, index_col='PassengerId')

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

# prepare data 

In [18]:
    # [:, 'Age':], means take all [rows, columns] take all rows and all columns after age
# x = train_df.loc[:, 'Age' : ].as_matrix().astype('float')  # use .values instead of as_matrix, as matrix 'future warning' 
x = train_df.loc[:, 'Age' : ].values 
y = train_df['Survived'].ravel()

In [22]:
print x.shape, y.shape 
#(rows, columns), (one dim array)

(891L, 32L) (891L,)


In [26]:
# train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0) # remember to set the random_state to same number to get similar arrays 

print x_train.shape, y_train.shape
print x_test.shape, y_test.shape

(712L, 32L) (712L,)
(179L, 32L) (179L,)


In [27]:
# check the average rate of the y train and test we want the mean to be similar for the train and test
print 'survival rate train: {0:3f}'.format(np.mean(y_train))
print 'survival rate  test: {0:3f}'.format(np.mean(y_test))

survival rate train: 0.383427
survival rate  test: 0.385475


In [29]:
# Check scikit-learn version 
import sklearn
sklearn.__version__ # must be higher than 0.19, can be update with '!conda update -y scikit-learn'

'0.19.1'

# base line model 

In [30]:
from sklearn.dummy import DummyClassifier

In [31]:
# create the model 
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [32]:
# train the model
            # .fit(input_data, output_data)
model_dummy.fit(x_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [34]:
# evaluate the model performace 
print 'score for baseline model: {0:.3f}'.format(model_dummy.score(x_test, y_test)) #accuracy very important, baseline accuracy

score for baseline model: 0.615


In [38]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [39]:
# accuracy score
print 'accuracy baseline model: {0:.3f}'.format(accuracy_score(y_test, model_dummy.predict(x_test)))

accuracy baseline model: 0.615


In [41]:
# confusion matrix
print 'confusion matrix: \n {0}'.format(confusion_matrix(y_test, model_dummy.predict(x_test)))

confusion matrix: 
 [[110   0]
 [ 69   0]]


In [43]:
# precision and recall scores
print 'baseline precision: {0:.3f}'.format(precision_score(y_test, model_dummy.predict(x_test)))
print 'recall for baseline:{0:.3f}'.format(recall_score(y_test, model_dummy.predict(x_test)))

baseline precision: 0.000
recall for baseline:0.000


# kaggle submission

In [47]:
# convert test df to a matrix 
test_x = test_df.values

In [49]:
# get the predictions 
predictions = model_dummy.predict(test_x)

In [51]:
# create a df with n rows and 2 columns column 1 = passenger id, 2 = prediction
df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions })

In [53]:
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [54]:
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')

In [57]:
# index = false means dont add aditional column with index
df_submission.to_csv(submission_file_path, index=False)

In [58]:
def get_submission_file (model, filename) : 
    # convert the matrix
    test_x = test_df.values
    # make the predictions
    predictions = model.predict(test_x)
    # submission df
    df_submission = pd.DataFrame({'PassengerId':test_df.index, 'Survived': predictions})
    #submission file path 
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, filename)
    # write to the disk
    df_submission.to_csv(submission_file_path, index=False)
    

In [59]:
# get the submission file 
get_submission_file(model_dummy, '01_dummy.csv')

# logistic regression 
 $$ \frac{1}{1+e^{-w^{T}x(i)}} = \mu(i)$$

In [61]:
# import the functinos 
from sklearn.linear_model import LogisticRegression

In [64]:
# create the model 
model_lr_1 = LogisticRegression(random_state=0)

In [66]:
# train the model 
model_lr_1.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [68]:
# evaluate the model
print 'score for log reggression - ver1: {0:.3f}'.format(model_lr_1.score(x_test, y_test)) 

score for log reggression - ver1: 0.827


In [74]:
# get performance metric 
# accuracy 
print 'score for log reg- v1: {0:.3f}'.format(accuracy_score(y_test, model_lr_1.predict(x_test)))
# confussion matrix 
print 'confusion matrix for log reg - v1: \n {0}'.format(confusion_matrix(y_test, model_lr_1.predict(x_test)))
print 'precision for log reg - v1: {0:.3f}'.format(precision_score(y_test, model_lr_1.predict(x_test)))
print 'recall for log reg - v1: {0:.3f}'.format(recall_score(y_test, model_lr_1.predict(x_test)))

score for log reg- v1: 0.827
confusion matrix for log reg - v1: 
 [[94 16]
 [15 54]]
precision for log reg - v1: 0.771
recall for log reg - v1: 0.783


In [75]:
# model coefficients, weights, paramaters
model_lr_1.coef_
model_lr_1

array([[-0.02842273,  0.00455452, -0.50009094,  0.61781307, -0.81392328,
         0.1284508 , -0.17281791, -0.39317831,  0.5215997 ,  1.09941224,
         0.40341218, -0.18345049, -0.30036038,  0.96533485,  0.48256742,
        -0.34483444,  0.28089579,  1.21761323,  0.56363966, -1.445863  ,
         1.07245565, -0.11273702, -0.47293649,  0.16255639,  0.24716919,
         0.28009465,  0.41324761,  0.4918353 ,  0.46198829,  0.14924424,
         0.3728352 ,  0.73023263]])

# Second submission on kaggle

In [79]:
get_submission_file(model_lr_1, '02_lr.csv')

In [78]:
#submit to kaggle using api 
!kaggle competitions submit -c titanic -f ../data/external/02_lr.csv -m 'logistic-regression-second-model'

Successfully submitted to Titanic: Machine Learning from Disaster


## model tuning 
- underfitting, overfitting
- regularization
- hyperparameter tuning: cross validation, k fold cross validation  

## feature engineering
- feature normalization

## model persistence 
## api

## hyperparameter optimization 

In [111]:
# base model 
model_lr = LogisticRegression(random_state=0)

In [112]:
from sklearn.model_selection import GridSearchCV

In [113]:
parameters = {'C':[ 0.001, 0.01, 0.1 , 1.0 ,10.0, 100.0], 'penalty' : ['l1', 'l2']}
clf = GridSearchCV(model_lr, param_grid=parameters, cv=4)

In [114]:
clf.fit(x_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [115]:
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [116]:
print 'log reg v2 score: {0:.3f}'.format(clf.score(x_test, y_test))

log reg v2 score: 0.827


# make 3rd submission 

In [94]:
get_submission_file(clf, '03_lr.csv')

In [95]:
!kaggle competitions submit -c titanic -f ../data/external/03_lr.csv -m 'logistic-regression-3r-model'

Successfully submitted to Titanic: Machine Learning from Disaster


# feature normalization and standardization
    - normalization scale all the features to some scale,  0, 1 and -1, 1 is very common
    - standardization standardize the feature such that all features have zero mean and 1 variance 

In [96]:
# minmaxscaler is used for normalization, standardscaler is used for stardarization 
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [118]:
# feature normalization
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [119]:
# now print the min and max, since we scaled from 0,1 then the min should be zero and the max should be one
x_train_scaled[:,0].min(), x_train_scaled[:,0].max()

(0.0, 1.0)

In [120]:
# feature normalization
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [121]:
# create a model after standarizing it
parameters = {'C':[ 0.001, 0.01, 0.1 , 1.0 ,10.0, 100.0], 'penalty' : ['l1', 'l2']}
clf = GridSearchCV(model_lr, param_grid=parameters, cv=4)
clf.fit(x_train_scaled, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [122]:
clf.best_score_


0.8258426966292135

In [123]:
clf.best_params_

{'C': 0.1, 'penalty': 'l1'}

In [124]:
print 'score for log reg v 2: {0:.3f}'.format(clf.score(x_test_scaled, y_test))

score for log reg v 2: 0.793


# model persistance
- save the model to the disk so that it can be used later 

In [125]:
# import pickle library 
import pickle

In [132]:
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

In [133]:
# open the files to write here we will persist two files 
model_file_pickle = open(model_file_path, 'wb')  # wb write bianary 
scaler_file_pickle = open(scaler_file_path, 'wb')

In [134]:
pickle.dump(clf, model_file_pickle)
pickle.dump(scaler, scaler_file_pickle)

In [135]:
model_file_pickle.close()
scaler_file_pickle.close()

## load the persisted files 

In [139]:
# open the files in read mode 
model_file_pickle = open(model_file_path, 'r')
scaler_file_pickle = open(scaler_file_path, 'r')

In [140]:
#load files
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)
# close the file 
model_file_pickle.close()
scaler_file_pickle.close()

In [141]:
clf_loaded

GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [142]:
scaler_loaded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [144]:
x_test_scaled = scaler_loaded.transform(x_test)
print 'score for persisted model: {0:.3f}'.format(clf_loaded.score(x_test_scaled, y_test))

score for persisted model: 0.793
