In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
processed_data_path = os.path.join(os.path.pardir,'data','processed')
train_file_path = os.path.join(processed_data_path,'train.csv')
test_file_path = os.path.join(processed_data_path,'test.csv')

In [4]:
train_df=pd.read_csv(train_file_path,index_col='PassengerId')
test_df=pd.read_csv(test_file_path,index_col='PassengerId')

In [7]:
train_df.info(),test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

(None, None)

In [8]:
#now we prepare data:

X= train_df.loc[:,'Age':].as_matrix().astype('float')
y=train_df['Survived'].ravel()

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
print(X.shape,y.shape)

(891, 32) (891,)


In [12]:
# testing train split will be used for the training model:

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(712, 32) (179, 32)
(712,) (179,)


In [14]:
print('Mean Survival in Train: {0: .3f}'.format(np.mean(y_train)))
print('Mean Survival in Test: {0: .3f}'.format(np.mean(y_test)))
# so we can see that the mean is approx to 38%

Mean Survival in Train:  0.383
Mean Survival in Test:  0.385


In [16]:
#Scikit-Learning
import sklearn

In [17]:
sklearn.__version__

'0.19.1'

In [18]:
# Preparing BaseLine Model for 1st Kaggle Submission
from sklearn.dummy import DummyClassifier

In [20]:
# prepare model
dummy_model = DummyClassifier(strategy='most_frequent',random_state=0)
# Train the model
dummy_model.fit(X_train,y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [26]:
print('Score for Baseline Model : {0: .2f}'.format(dummy_model.score(X_test,y_test)))
# This score is same for Accuracy too
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
# Confusion Matrix
print('confusion matrix for baseline model: \n {0}'.format(confusion_matrix(y_test, dummy_model.predict(X_test))))
# Accuracy for Model
print('Accuracy for Baseline Model : {0: .2f}'.format(accuracy_score(y_test,dummy_model.predict(X_test))))
#precision
print('Precision for BaseLine Model: {0: .2f}'.format(precision_score(y_test,dummy_model.predict(X_test))))
#recall Score
print('Recall Score for BaseLine model is: {0: .2f}'.format(recall_score(y_test,dummy_model.predict(X_test))))

Score for Baseline Model :  0.61
confusion matrix for baseline model: 
 [[110   0]
 [ 69   0]]
Accuracy for Baseline Model :  0.61
Precision for BaseLine Model:  0.00
Recall Score for BaseLine model is:  0.00


  'precision', 'predicted', average, warn_for)


In [30]:
# Now we will make Kaggle Submission
# 1. Convert it to the Matrix
test_X = test_df.as_matrix().astype('float')
# 2. Get predictions
predictions = dummy_model.predict(test_X)
# 3. Create Submission DATA FRAME, mapping the values with Passenger ID's
df_submission = pd.DataFrame({'PassengerId': test_df.index,'Survived':predictions})
# 4. Check the table
df_submission.head()
# 5. Create Paths
submission_data_path=os.path.join(os.path.pardir,'data','external')
submission_file_path=os.path.join(submission_data_path,'baseline_model.csv')
df_submission.to_csv(submission_file_path,index=False)

  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
# creating the function for the Above work done:
def get_submission_file(model,filename):
    test_X = test_df.as_matrix().astype('float')
    predictions = model.predict(test_X)
    df_sub = pd.DataFrame({'PassengerId': test_df.index,'Survived':predictions})
    submission_data_path = os.path.join(os.path.pardir,'data','external')
    submission_file_path = os.path.join(submission_data_path,filename)
    df_sub.to_csv(submission_file_path,index=False)

In [40]:
# get the file
get_submission_file(dummy_model,'baseline_model.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
# Now we will use Logistic Regression Model to improve over our BaseLine model result
#import func
from sklearn.linear_model import LogisticRegression
#create Model
model_logistic_reg = LogisticRegression(random_state=0)
#train model
model_logistic_reg.fit(X_train,y_train)

#evaluate this model
print('Score : {0: .2f}'.format(model_logistic_reg.score(X_test,y_test)))
# now look on performance metrics:
#1. Accuracy:
print('Accuracy for LR: {0: .2f}'.format(accuracy_score(y_test,model_logistic_reg.predict(X_test))))
#2. confusion matrix
print('confusion matrix for logistic regression - version 1: \n {0}'.format(confusion_matrix(y_test, model_logistic_reg.predict(X_test))))
#3. precision 
print('precision for logistic regression - version 1 : {0:.2f}'.format(precision_score(y_test, model_logistic_reg.predict(X_test))))
#4. Recall 
print('recall for logistic regression - version 1 : {0:.2f}'.format(recall_score(y_test, model_logistic_reg.predict(X_test))))

# Here the score is 83% and in BaseLine was 61%......Hence we can see the improvement significantly

Score :  0.83
Accuracy for LR:  0.83
confusion matrix for logistic regression - version 1: 
 [[95 15]
 [15 54]]
precision for logistic regression - version 1 : 0.78
recall for logistic regression - version 1 : 0.78


In [49]:
model_logistic_reg.coef_
#These are weights/parameters for the model(b0 & b1)
# y=b0+b1*age

array([[-0.02840734,  0.00455631, -0.50017004,  0.61922838, -0.81414743,
         0.12823264, -0.17253859, -0.39355488,  0.52215008,  1.09939125,
         0.40346551, -0.18369316, -0.30021028,  0.96558544,  0.48281794,
        -0.3451608 ,  0.28258585,  1.21850069,  0.56334183, -1.44612507,
         1.07146232, -0.11345497, -0.47306807,  0.16297326,  0.24746349,
         0.27998252,  0.4128233 ,  0.49202884,  0.46214499,  0.14906873,
         0.37253571,  0.73070686]])

In [50]:
# Kaggle Submission using Logistic regression
get_submission_file(model_logistic_reg,'logistic_regression_model.csv')

  This is separate from the ipykernel package so we can avoid doing imports until
