In [1]:
import pandas as pd
import numpy as numpy
import scipy as sp
import sklearn
import warnings
import pickle
# Suppress every warning category for the rest of the session
warnings.filterwarnings(action='ignore', category=Warning)

In [2]:
from sklearn import linear_model,model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,classification_report

In [3]:
# Load the two input CSV files: train.csv -> data_train, test.csv -> data_val
data_train, data_val = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/train.csv', './input/test.csv')
)

In [4]:
# Combine the train and validation/test datasets so the preprocessing below is
# applied to both at once. The 'test' flag marks where each row came from
# (0 = train.csv, 1 = test.csv).
data_train['test'] = 0
data_val['test'] = 1

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. Row labels are kept as-is (no
# ignore_index), matching what append() produced.
data_all = pd.concat([data_train, data_val])

In [5]:
# Number of missing values in each column of the combined frame
null_counts = data_all.isnull().sum()
print('data columns with null values:\n', null_counts)

data columns with null values:
 PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
test              0
dtype: int64


In [6]:
#preprocessing
# Each fill is assigned back to its column. Calling fillna(..., inplace=True) on
# data_all['col'] is chained assignment: it acts on an intermediate Series, which
# pandas 2.x flags with a FutureWarning and which no longer updates data_all once
# Copy-on-Write is enabled.
# NOTE: the median/mode values are computed over train and test rows combined.

#complete missing age with median
data_all['Age'] = data_all['Age'].fillna(data_all['Age'].median())

#complete embarked with mode
data_all['Embarked'] = data_all['Embarked'].fillna(data_all['Embarked'].mode()[0])

#complete missing fare with median
data_all['Fare'] = data_all['Fare'].fillna(data_all['Fare'].median())

#delete the cabin feature/column and others previously stated to exclude in train dataset
drop_column = ['PassengerId','Cabin', 'Ticket']
data_all = data_all.drop(columns=drop_column)

#Discrete variables
# FamilySize = SibSp + Parch + 1 (the +1 counts the passenger)
data_all['FamilySize'] = data_all['SibSp'] + data_all['Parch'] + 1

# IsAlone: start at 1 (yes, alone), then set 0 where the family size is greater than 1.
data_all['IsAlone'] = 1
# A single .loc call on the frame does the write. The previous form,
# data_all['IsAlone'].loc[...] = 0, assigned into an intermediate Series
# (chained assignment) and is not guaranteed to reach data_all.
data_all.loc[data_all['FamilySize'] > 1, 'IsAlone'] = 0

#quick and dirty code split title from name: http://www.pythonforbeginners.com/dictionary/python-split
# Takes the text after ", " and keeps what comes before the first "."
data_all['Title'] = data_all['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]


# Binning of continuous variables.
# qcut vs cut: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut

# FareBin: 4 frequency (quantile) buckets via qcut
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
data_all['FareBin'] = pd.qcut(data_all['Fare'], q=4)

# AgeBin: 5 value buckets via cut, computed on Age cast to int
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
data_all['AgeBin'] = pd.cut(data_all['Age'].astype(int), bins=5)

#cleanup rare title names
#print(data_all['Title'].value_counts())
stat_min = 10 #while small is arbitrary, we'll use the common minimum in statistics: http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/
title_names = (data_all['Title'].value_counts() < stat_min) #this will create a true false series with title name as index

# Titles that occur fewer than stat_min times (index entries where title_names is True)
rare_titles = title_names[title_names].index

# Replace rare titles with 'Misc' in one vectorized step. This replaces the
# per-row apply/lambda with its title_names.loc[x] lookup, and does not raise
# KeyError for a value that is absent from the value_counts index.
data_all['Title'] = data_all['Title'].mask(data_all['Title'].isin(rare_titles), 'Misc')


In [7]:
#code categorical data
# One LabelEncoder instance is re-fitted for each column in turn, so after this
# cell `label` is fitted on the last column processed (FareBin).
label = LabelEncoder()
for code_col, source_col in [
    ('Sex_Code', 'Sex'),
    ('Embarked_Code', 'Embarked'),
    ('Title_Code', 'Title'),
    ('AgeBin_Code', 'AgeBin'),
    ('FareBin_Code', 'FareBin'),
]:
    data_all[code_col] = label.fit_transform(data_all[source_col])

In [8]:
# Label column, kept as a one-element list
Target = ['Survived']

# Feature columns passed to the model
data_all_x_cols = [
    'Sex_Code',
    'Pclass',
    'Embarked_Code',
    'Title_Code',
    'FamilySize',
    'AgeBin_Code',
    'FareBin_Code',
]

In [9]:
# Only useful if you have test and train data separate.
# Rows flagged test == 0 become data_train; rows flagged test == 1 become data_val.
data_train = data_all.loc[data_all['test'].eq(0)]
data_val = data_all.loc[data_all['test'].eq(1)]

In [10]:
# 5-fold cross-validation of LogisticRegressionCV on the training rows.
# data_train[Target] is a single-column DataFrame; passing it as y makes
# scikit-learn emit a DataConversionWarning (column vector instead of 1-D),
# so y is flattened to shape (n_samples,) first.
results = model_selection.cross_validate(
    linear_model.LogisticRegressionCV(),
    data_train[data_all_x_cols],
    data_train[Target].values.ravel(),
    cv=5,
)

In [11]:
# Display the cross-validation output: per-fold fit_time, score_time and test_score
results

{'fit_time': array([0.18117094, 0.16790986, 0.17059016, 0.16114211, 0.16341305]),
 'score_time': array([0.001333  , 0.0013051 , 0.0014441 , 0.00131202, 0.00135684]),
 'test_score': array([0.77653631, 0.78651685, 0.75842697, 0.79213483, 0.81460674])}

In [12]:
# Fit the final model on all training rows.
model = linear_model.LogisticRegressionCV()
# y is flattened to 1-D: data_train[Target] is a single-column DataFrame and
# would otherwise trigger scikit-learn's DataConversionWarning.
model.fit(data_train[data_all_x_cols], data_train[Target].values.ravel())

# Predictions on the same rows the model was fitted on, so any score computed
# from them measures training-set performance, not held-out performance.
predictions = model.predict(data_train[data_all_x_cols])


In [13]:
# Despite the "accuracy" label, this prints the full classification report
# for the predictions made on the training rows.
train_report = classification_report(data_train[Target], predictions)
print("accuracy: ", train_report)

accuracy:                precision    recall  f1-score   support

         0.0       0.82      0.86      0.84       549
         1.0       0.75      0.69      0.72       342

    accuracy                           0.79       891
   macro avg       0.78      0.77      0.78       891
weighted avg       0.79      0.79      0.79       891



In [14]:
# Persist the fitted model to disk with pickle.
name = "final_model.sav"
# The context manager flushes and closes the file handle once the dump is done;
# a bare open() passed straight to pickle.dump was never closed explicitly.
with open(name, 'wb') as model_file:
    pickle.dump(model, model_file)

In [15]:
# We will later predict on the test data using the API in testapi.ipynb