In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import linear_model, model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
data_train = pd.read_csv('data/train.csv')
data_val = pd.read_csv('data/test.csv')

In [3]:
data_train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
data_val.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [5]:
#combining train and validation/test datasets
data_train['test'] = 0
data_val['test'] = 1

In [6]:
data_train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,test
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


In [7]:
data_val.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,test
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1


In [8]:
df_all = data_train.append(data_val)
df_all.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,test
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


In [9]:
df_all.tail(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,test
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,1
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,1
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,1
417,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C,1


In [10]:

print('data columns with null values : \n', data_train.isnull().sum())

data columns with null values : 
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
test             0
dtype: int64


In [11]:
print('data column with null values : \n', data_val.isnull().sum())

data column with null values : 
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
test             0
dtype: int64


In [12]:
print('data columns with null values :\n', df_all.isnull().sum())

data columns with null values :
 PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
test              0
dtype: int64


In [13]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
 12  test         1309 non-null   int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 143.2+ KB


In [14]:

#preprocessing
#complete missing age with median

df_all['Age'].fillna(df_all['Age'].median(), inplace = True)

#complete embarked with mode
df_all['Embarked'].fillna(df_all['Embarked'].mode()[0], inplace = True)

#complete missing fare with median
df_all['Fare'].fillna(df_all['Fare'].median(), inplace = True)

#delete the cabin feature/column and others previously stated to exclude in train dataset
drop_column = ['PassengerId', 'Cabin', 'Ticket']
df_all.drop(drop_column, axis = 1, inplace = True)

In [15]:
df_all.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,test
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,0
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0


In [16]:
#Discrete variables
df_all['FamilySize'] = df_all['SibSp'] + df_all['Parch'] + 1
df_all.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,test,FamilySize
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,2
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,0,2
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,1


In [17]:
df_all['IsAlone'] = 1 #initialize to yes/1 is alone
df_all['IsAlone'].loc[df_all['FamilySize'] > 1] = 0 # now update to no/0 if family size is greater than 1
df_all['IsAlone']

0      0
1      0
2      1
3      0
4      1
      ..
413    1
414    1
415    1
416    1
417    0
Name: IsAlone, Length: 1309, dtype: int64

In [18]:
df_all.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,test,FamilySize,IsAlone
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,2,0
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,0,2,0
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,1,1


In [19]:
#quick and dirty code split title from name: http://www.pythonforbeginners.com/dictionary/python-split
df_all['Title'] = df_all['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [20]:
df_all

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,test,FamilySize,IsAlone,Title
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,0,2,0,Mr
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,0,2,0,Mrs
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0,1,1,Miss
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,0,2,0,Mrs
4,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0,1,1,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,,3,"Spector, Mr. Woolf",male,28.0,0,0,8.0500,S,1,1,1,Mr
414,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9000,C,1,1,1,Dona
415,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,7.2500,S,1,1,1,Mr
416,,3,"Ware, Mr. Frederick",male,28.0,0,0,8.0500,S,1,1,1,Mr


In [21]:
#Continuous variable bins; qcut vs cut: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut
#Fare Bins/Buckets using qcut or frequency bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html

df_all['FareBin'] = pd.qcut(df_all['Fare'],4)

#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
df_all['AgeBin'] = pd.cut(df_all['Age'].astype(int), 5)

#cleanup rare title names
# print(df_all['Title'].value_counts())

min_occurance = 10
#this will create a true false series with title name as index
title_names = (df_all['Title'].value_counts()<stat_min)
#apply and lambda functions are quick and dirty code to find and replace with fewer lines of code: https://community.modeanalytics.com/python/tutorial/pandas-groupby-and-python-lambda-functions/
df_all['Title'] = df_all['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)

NameError: name 'stat_min' is not defined

In [None]:
df_all

In [None]:
df_all.info()

In [None]:
# code categorical data
lb = LabelEncoder()
df_all['Sex_code'] = lb.fit_transform(df_all['Sex'])
df_all['Embarked_code'] = lb.fit_transform(df_all['Embarked'])
df_all['Title_code'] = lb.fit_transform(df_all['Title'])
df_all['AgeBin_code'] = lb.fit_transform(df_all['AgeBin'])
df_all['FareBin_code'] = lb.fit_transform(df_all['FareBin'])

In [None]:
df_all.head(3)

In [None]:
Target = ['Survived']
df_all_x_cols = ['Sex_code','Pclass', 'Embarked_code', 'Title_code', 'FamilySize', 'AgeBin_code', 'FareBin_code']

In [None]:
# Only useful id you have test and train data seperate
data_train = df_all[df_all['test'] == 0]
data_test = df_all[df_all['test'] == 1]

In [None]:
data_train.head(3)

In [None]:
data_test.head(3)

In [None]:
results = model_selection.cross_validate(linear_model.LogisticRegressionCV(), data_train[df_all_x_cols], data_train[Target], cv= 5)

In [None]:
results

In [None]:
model = linear_model.LogisticRegressionCV()
model.fit(data_train[df_all_x_cols], data_train[Target])
predictions =model.predict(data_train[df_all_x_cols])

In [None]:
print("Accuracy :\n \n", classification_report(data_train[Target], predictions))

In [None]:
name = 'final_model.sav'
pickle.dump(model,open(name, 'wb'))

In [None]:
#helper 

def preprocess(data_all):
    data_all['Age'].fillna(data_all['Age'].median(), inplace = True)

    #complete embarked with mode
    data_all['Embarked'].fillna(data_all['Embarked'].mode()[0], inplace = True)

    #complete missing fare with median
    data_all['Fare'].fillna(data_all['Fare'].median(), inplace = True)

    #delete the cabin feature/column and others previously stated to exclude in train dataset
    drop_column = ['PassengerId','Cabin', 'Ticket']
    data_all.drop(drop_column, axis=1, inplace = True)

    #Discrete variables
    data_all['FamilySize'] = data_all['SibSp'] + data_all['Parch'] + 1

    data_all['IsAlone'] = 1 #initialize to yes/1 is alone
    data_all['IsAlone'].loc[data_all['FamilySize'] > 1] = 0 # now update to no/0 if family size is greater than 1

    #quick and dirty code split title from name: http://www.pythonforbeginners.com/dictionary/python-split
    data_all['Title'] = data_all['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]


    #Continuous variable bins; qcut vs cut: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut
    #Fare Bins/Buckets using qcut or frequency bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    data_all['FareBin'] = pd.qcut(data_all['Fare'], 4)

    #Age Bins/Buckets using cut or value bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
    data_all['AgeBin'] = pd.cut(data_all['Age'].astype(int), 5)

    #cleanup rare title names
    #print(data1['Title'].value_counts())
    stat_min = 10 #while small is arbitrary, we'll use the common minimum in statistics: http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/
    title_names = (data_all['Title'].value_counts() < stat_min) #this will create a true false series with title name as index

    #apply and lambda functions are quick and dirty code to find and replace with fewer lines of code: https://community.modeanalytics.com/python/tutorial/pandas-groupby-and-python-lambda-functions/
    data_all['Title'] = data_all['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)

    #code categorical data
    label = LabelEncoder()
    data_all['Sex_Code'] = label.fit_transform(data_all['Sex'])
    data_all['Embarked_Code'] = label.fit_transform(data_all['Embarked'])
    data_all['Title_Code'] = label.fit_transform(data_all['Title'])
    data_all['AgeBin_Code'] = label.fit_transform(data_all['AgeBin'])
    data_all['FareBin_Code'] = label.fit_transform(data_all['FareBin'])

    return data_all