### Importing relevant libraries

In [102]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Loading and preparing the data for regresion

In [103]:
def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

def divide_df(all_data):
    # Returns divided dfs of training and test set
    return all_data.loc[:890], all_data.loc[891:].drop(['Survived'], axis=1)

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
data = concat_df(df_train, df_test)

df_train.name = 'Training Set'
df_test.name = 'Test Set'
data.name = 'All Set' 

dfs = [df_train, df_test]

## Exploratory Data Analysis

In [104]:
print(df_train.info())
df_train.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
137,138,0,1,"Futrelle, Mr. Jacques Heath",male,37.0,1,0,113803,53.1,C123,S
706,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S
768,769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q


In [105]:
print(df_test.info())
df_test.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
56,948,3,"Cor, Mr. Bartol",male,35.0,0,0,349230,7.8958,,S
336,1228,2,"de Brito, Mr. Jose Joaquim",male,32.0,0,0,244360,13.0,,S
376,1268,3,"Kink, Miss. Maria",female,22.0,2,0,315152,8.6625,,S


In [106]:
def display_missing(df):    
    for col in df.columns.tolist():          
        print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
    print('\n')
    
for df in dfs:
    print('{}'.format(df.name))
    display_missing(df)

Training Set
PassengerId column missing values: 0
Survived column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 177
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 0
Cabin column missing values: 687
Embarked column missing values: 2


Test Set
PassengerId column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 86
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 1
Cabin column missing values: 327
Embarked column missing values: 0




## Filling missing values

In [107]:
data = data.drop(['Embarked', 'Ticket'], axis = 1)

#### Age

In [108]:
df_all_corr = data.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
df_all_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
df_all_corr[df_all_corr['Feature 1'] == 'Age']

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
6,Age,Age,1.0
9,Age,Pclass,0.408106
17,Age,SibSp,0.243699
22,Age,Fare,0.17874
25,Age,Parch,0.150917
29,Age,Survived,0.077221
41,Age,PassengerId,0.028814


In [109]:
age_by_pclass_sex = data.groupby(['Sex', 'Pclass']).median()['Age']

for pclass in range(1, 4):
    for sex in ['female', 'male']:
        print('Median age of Pclass {} {}s: {}'.format(pclass, sex, age_by_pclass_sex[sex][pclass]))
print('Median age of all passengers: {}'.format(data['Age'].median()))

# Filling the missing values in Age with the medians of Sex and Pclass groups
data['Age'] = data.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

Median age of Pclass 1 females: 36.0
Median age of Pclass 1 males: 42.0
Median age of Pclass 2 females: 28.0
Median age of Pclass 2 males: 29.5
Median age of Pclass 3 females: 22.0
Median age of Pclass 3 males: 25.0
Median age of all passengers: 28.0


## Feature engineering

#### Family size

In [110]:
data['Family_size'] = data['SibSp'] + data['Parch'] + 1

#### Title

In [111]:
import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    print (big_string)
    return np.nan


title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

In [112]:
data['Title']=data['Name'].map(lambda x: substrings_in_string(x, title_list))

In [113]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
data['Title']=data.apply(replace_titles, axis=1)

data['Title'] = data['Title'].map({'Mr':0, 'Mrs':1, 'Miss':2, 'Master':3})

#### Sex

In [114]:
# Changing sex values
data['Sex']=data['Sex'].map({'male':0, 'female':1})

#### Cabin

In [115]:
data['Cabin']= data['Cabin'].astype(np.str)
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
data['Deck']=data['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [116]:
data['Deck'].unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [117]:
data['Deck'] = data['Deck'].map({'nan':0, 'C':1, 'E':2, 'G':3, 'D':4, 'A':5, 'B':6, 'F':7, 'T':8})
data['Deck'] = data['Deck'].fillna(9)

#### Fare

In [118]:
data[data['Fare'].isnull()]

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Family_size,Title,Deck
1043,60.5,,,"Storey, Mr. Thomas",0,1044,3,0,0,,1,0,9.0


In [119]:
med_fare = data.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
# Filling the missing value in Fare with the median Fare of 3rd class alone passenger
data['Fare'] = data['Fare'].fillna(med_fare)

In [120]:
data = data.drop(['Cabin'], axis = 1)

In [121]:
df_train, df_test = divide_df(data)
dfs = [df_train, df_test]

## Declaring the dependent and the independent variables

In [124]:
y = df_train['Survived']
x1 = df_train.drop(['PassengerId','Name', 'Survived', 'SibSp', 'Parch'], axis=1)

In [125]:
x1

Unnamed: 0,Age,Fare,Pclass,Sex,Family_size,Title,Deck
0,22.0,7.2500,3,0,2,0,9.0
1,38.0,71.2833,1,1,2,1,1.0
2,26.0,7.9250,3,1,1,2,9.0
3,35.0,53.1000,1,1,2,1,1.0
4,35.0,8.0500,3,0,1,0,9.0
5,25.0,8.4583,3,0,1,0,9.0
6,54.0,51.8625,1,0,1,0,2.0
7,2.0,21.0750,3,0,5,3,9.0
8,27.0,11.1333,3,1,3,1,9.0
9,14.0,30.0708,2,1,2,1,9.0


# Regression

In [126]:
#est = sm.OLS(y, X.astype(float)).fit()

x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.424323
         Iterations 6


0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,Logit,Df Residuals:,883.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.3628
Time:,00:31:15,Log-Likelihood:,-378.07
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,6.781e-89

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.1964,0.588,3.735,0.000,1.044,3.349
Age,-0.0233,0.009,-2.604,0.009,-0.041,-0.006
Fare,0.0029,0.002,1.231,0.218,-0.002,0.008
Pclass,-0.9968,0.172,-5.794,0.000,-1.334,-0.660
Sex,1.8972,0.246,7.705,0.000,1.415,2.380
Family_size,-0.3457,0.072,-4.809,0.000,-0.487,-0.205
Title,0.7767,0.149,5.213,0.000,0.485,1.069
Deck,-0.0665,0.046,-1.456,0.145,-0.156,0.023


### Accuracy

In [127]:
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0','Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1:'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,488.0,61.0
Actual 1,103.0,239.0


In [128]:
cm = np.array(cm_df)
accuracy_train = (cm[0,0]+cm[1,1])/cm.sum()
accuracy_train

0.8159371492704826

## Testing the model

In [129]:
x2 = df_test.drop(['PassengerId','Name','SibSp', 'Parch'], axis=1)
X_test = sm.add_constant(x2)

In [130]:
predictions = results_log.predict(X_test)

In [131]:
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output['Survived']= output['Survived'].round(decimals = 0)
output['Survived']= output['Survived'].astype(np.int64)

In [133]:
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
