# Libraries and data loading

We will first load all the necessary libraries and the data 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import preprocessing
import numpy as np
plt.rc("font", size=14)

In [2]:
# We now load the data
traindf = pd.read_csv('train.csv')  # load the training data
testdf = pd.read_csv('test.csv')  # load the training data

# Imputation of missing data

Let's check what is missing:

In [3]:
traindf.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
testdf.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

First we will impute the Age column with mean values. If we don't round the numbers to one decimal point there are too many decimal points which add no new knowledge to the age yet if we are working with very large data frames might pose an problem for speed. Ask Danny if this is true.

Next, let's explore the Cabin column. This is not a numeric value so using mean or median as an imputation value is meaningless. We can try a random sample from the existing cabin value or something else. 
One option is to use the most common cabin designation. Considering there are many types it might be appropriate to use Unknown. This way we are not forcing anything on the data.

In [5]:
def imputeNumerical(dfName, colName):
    missing_col = [colName]
    # Using mean to impute the missing values
    for i in missing_col:
        dfName.loc[dfName.loc[:,i].isnull(),i]=round(dfName.loc[:,i].mean(), 2)

def imputeCategorical(dfName, colName):
    missing_col = [colName]
    # Using unknown to impute the missing values
    for i in missing_col:
        dfName.loc[dfName.loc[:,i].isnull(),i]='Unknown'


In [6]:
imputeNumerical(traindf, 'Age')
imputeCategorical(traindf, 'Cabin')
imputeCategorical(traindf, 'Embarked')
imputeNumerical(testdf, 'Age')
imputeNumerical(testdf, 'Fare')
imputeCategorical(testdf, 'Cabin')

# Data engineering

We will now bunch the age variable into children, age 0-10, young adults (11-30), adults(31-50) and seniors 50+.
We will also bunch the cabin numbers into just the letter in front (ex cabin C85 becomes C, cabin B42 becomes B, etc).
Sex will be transformed to male=0, female=1

In [7]:
def dataEngineering(dfName):
    bins = pd.IntervalIndex.from_breaks([0, 10, 31, 51, np.inf], closed='left')
    dfName["AgeBin"] = pd.cut(dfName.Age.values, bins).codes

    # Transforming the cabin code to only show the letter of it (the first character in the string)
    dfName['CabinLetter'] = dfName['Cabin'].astype(str).str[0]

    # Sex, Cabin, Embarked to integer
    dfName.replace({'Sex': {'female': 0, 'male': 1}}, inplace=True)
    dfName.replace({'Embarked': {'S': 0, 'C': 1, 'Q' : 2, 'Unknown':3}}, inplace=True)
    dfName.replace({'CabinLetter': {'U':0, 'C':1, 'B':2, 'D':3, 'G':4, 'F':5, 'E':6, 'T':7, 'A':8}}, inplace=True)

In [8]:
dataEngineering(traindf)
dataEngineering(testdf)

These two functions below didn't work when implemented in the dataEngineering function so that is why they are out. Ask Danny how to put them back in


In [9]:
# We remove all the variables that cannot have any effect like Name and Ticket 
traindf = traindf.drop(columns = ['Name', 'Ticket','Cabin'])
testdf = testdf.drop(columns = ['Name', 'Ticket','Cabin'])

# We change PassengerID to be the index of the dataframe
traindf = traindf.set_index("PassengerId") 
testdf = testdf.set_index("PassengerId") 

# Data normalization 

Let's look at the unique values in each column to see which can be normalized

In [10]:
for col in testdf:
    print(col, testdf[col].unique())

Pclass [3 2 1]
Sex [1 0]
Age [34.5  47.   62.   27.   22.   14.   30.   26.   18.   21.   30.27 46.
 23.   63.   24.   35.   45.   55.    9.   48.   50.   22.5  41.   33.
 18.5  25.   39.   60.   36.   20.   28.   10.   17.   32.   13.   31.
 29.   28.5  32.5   6.   67.   49.    2.   76.   43.   16.    1.   12.
 42.   53.   26.5  40.   61.   60.5   7.   15.   54.   64.   37.   34.
 11.5   8.    0.33 38.   57.   40.5   0.92 19.   36.5   0.75  0.83 58.
  0.17 59.   14.5  44.    5.   51.    3.   38.5 ]
SibSp [0 1 2 3 4 5 8]
Parch [0 1 3 2 4 6 5 9]
Fare [  7.8292   7.       9.6875   8.6625  12.2875   9.225    7.6292  29.
   7.2292  24.15     7.8958  26.      82.2667  61.175   27.7208  12.35
   7.225    7.925   59.4      3.1708  31.6833  61.3792 262.375   14.5
  61.9792  30.5     21.6792  31.5     20.575   23.45    57.75     8.05
   9.5     56.4958  13.4167  26.55     7.85    13.      52.5542  29.7
   7.75    76.2917  15.9     60.      15.0333  23.     263.      15.5792
  29.125    7.65    

We see that Fare and Age are the only ones that should be normalized. The rest have only few unique values

In [11]:
def scaleData(df): # A function to normalize only select columns ina  dataframe
    col_names = ['Age', 'Fare']
    features = df[col_names]
    scaler = preprocessing.StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
# Ask Danny how to manke a function that has variable input. 
# Now are age and fare to be scaled but what if we need to omit fare or maybe add two more columns to be scaled?

In [12]:
scaleData(traindf)
scaleData(testdf)

# Logistic regression

We will split the data in X and y

In [13]:
# Training data
X_train = traindf.drop(columns = 'Survived')
y_train = traindf[['Survived']].copy()

import statsmodels.api as sm
logit_model=sm.Logit(y_train,X_train)
logit_model.fit().summary2()

Optimization terminated successfully.
         Current function value: 0.447156
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.329
Dependent Variable:,Survived,AIC:,814.8312
Date:,2021-10-14 10:27,BIC:,857.9623
No. Observations:,891,Log-Likelihood:,-398.42
Df Model:,8,LL-Null:,-593.33
Df Residuals:,882,LLR p-value:,2.8111e-79
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Pclass,-0.4820,0.1235,-3.9020,0.0001,-0.7242,-0.2399
Sex,-2.5310,0.1925,-13.1473,0.0000,-2.9084,-2.1537
Age,-1.3767,0.1883,-7.3110,0.0000,-1.7458,-1.0076
SibSp,-0.2866,0.1061,-2.7001,0.0069,-0.4946,-0.0786
Parch,-0.1140,0.1168,-0.9759,0.3291,-0.3430,0.1150
Fare,0.2844,0.1359,2.0937,0.0363,0.0182,0.5507
Embarked,0.4178,0.1391,3.0035,0.0027,0.1451,0.6904
AgeBin,1.4405,0.2150,6.7013,0.0000,1.0192,1.8618
CabinLetter,0.2355,0.0529,4.4476,0.0000,0.1317,0.3392


In [14]:
# Testing data
X_test = testdf
if 'Parch' in X_train.columns:
    X_train = X_train.drop(columns='Parch') # Remove this as it is detrimental to the results

In [15]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver = 'saga', penalty = 'l1', max_iter = 10000, multi_class = 'multinomial', verbose = 10)
print(logreg)
results = logreg.fit(X_train, y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial', penalty='l1',
                   solver='saga', verbose=10)
convergence after 65 epochs took 0 seconds


  return f(*args, **kwargs)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [16]:
if 'Parch' in X_test.columns:
    X_test = X_test.drop(columns='Parch') # Remove the Parch column as the P-value is very high

In [17]:
y_test = logreg.predict(X_test)

In [18]:
X_testLogReg = X_test.copy()
X_testLogReg['Survived'] = y_test
subLogReg = X_testLogReg
subLogReg = subLogReg[[ 'Survived']].copy() #Exract only the survived column as per the instructions
subLogReg.to_csv('submissionLogReg.csv') # create a csv file

# Random forest

We will now try a different algorith, random forest, to see if we can improve the results as well as to check which features have insignificant impact on the predictive power and whether they are the same to the ones we found previously

In [19]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_testRandFor=clf.predict(X_test)

  clf.fit(X_train,y_train)


In [20]:
X_testRandFor = X_test.copy()
X_testRandFor['Survived'] = y_testRandFor
subRandFor = X_testRandFor
subRandFor = subRandFor[[ 'Survived']].copy() #Exract only the survived column as per the instructions
subRandFor.to_csv('submissionRandFor.csv') # create a csv file