# Libraries and data loading

We will first load all the necessary libraries and the data 

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import preprocessing
import numpy as np
plt.rc("font", size=14)

In [2]:
# We now load the data
traindf = pd.read_csv('train.csv')  # load the training data
testdf = pd.read_csv('test.csv')  # load the training data

# Imputation of missing data

Let's check what is missing:

In [3]:
traindf.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
testdf.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

First we will impute the Age column with mean values. If we don't round the numbers to one decimal point there are too many decimal points which add no new knowledge to the age yet if we are working with very large data frames might pose an problem for speed. Ask Danny if this is true.

Next, let's explore the Cabin column. This is not a numeric value so using mean or median as an imutation value is meaningless. We can try a random sample from the existing cabin value or something else. 
One option is to use the most common cabin designation. Considering there are many types it might be appropriate to use Unknown. This way we are not forcing anything on the data.

In [5]:
def imputeNumerical(dfName, colName):
    missing_col = [colName]
    #Technique 1: Using mean to impute the missing values
    for i in missing_col:
        dfName.loc[dfName.loc[:,i].isnull(),i]=round(dfName.loc[:,i].mean(), 2)

def imputeCategorical(dfName, colName):
    missing_col = [colName]
    #Technique 1: Using mean to impute the missing values
    for i in missing_col:
        dfName.loc[dfName.loc[:,i].isnull(),i]='Unknown'


In [6]:
imputeNumerical(traindf, 'Age')
imputeCategorical(traindf, 'Cabin')
imputeCategorical(traindf, 'Embarked')
imputeNumerical(testdf, 'Age')
imputeNumerical(testdf, 'Fare')
imputeCategorical(testdf, 'Cabin')

# Data engineering

We will now bunch the age variable into children, age 0-10, young adults (11-30), adults(31-50) and seniors 50+.
We will also bunch the cabin numbers into just the letter in front (ex cabin C85 becomes C, cabin B42 becomes B, etc).
Sex will be transformed to male=0, female=1

In [7]:
def dataEngineering(dfName):
    bins = pd.IntervalIndex.from_breaks([0, 10, 31, 51, np.inf], closed='left')
    dfName["AgeBin"] = pd.cut(dfName.Age.values, bins).codes

    # Transforming the cabin code to only show the letter of it (the first character in the string)
    dfName['CabinLetter'] = dfName['Cabin'].astype(str).str[0]

    # Sex, Cabin, Embarked to integer
    dfName.replace({'Sex': {'female': 0, 'male': 1}}, inplace=True)
    dfName.replace({'Embarked': {'S': 0, 'C': 1, 'Q' : 2, 'Unknown':3}}, inplace=True)
    dfName.replace({'CabinLetter': {'U':0, 'C':1, 'B':2, 'D':3, 'G':4, 'F':5, 'E':6, 'T':7, 'A':8}}, inplace=True)

In [8]:
dataEngineering(traindf)
dataEngineering(testdf)

These two functions below didn't work when implemented in dataEngineering so that is why they are out. Ask Danny how to put them back in


In [9]:
# We remove all the variables that cannot have any effect like Name and Ticket 
traindf = traindf.drop(columns = ['Name', 'Ticket','Cabin'])
testdf = testdf.drop(columns = ['Name', 'Ticket','Cabin'])

# We change PassengerID to be the index of the dataframe
traindf = traindf.set_index("PassengerId") 
testdf = testdf.set_index("PassengerId") 

# Data normalization 

Let's look at the unique values in each column to see which can be normalized

In [18]:
for col in testdf:
    print(col, traindf[col].unique())
traindf.Fare.max()

Pclass [3 1 2]
Sex [1 0]
Age [22.   38.   26.   35.   29.7  54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]
SibSp [1 0 3 4 2 5 8]
Parch [0 1 2 5 3 4 6]
Fare [  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55    31.275    7.8542  16.      29.125
  13.      18.       7.225   26.       8.0292  35.5     31.3875 263.
   7.8792   7.8958  27.7208 146.5208   7.75    10.5     82.1708  52.
   7.2292  11.2417   9.475   21.      41.5792  15.5     21.6792  17.8
  39.6875   7.8     76.7292  61.9792  27.

512.3292

In [None]:

X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
scaler = preprocessing.StandardScaler().fit(X_train)
scaler


scaler.mean_


scaler.scale_


X_scaled = scaler.transform(X_train)
X_scaled


# Logistic regression

For simplicity we will split the data in X and y

In [10]:
# Training data
X_train = traindf.drop(columns = 'Survived')
y_train = traindf[['Survived']].copy()

# Testing data
X_test = testdf


In [11]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [12]:
import statsmodels.api as sm
logit_model=sm.Logit(y_train,X_train)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.472547
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.290     
Dependent Variable: Survived         AIC:              860.0780  
Date:               2021-10-05 14:29 BIC:              903.2091  
No. Observations:   891              Log-Likelihood:   -421.04   
Df Model:           8                LL-Null:          -593.33   
Df Residuals:       882              LLR p-value:      1.3008e-69
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
------------------------------------------------------------------
              Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
------------------------------------------------------------------
Pclass        0.1801    0.0792    2.2735  0.0230   0.0248   0.3354
Sex          -2.3509    0.1848  -12.7198  0.0000  -2.7131  -1.

In [13]:
y_test = logreg.predict(X_test)

In [14]:
X_test['Survived'] = y_test

sub = sub[[ 'Survived']].copy() #Exract only the survived column as per the instructions
sub.to_csv('submission.csv') # create a csv file

NameError: name 'sub' is not defined