In [125]:
#importing numpy and pandas

import numpy as np
import pandas as pd


# Importing training and test data

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


# Examining training data to see what transformations need to be made

print(len(train_data))
print(train_data.head())

891
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S 

In [126]:
# Checking for null values

print('Train')
print(train_data.isnull().sum())
print('\nTest')
print(test_data.isnull().sum())


# Null values in Age, Cabin and Embarked

# Replacing null values in "Age" with mean age

mean_age = train_data.Age.mean()
test_mean_age = test_data.Age.mean()

train_data.Age.fillna(mean_age, inplace=True)
test_data.Age.fillna(test_mean_age, inplace=True)

# Removing records from train with Null values in "Embarked"

train_data.dropna(subset=['Embarked'], inplace=True)

# Identifying test record with missing fare

print(test_data[test_data['Fare'].isnull()])

# Passenger is in 3rd Class and embarked at Southampton; finding mean fare for 3rd class Southampton passengers

third_class_mean = test_data[(test_data['Pclass'] == 3) & (test_data['Embarked'] == 'S')].Fare.mean()

test_data.Fare.fillna(third_class_mean, inplace=True)

#Final check for null values (Ignoring "Cabin" null values as there are too many to drop and nothing clear to fill with)

print('Train')
print(train_data.isnull().sum())

print('\nTest')
print(test_data.isnull().sum())


Train
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Test
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
     PassengerId  Pclass                Name   Sex   Age  SibSp  Parch Ticket  \
152         1044       3  Storey, Mr. Thomas  male  60.5      0      0   3701   

     Fare Cabin Embarked  
152   NaN   NaN        S  
Train
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

Test
PassengerId      0
Pclass           0
Name             

In [127]:
#"Sex" and "Embarked" contain non-numerical values
# Giving numerical representations of "Sex"

train_data['Sex'] = train_data['Sex'].apply(lambda x: 0 if x == "male" else 1)


# Coding dummy variables for numerical representations of point of Embarkation

train_data['qvs'] = train_data['Embarked'].apply(lambda x: 1 if x == 'S' else 0)
train_data['qvc'] = train_data['Embarked'].apply(lambda x: 1 if x == 'C' else 0)


# Coding dummy variables for Passenger Class

train_data['p2vp1'] = train_data['Pclass'].apply(lambda x: 1 if x == 1 else 0)
train_data['p2vp3'] = train_data['Pclass'].apply(lambda x: 1 if x == 3 else 0)

print(train_data.head(10))
        

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex        Age  SibSp  \
0                            Braund, Mr. Owen Harris    0  22.000000      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.000000      1   
2                             Heikkinen, Miss. Laina    1  26.000000      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.000000      1   
4                           Allen, Mr. William Henry    0  35.000000      0   
5                                   Moran, Mr. James    0  29.699118      0   
6                            McCarthy, Mr. Timothy

In [139]:

#Separating features from target

cleaned_data = train_data[['p2vp1', 'p2vp3', 'Sex', 'Age',\
                           'SibSp', 'Parch', 'Fare', 'qvs', 'qvc']]
target_data = train_data[['Survived']]

print(cleaned_data.head())

print(target_data.head())


   p2vp1  p2vp3  Sex   Age  SibSp  Parch     Fare  qvs  qvc
0      0      1    0  22.0      1      0   7.2500    1    0
1      1      0    1  38.0      1      0  71.2833    0    1
2      0      1    1  26.0      0      0   7.9250    1    0
3      1      0    1  35.0      1      0  53.1000    1    0
4      0      1    0  35.0      0      0   8.0500    1    0
   Survived
0         0
1         1
2         1
3         1
4         0


In [147]:
# First pass: multiple linear regression without scaling using Scikit-Learn

from sklearn.linear_model import LinearRegression

unscaled_lr_model = LinearRegression()

unscaled_lr_model.fit(cleaned_data, target_data)

# checking coeffs and R^2 coefficient
print(unscaled_lr_model.coef_)
print(unscaled_lr_model.score(cleaned_data, target_data))

#Second pass: MLR with unit scaling

from sklearn.preprocessing import MinMaxScaler

unit_scaler = MinMaxScaler()

unit_scaled_data = pd.DataFrame(unit_scaler.fit_transform(cleaned_data))

# producing LR model with unit scaled data
unit_scaled_lr_model = LinearRegression()

unit_scaled_lr_model.fit(unit_scaled_data, unit_scaled_target)

#checking coeffs and R^2

print(unit_scaled_lr_model.coef_)
print(unit_scaled_lr_model.score(unit_scaled_data, unit_scaled_target))

# Third pass MLR with standardisation

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_data = pd.DataFrame(scaler.fit_transform(cleaned_data))

# producing LR model with unit scaled data
scaled_lr_model = LinearRegression()

scaled_lr_model.fit(unit_scaled_data, unit_scaled_target)

#checking coeffs and R^2

print(scaled_lr_model.coef_)
print(scaled_lr_model.score(unit_scaled_data, unit_scaled_target))


[[ 1.43898357e-01 -1.90912615e-01  5.02151261e-01 -5.84648223e-03
  -4.10023787e-02 -1.63613036e-02  3.47218054e-04 -6.74713876e-02
   1.68624215e-03]]
0.39662538746333265
[[ 0.14389836 -0.19091261  0.50215126 -0.46526306 -0.32801903 -0.09816782
   0.17788995 -0.06747139  0.00168624]]
0.39662538746333265
[[ 0.14389836 -0.19091261  0.50215126 -0.46526306 -0.32801903 -0.09816782
   0.17788995 -0.06747139  0.00168624]]
0.39662538746333265
