In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

%matplotlib inline

### Data Cleaning and Feature Engineering

In [2]:
# Point at the data directory and read the training CSV from it
file_path = "../data"
train_full = pd.read_csv(file_path + '/train.csv')
print(train_full.shape)
train_full.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Drop unnecessary columns (they are not used as features below)
train_full = train_full.drop(columns=['Name', 'Ticket', 'PassengerId'])

In [4]:
# Report the fraction of missing values for each variable
for var in train_full.columns:
    total_na = train_full[var].isnull().sum()
    print(f'{var}: {total_na/len(train_full)}')

# Drop Cabin as it has too many missing values
train_full = train_full.drop('Cabin', axis=1)

# Drop rows with a missing Embarked (it is not imputed) or a missing target.
# dropna leaves the frame unchanged for a column without missing values,
# so no explicit "any missing?" check is needed for Survived.
train_full = train_full.dropna(subset=['Embarked', 'Survived'])

Survived: 0.0
Pclass: 0.0
Sex: 0.0
Age: 0.19865319865319866
SibSp: 0.0
Parch: 0.0
Fare: 0.0
Cabin: 0.7710437710437711
Embarked: 0.002244668911335578


In [5]:
# Separate the target column (y) from the predictor columns (X)
y = train_full['Survived']
X = train_full.drop(columns=['Survived'])

In [6]:
# Partition the feature names by dtype: object columns vs. int64/float64 columns
object_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']
numerical_cols = [name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]

In [7]:
# Show how many distinct values each column in object_cols takes
for col, n_levels in X[object_cols].nunique().items():
    print(f'{col}: {n_levels}')

Sex: 2
Embarked: 3


In [8]:
# Hold out a validation set from X and y; the split is seeded so it is repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=100,
)

In [9]:
# For categorical variables in train and validation, convert to dummies.
# The two frames are encoded independently, so a category that is absent from
# one split would yield a different set of columns. Align the validation
# columns to the training columns: dummies missing from validation become
# all-zero columns, and dummies unseen in training are dropped.
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid).reindex(columns=X_train.columns, fill_value=0)

In [10]:
# Fill missing Age values with the mean Age learned from the training set,
# then apply that same fitted imputer to both the training and validation sets
imp = SimpleImputer(strategy='mean')
imp.fit(X_train[['Age']])
X_train['Age'] = imp.transform(X_train[['Age']])
X_valid['Age'] = imp.transform(X_valid[['Age']])

In [12]:
# Standardize the numerical variables in train and validation.
# The scaler is fitted on the training set only and then applied to both sets.
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])

### Model Building (No Parameter Tuning)

In [13]:
# Model 1 - Logistic Regression with scikit-learn defaults
# (the default penalty is L2 with C=1.0, so this model is not unregularized)
glm = LogisticRegression()
glm_model = glm.fit(X_train, y_train)  # fit returns the estimator itself
glm_preds = glm_model.predict(X_valid)
print(f'Score from simple logistic regression: {accuracy_score(y_valid, glm_preds)}')

Score from simple logistic regression: 0.7802690582959642


In [14]:
# Model 2 - RandomForest Classifier (default hyperparameters)
# The forest is seeded so the validation score is reproducible; without a
# random_state every run builds a different forest and reports a different score.
rf = RandomForestClassifier(random_state=100)
rf_model = rf.fit(X_train,y_train)
rf_preds = rf.predict(X_valid)
print(f'Score from simple random forest classifier: {accuracy_score(y_valid, rf_preds)}')

Score from simple random forest classifier: 0.7982062780269058


In [15]:
# Model 3 - Support Vector Machine (default hyperparameters)
svc = SVC()
svc_model = svc.fit(X_train, y_train)  # fit returns the estimator itself
svc_preds = svc_model.predict(X_valid)
print(f'Score from simple SVM classifier: {accuracy_score(y_valid, svc_preds)}')

Score from simple SVM classifier: 0.8295964125560538


In [16]:
# Model 4 - Linear Discriminant Analysis (SVD solver)
lda = LinearDiscriminantAnalysis(solver='svd')
lda_model = lda.fit(X_train, y_train)  # fit returns the estimator itself
lda_preds = lda_model.predict(X_valid)
print(f'Score from simple linear discriminant analysis: {accuracy_score(y_valid, lda_preds)}')

Score from simple linear discriminant analysis: 0.8071748878923767


### Model Building (Parameter Tuning)

In [18]:
# Model 5 - L2-penalized ("ridge") logistic regression, tuned by cross-validated grid search
ridge_params = {
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'solver': ['lbfgs', 'liblinear'],
}
glm = LogisticRegression()
glm_ridge = GridSearchCV(glm, ridge_params).fit(X_train, y_train)

print(f'Best parameters: {glm_ridge.best_params_}')
# predict on the fitted search uses the best parameter combination found
glm_ridge_preds = glm_ridge.predict(X_valid)
print(f'Score from optimized logistic regression (ridge): {accuracy_score(y_valid, glm_ridge_preds)}')

Best parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Score from optimized logistic regression (ridge): 0.7892376681614349


In [19]:
# Model 6 - L1-penalized ("lasso") logistic regression, tuned by cross-validated grid search
lasso_params = {
    'penalty': ['l1'],
    'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'solver': ['liblinear'],
}
glm = LogisticRegression()
glm_lasso = GridSearchCV(glm, lasso_params).fit(X_train, y_train)

print(f'Best parameters: {glm_lasso.best_params_}')
# predict on the fitted search uses the best parameter combination found
glm_lasso_preds = glm_lasso.predict(X_valid)
print(f'Score from optimized logistic regression (lasso): {accuracy_score(y_valid, glm_lasso_preds)}')

Best parameters: {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
Score from optimized logistic regression (lasso): 0.7802690582959642


In [21]:
# Model 7 - RandomForestClassifier with cross-validation
# Grid: 10 values of n_estimators (200..2000) x 11 values of max_depth (10..110)
rf_params = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
             'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)] }

rf = RandomForestClassifier(random_state=100)

# n_jobs=-1 evaluates the grid candidates in parallel on all cores; the forest
# is seeded, so this only affects run time, not the selected parameters.
rf_cv = GridSearchCV(rf, rf_params, n_jobs=-1)
rf_cv.fit(X_train,y_train)

print(f'Best parameters: {rf_cv.best_params_}') 
rf_cv_preds = rf_cv.predict(X_valid)  # predict uses the best parameter combination found
# Compute the validation accuracy once and reuse it in the report
score = accuracy_score(y_valid, rf_cv_preds)
print(f'Score from optimized random forest: {score}')

Best parameters: {'max_depth': 10, 'n_estimators': 600}
Score from optimized random forest: 0.8161434977578476


In [20]:
# Model 8 - Support Vector Machine with cross-validation
# Grid: RBF kernel over 6 values of C x 5 values of gamma
svc_params = {'kernel': ['rbf'], 'C': [0.001,0.01,0.1,1.0,10.0,100.0], 'gamma': [1,0.1,0.01,0.001,0.0001]}
svc = SVC(random_state = 1000)

# n_jobs=-1 evaluates the grid candidates in parallel on all cores;
# it only affects run time, not which parameters are selected.
svc_cv = GridSearchCV(svc, svc_params, n_jobs=-1)
svc_cv.fit(X_train,y_train)

print(f'Best parameters: {svc_cv.best_params_}') 
svc_cv_preds = svc_cv.predict(X_valid)  # predict uses the best parameter combination found
# Compute the validation accuracy once and reuse it in the report
score = accuracy_score(y_valid, svc_cv_preds)
print(f'Score from optimized SVM: {score}')

Best parameters: {'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}
Score from optimized SVM: 0.8385650224215246
