In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

In [12]:
train_csv = pd.read_csv('train.csv')
test_csv = pd.read_csv('test.csv')

y = train_csv['Survived']
train_csv_features = train_csv.drop(columns=['Survived'])

X_train, X_cross, y_train, y_cross = train_test_split(train_csv_features, y, test_size=.15, random_state=42) 

# EDA

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 757 entries, 599 to 102
Data columns (total 11 columns):
PassengerId    757 non-null int64
Pclass         757 non-null int64
Name           757 non-null object
Sex            757 non-null object
Age            604 non-null float64
SibSp          757 non-null int64
Parch          757 non-null int64
Ticket         757 non-null object
Fare           757 non-null float64
Cabin          166 non-null object
Embarked       755 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 71.0+ KB


In [14]:
X_train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,757.0,757.0,604.0,757.0,757.0,757.0
mean,446.850727,2.331572,29.549404,0.540291,0.380449,32.188391
std,257.9768,0.822771,14.472253,1.147819,0.811973,50.913062
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.0,2.0,21.0,0.0,0.0,7.925
50%,453.0,3.0,28.0,0.0,0.0,14.4542
75%,668.0,3.0,38.0,1.0,0.0,30.5
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [85]:
def fillInAgeNullls(df):
    from sklearn.preprocessing import Imputer
    imp_mean = Imputer(missing_values=np.nan, strategy='mean')
    imp_mean.fit(df['Age'].values.reshape(-1,1))
    return imp_mean
ageFill = fillInAgeNullls(X_train)
ageFill

Imputer(axis=0, copy=True, missing_values=nan, strategy='mean', verbose=0)

In [157]:
def cleandf(df):
    df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    df['Age'] = ageFill.transform(df['Age'].values.reshape(-1,1))
    norm_age = df['Age'] / np.sqrt(np.sum(df['Age']**2))
    norm_SibSp = df['SibSp'] / np.sqrt(np.sum(df['SibSp']**2))
    norm_Parch = df['Parch'] / np.sqrt(np.sum(df['Parch']**2))
    norm_Fare = df['Fare'] / np.sqrt(np.sum(df['Fare']**2))
    df['Age'] = norm_age
    df['SibSp'] = norm_SibSp
    df['Parch'] = norm_Parch
    df['Fare'] = norm_Fare
    classes = pd.get_dummies(df['Pclass'])
    df = pd.get_dummies(df)
    df = df.drop(columns=['Pclass'])
    df = pd.merge(df, classes, left_index=True, right_index=True)
    return df

In [159]:
X_train_clean = cleandf(X_train)

In [177]:
train_corr = pd.merge(X_train_clean, pd.DataFrame(y_train), left_index=True, right_index=True)

In [178]:
train_corr.corr()['Survived']

Age          -0.059088
SibSp        -0.036872
Parch         0.081230
Fare          0.254007
Sex_female    0.541528
Sex_male     -0.541528
Embarked_C    0.160526
Embarked_Q   -0.005421
Embarked_S   -0.143335
1             0.273204
2             0.104933
3            -0.317234
Survived      1.000000
Name: Survived, dtype: float64

# Logistic regression

In [179]:
from sklearn.linear_model import LogisticRegression

In [180]:
clf = LogisticRegression(random_state=0).fit(X_train_clean, y_train)

In [185]:
clf.score(X_train_clean, y_train)

0.7820343461030383

In [186]:
X_cross_clean = cleandf(X_cross)

In [187]:
clf.score(X_cross_clean, y_cross)

0.7985074626865671

In [268]:
#Recreate cleandf
def cleandf(df):
    df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    df['Age'] = ageFill.transform(df['Age'].values.reshape(-1,1))
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(columns=['SibSp', 'Parch'])
    norm_age = df['Age'] / np.sqrt(np.sum(df['Age']**2))
    norm_Fare = df['Fare'] / np.sqrt(np.sum(df['Fare']**2))
    norm_Family = df['Family'] / np.sqrt(np.sum(df['Family']**2))  
    df['Age'] = norm_age
    df['Fare'] = norm_Fare
    df['Family'] = norm_Family
    classes = pd.get_dummies(df['Pclass'])
    df = pd.get_dummies(df)
    df = df.drop(columns=['Pclass','Sex_male'])
    
    df = pd.merge(df, classes, left_index=True, right_index=True)
    df.rename(columns={list(df)[8]:'c1'}, inplace=True)
    df.rename(columns={list(df)[9]:'c2'}, inplace=True)
    df.rename(columns={list(df)[10]:'c3'}, inplace=True)
    return df

In [269]:
X_train_clean = cleandf(X_train)

In [270]:
clf = LogisticRegression(random_state=0).fit(X_train_clean, y_train)

In [271]:
clf.score(X_train_clean, y_train)

0.797886393659181

In [272]:
X_cross_clean = cleandf(X_cross)
clf.score(X_cross_clean, y_cross)

0.7761194029850746

# Not Normalized

In [309]:
def cleandf_unNorm(df):
    df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    df['Age'] = ageFill.transform(df['Age'].values.reshape(-1,1))
    classes = pd.get_dummies(df['Pclass'])
    df = pd.get_dummies(df)
    df = df.drop(columns=['Pclass'])
    df = df.drop(columns=['Sex_male'])
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(columns=['SibSp', 'Parch'])
    df = pd.merge(df, classes, left_index=True, right_index=True)
    df.rename(columns={list(df)[8]:'c1'}, inplace=True)
    df.rename(columns={list(df)[9]:'c2'}, inplace=True)
    df.rename(columns={list(df)[10]:'c3'}, inplace=True)
    return df

In [310]:
X_not_norm = cleandf_unNorm(X_train)

In [311]:
clf = LogisticRegression(random_state=0).fit(X_not_norm, y_train)

In [312]:
clf.score(X_not_norm, y_train)

0.809775429326288

In [313]:
X_cross_not = cleandf_unNorm(X_cross)
clf.score(X_cross_not, y_cross)

0.8059701492537313

# The Test Data

In [329]:
test = cleandf(test_csv)
test.isnull().sum()

Age           0
SibSp         0
Parch         0
Fare          1
Family        0
Sex_female    0
Embarked_C    0
Embarked_Q    0
c1            0
c2            0
c3            0
3             0
dtype: int64

In [330]:
#Find the class for the null
test[pd.isnull(test).any(axis=1)]
#get the average for that class
avg_test_Fare = np.mean(test.loc[test['c3'] == 1, 'Fare'])
test['Fare'] = test['Fare'].fillna(value=avg_test_Fare)
test.isnull().sum()

Age           0
SibSp         0
Parch         0
Fare          0
Family        0
Sex_female    0
Embarked_C    0
Embarked_Q    0
c1            0
c2            0
c3            0
3             0
dtype: int64

In [331]:
predictions = clf.predict(test)

# CSV Creation

In [326]:
def create_output(predict):
    output = pd.DataFrame({'PassengerId': test_csv.PassengerId, 'Survived': predict})
    output.to_csv('my_submission.csv', index=False)

# SGDC

In [317]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state = 0)
sgd_clf.fit(X_not_norm, y_train)
sgd_clf.score(X_cross_not, y_cross)

0.4701492537313433

In [318]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_clean, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [319]:
knn_clf.score(X_train_clean, y_train)

0.857331571994716

In [320]:
knn_clf.score(X_cross_clean, y_cross)

0.7985074626865671

In [332]:
knn_predictions = knn_clf.predict(test)

In [333]:
create_output(knn_predictions)

# Grid Search KNN

In [342]:
from sklearn.model_selection import GridSearchCV
grid_params = {
    'n_neighbors': [13, 15, 21, 31],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose=1,
    cv=3 
    )

gs_results = gs.fit(X_train_clean, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    0.5s finished


In [343]:
print(gs_results.best_params_)
gs_results.best_score_

{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}


0.7965653896961691