In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_csv = pd.read_csv('train.csv')
test_csv = pd.read_csv('test.csv')

y = train_csv['Survived']
train_csv_features = train_csv.drop(columns=['Survived'])

X_train, X_cross, y_train, y_cross = train_test_split(train_csv_features, y, test_size=.15, random_state=42) 

# EDA

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 757 entries, 599 to 102
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  757 non-null    int64  
 1   Pclass       757 non-null    int64  
 2   Name         757 non-null    object 
 3   Sex          757 non-null    object 
 4   Age          604 non-null    float64
 5   SibSp        757 non-null    int64  
 6   Parch        757 non-null    int64  
 7   Ticket       757 non-null    object 
 8   Fare         757 non-null    float64
 9   Cabin        166 non-null    object 
 10  Embarked     755 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 71.0+ KB


In [72]:
X_train['Embarked'].describe()

count     755
unique      3
top         S
freq      554
Name: Embarked, dtype: object

In [3]:
most_common_embarked='S'

In [4]:
X_train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,757.0,757.0,604.0,757.0,757.0,757.0
mean,446.850727,2.331572,29.549404,0.540291,0.380449,32.188391
std,257.9768,0.822771,14.472253,1.147819,0.811973,50.913062
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.0,2.0,21.0,0.0,0.0,7.925
50%,453.0,3.0,28.0,0.0,0.0,14.4542
75%,668.0,3.0,38.0,1.0,0.0,30.5
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
def fillInAgeNullls(df):
    from sklearn.impute import SimpleImputer
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mean.fit(df['Age'].values.reshape(-1,1))
    return imp_mean
ageFill = fillInAgeNullls(X_train)
ageFill

SimpleImputer()

In [75]:
# def cleandf(df):
#     df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
#     df['Age'] = ageFill.transform(df['Age'].values.reshape(-1,1))
#     df['Embarked'] = df['Embarked'].fillna(most_common_embarked)
#     norm_age = df['Age'] / np.sqrt(np.sum(df['Age']**2))
#     norm_SibSp = df['SibSp'] / np.sqrt(np.sum(df['SibSp']**2))
#     norm_Parch = df['Parch'] / np.sqrt(np.sum(df['Parch']**2))
#     norm_Fare = df['Fare'] / np.sqrt(np.sum(df['Fare']**2))
#     df['Age'] = norm_age
#     df['SibSp'] = norm_SibSp
#     df['Parch'] = norm_Parch
#     df['Fare'] = norm_Fare
#     classes = pd.get_dummies(df['Pclass'])
#     df = pd.get_dummies(df)
#     df = df.drop(columns=['Pclass'])
#     df = pd.merge(df, classes, left_index=True, right_index=True)
#     return df

In [7]:
X_train_clean = cleandf(X_train)

In [8]:
train_corr = pd.merge(X_train_clean, pd.DataFrame(y_train), left_index=True, right_index=True)

In [9]:
train_corr.corr()['Survived']

Age          -0.059088
SibSp        -0.036872
Parch         0.081230
Fare          0.254007
Family        0.014237
Sex_female    0.541528
Embarked_C    0.160526
Embarked_Q   -0.005421
Embarked_S   -0.136109
c1            0.273204
c2            0.104933
c3           -0.317234
Survived      1.000000
Name: Survived, dtype: float64

# Logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression

In [13]:
clf = LogisticRegression(random_state=0).fit(X_train_clean, y_train)

In [14]:
clf.score(X_train_clean, y_train)

0.7965653896961691

In [15]:
X_cross_clean = cleandf(X_cross)

In [16]:
clf.score(X_cross_clean, y_cross)

0.7761194029850746

In [6]:
#Recreate cleandf to create a family field
def cleandf(df):
    df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    df['Age'] = ageFill.transform(df['Age'].values.reshape(-1,1))
    df['Embarked'] = df['Embarked'].fillna(most_common_embarked)
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(columns=['SibSp', 'Parch'])
    norm_age = df['Age'] / np.sqrt(np.sum(df['Age']**2))
    norm_Fare = df['Fare'] / np.sqrt(np.sum(df['Fare']**2))
    norm_Family = df['Family'] / np.sqrt(np.sum(df['Family']**2))  
    df['Age'] = norm_age
    df['Fare'] = norm_Fare
    df['Family'] = norm_Family
    classes = pd.get_dummies(df['Pclass'])
    df = pd.get_dummies(df)
    df = df.drop(columns=['Pclass','Sex_male'])
    
    df = pd.merge(df, classes, left_index=True, right_index=True)
    df.rename(columns={list(df)[9]:'c1'}, inplace=True)
    df.rename(columns={list(df)[10]:'c2'}, inplace=True)
    df.rename(columns={list(df)[11]:'c3'}, inplace=True)
    return df

In [84]:
X_train_clean = cleandf(X_train)

In [85]:
clf = LogisticRegression(random_state=0).fit(X_train_clean, y_train)

In [86]:
clf.score(X_train_clean, y_train)

0.7965653896961691

In [87]:
X_cross_clean = cleandf(X_cross)
clf.score(X_cross_clean, y_cross)

0.7761194029850746

# Not Normalized

In [309]:
def cleandf_unNorm(df):
    df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    df['Age'] = ageFill.transform(df['Age'].values.reshape(-1,1))
    classes = pd.get_dummies(df['Pclass'])
    df = pd.get_dummies(df)
    df = df.drop(columns=['Pclass'])
    df = df.drop(columns=['Sex_male'])
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(columns=['SibSp', 'Parch'])
    df = pd.merge(df, classes, left_index=True, right_index=True)
    df.rename(columns={list(df)[8]:'c1'}, inplace=True)
    df.rename(columns={list(df)[9]:'c2'}, inplace=True)
    df.rename(columns={list(df)[10]:'c3'}, inplace=True)
    return df

In [310]:
X_not_norm = cleandf_unNorm(X_train)

In [311]:
clf = LogisticRegression(random_state=0).fit(X_not_norm, y_train)

In [312]:
clf.score(X_not_norm, y_train)

0.809775429326288

In [313]:
X_cross_not = cleandf_unNorm(X_cross)
clf.score(X_cross_not, y_cross)

0.8059701492537313

# The Test Data

In [17]:
test = cleandf(test_csv)
test.isnull().sum()

Age           0
SibSp         0
Parch         0
Fare          1
Family        0
Sex_female    0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
c1            0
c2            0
c3            0
dtype: int64

In [18]:
#Find the class for the null
test[pd.isnull(test).any(axis=1)]
#get the average for that class
avg_test_Fare = np.mean(test.loc[test['c3'] == 1, 'Fare'])
test['Fare'] = test['Fare'].fillna(value=avg_test_Fare)
test.isnull().sum()

Age           0
SibSp         0
Parch         0
Fare          0
Family        0
Sex_female    0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
c1            0
c2            0
c3            0
dtype: int64

In [19]:
predictions = clf.predict(test)

# CSV Creation

In [20]:
def create_output(predict):
    output = pd.DataFrame({'PassengerId': test_csv.PassengerId, 'Survived': predict})
    output.to_csv('my_submission.csv', index=False)

# SGDC

In [21]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state = 0)
sgd_clf.fit(X_train_clean, y_train)
sgd_clf.score(X_cross_clean, y_cross)

0.7164179104477612

In [22]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_clean, y_train)

KNeighborsClassifier()

In [60]:
knn_clf.score(X_train_clean, y_train)

0.857331571994716

In [61]:
knn_clf.score(X_cross_clean, y_cross)

0.7985074626865671

In [62]:
knn_predictions = knn_clf.predict(test)

In [31]:
create_output(knn_predictions)

# Grid Search KNN

In [23]:
from sklearn.model_selection import GridSearchCV
grid_params = {
    'n_neighbors': [13, 15, 21, 31],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose=1,
    cv=3 
    )

gs_results = gs.fit(X_train_clean, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    0.3s finished


In [24]:
print(gs_results.best_params_)
gs_results.best_score_

{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}


0.7965995357299706

# Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier
param_grid  ={
    'n_estimators': [50, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [3,4,5,6,7,8,9],
    'criterion' :['gini', 'entropy']
}

rfc=RandomForestClassifier(random_state=0)

In [45]:
grid_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
grid_rfc.fit(X_train_clean, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [38]:
grid_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'auto',
 'n_estimators': 100}

In [39]:
grid_rfc.best_score_

0.8242941791565006

In [40]:
grid_rfc.score(X_cross_clean, y_cross)

0.8059701492537313

In [42]:
forest_prediction = grid_rfc.predict(test)

In [43]:
create_output(forest_prediction)

In [89]:
#Full Train-set
X_train_full = cleandf(train_csv_features)
grid_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
grid_rfc.fit(X_train_full, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

# Check Precision vs Recall

In [26]:
from sklearn.metrics import precision_score, recall_score
rfc_train_predictions = grid_rfc.predict(X_train_clean)

print("Precision:", precision_score(y_train, rfc_train_predictions))
print("Recall:",recall_score(y_train, rfc_train_predictions))

NameError: name 'grid_rfc' is not defined

# Trying With Binning and New Features

In [27]:
pd.qcut(X_train["Fare"], 5).value_counts()

(7.881, 10.5]        158
(21.0, 39.688]       153
(-0.001, 7.881]      152
(39.688, 512.329]    149
(10.5, 21.0]         145
Name: Fare, dtype: int64

In [28]:
pd.qcut(X_train["Age"], 5).value_counts()

(0.419, 19.0]    138
(41.4, 80.0]     121
(25.0, 32.0]     121
(19.0, 25.0]     118
(32.0, 41.4]     106
Name: Age, dtype: int64

In [29]:
def clean_bin_df(df):
    df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    df['Age'] = ageFill.transform(df['Age'].values.reshape(-1,1))
    df['Embarked'] = df['Embarked'].fillna(most_common_embarked)
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(columns=['SibSp', 'Parch'])
    
#     df['Cabin'] = df['Cabin'].fillna('U')
#     newCabin = []
#     for i in df['Cabin']:
#         newCabin.append(i[0])
#     df['Cabin'] = newCabin
    
    
    #Fare bins
    bins = [-1, 7.881, 10.5, 21, 39.688, 10000]
    # Create the names for the four bins
    group_names = [0, 1, 2, 3, 4]
    
    df["Fare"] = pd.cut(df["Fare"], bins, labels=group_names)
    
    #Age bins
    abins = [0, 19, 25, 32, 41.4, 80]
    # Create the names for the four bins
    agroup_names = [0, 1, 2, 3, 4]
    
    df["Age"] = pd.cut(df["Age"], bins, labels=group_names)  
    
    df['Fare'] = df['Fare'].astype(int)
    df['Age'] = df['Age'].astype(int)
    
    #New Feature - FarebyAge
    df["Fare_by_Age"] = df["Fare"]*(df["Age"]+1)
    
    
    classes = pd.get_dummies(df['Pclass'])
    df = pd.get_dummies(df)
    df = df.drop(columns=['Pclass','Sex_male'])
    
    df = pd.merge(df, classes, left_index=True, right_index=True)
    df.rename(columns={list(df)[-3]:'c1'}, inplace=True)
    df.rename(columns={list(df)[-2]:'c2'}, inplace=True)
    df.rename(columns={list(df)[-1]:'c3'}, inplace=True)
    
    return df

In [30]:
X_bin_train = clean_bin_df(X_train)
X_bin_train2 = clean_bin_df(train_csv_features)

param_grid  ={
    'n_estimators': [80, 100, 120, 140, 160],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [8, 9, 10],
    'criterion' :['gini', 'entropy']
}

rfc2=RandomForestClassifier(random_state=0)

In [269]:
grid_rfc2 = GridSearchCV(estimator=rfc2, param_grid=param_grid, cv= 5)
grid_rfc2.fit(X_bin_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [270]:
print(f"Best params: {grid_rfc.best_params_}")
print(f"Best score: {grid_rfc.best_score_}")

Best params: {'criterion': 'entropy', 'max_depth': 9, 'max_features': 'auto', 'n_estimators': 100}
Best score: 0.8372920720607621


In [39]:
X_cross_bin = clean_bin_df(X_cross)
#grid_rfc2.score(X_cross_bin, y_cross)

In [272]:
test2 = test_csv.copy()
test2['Fare'] = test2['Fare'].fillna(value=avg_test_Fare)

test_bin = clean_bin_df(test2)
bin_forest_prediction = grid_rfc2.predict(test_bin)
create_output(bin_forest_prediction)

In [274]:
rfm = RandomForestClassifier(random_state=0, criterion='entropy', max_depth=9, max_features='auto', n_estimators=100)
rfm.fit(X_bin_train, y_train)
rfm.feature_importances_

array([0.0821144 , 0.05334337, 0.04158303, 0.08309168, 0.07503664,
       0.1219181 , 0.34280402, 0.02300206, 0.01223004, 0.02290102,
       0.03589611, 0.0276695 , 0.07841004])

In [276]:
X_bin_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Family,Fare_by_Age,Sex_female,Embarked_C,Embarked_Q,Embarked_S,c1,c2,c3
599,4,1,0,4,1,20,0,1,0,0,1,0,0
830,2,1,0,2,1,6,1,1,0,0,0,0,1
306,3,0,0,4,0,16,1,1,0,0,1,0,0
231,3,0,0,0,0,0,0,0,0,1,0,0,1
845,4,0,0,0,0,0,0,0,0,1,0,0,1


In [277]:
tokeep = ['Age', 'Fare', 'Family', 'Fare_by_age', 'Sex_female', 'c3']
rfm2 = RandomForestClassifier(random_state=0, criterion='entropy', max_depth=9, max_features='auto', n_estimators=100)
rfm2.fit(X_bin_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [278]:
rfm2.score(X_bin_train, y_train)

0.8916776750330251

In [279]:
rfm2.score(X_cross_bin, y_cross)

0.835820895522388

In [45]:
test2 = test_csv.copy()
test2['Fare'] = test2['Fare'].fillna(value=avg_test_Fare)

test_bin2 = clean_bin_df(test2)
#bin_forest_prediction2 = rfm2.predict(test_bin2)
#create_output(bin_forest_prediction2)

In [33]:
from sklearn.ensemble import VotingClassifier
log_clf = LogisticRegression()
sgd_vlf = SGDClassifier()
K_clf = KNeighborsClassifier()
rf_clf = RandomForestClassifier(criterion='entropy', max_depth=5, max_features='auto', n_estimators=100)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('sgd', sgd_clf), ('k', K_clf), ('rf', rf_clf)], voting='hard')
voting_clf.fit(X_bin_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('sgd', SGDClassifier(random_state=0)),
                             ('k', KNeighborsClassifier()),
                             ('rf',
                              RandomForestClassifier(criterion='entropy',
                                                     max_depth=5))])

In [37]:
from sklearn.metrics import accuracy_score
voting_clf.score(X_bin_train, y_train)

0.8375165125495376

In [40]:
voting_clf.score(X_cross_bin, y_cross)

0.8134328358208955

In [46]:
voting_prediction = voting_clf.predict(test_bin2)
create_output(voting_prediction)