In [296]:
import numpy as np
import sklearn as sk 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd 
from sklearn.metrics import confusion_matrix

print("the numpy version is {}".format(np.__version__))
print("the sklearn version is {}".format(sk.__version__))
print("the pandas version is {}".format(pd.__version__))



the numpy version is 1.18.1
the sklearn version is 0.22.2.post1
the pandas version is 1.0.3


In [251]:
def train_model(name, model, X_train, y_train, X_test, y_test):
    # log
    print("===> using model {}".format(name))

    # train the model
    model.fit(X_train, y_train)

    # predict the test data
    y_pred = model.predict(X_test)

    # measure effectiveness
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = float(np.trace(confusion))/float(np.sum(confusion))
    print("the accuracy is {0:.2%}".format(accuracy))
    # print("the sum is {}, diagonal sum is {} and elements are {}".format(np.sum(confusion), np.trace(confusion), np.diagonal(confusion)))
    # print("the confusion matrix of type {} is \n{}\n\n".format(type(confusion), confusion))
    print("\n")

    # return
    return accuracy

In [243]:
def pre_process(dataframe, is_train=True):
    '''preprocess the data'''
    # need to impute the Age and Fare column
    dataframe['Age'].fillna((dataframe['Age'].mean()), inplace=True)
    dataframe['Fare'].fillna((dataframe['Fare'].mean()), inplace=True)

    # drop the cabin column
    dataframe.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

    # dropna for the embarked column (only needed for the train dataset)
    if is_train:
        dataframe = dataframe.dropna(subset=['Embarked'])

    return dataframe


In [297]:
# build the classifier map
classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVM": SVC(kernel="linear", C=0.025),
    "RBF SVM": SVC(gamma=2, C=1),
    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "Decision Tree": DecisionTreeClassifier(max_depth=4),
    "Random Forest": RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
    "Neural Net": MLPClassifier(alpha=1, max_iter=1000),
    "AdaBoost": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis(),
    'SGDClassifier': SGDClassifier(max_iter=1000, tol=1e-3)
    }

print("classifier map has size {}".format(len(classifiers)))

classifier map has size 11


In [227]:
# load the titanic data
# load the training data
titanic_df = pd.read_csv("../../../../Datasets/Kaggle/Competitions/Titanic/train.csv")

titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [244]:
# get the features/labels data
features = titanic_df.copy()

features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [245]:
# drop the approriate columns
# features.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)
features = pre_process(features, True)

features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


In [246]:
# get the labels
labels = features.pop("Survived")

print("the features has shape {} and the labels has shape{}".format(features.shape, labels.shape))

the features has shape (889, 7) and the labels has shape(889,)


In [247]:
# one hot sex and embarked
features = pd.get_dummies(features)

features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      889 non-null    int64  
 1   Age         889 non-null    float64
 2   SibSp       889 non-null    int64  
 3   Parch       889 non-null    int64  
 4   Fare        889 non-null    float64
 5   Sex_female  889 non-null    uint8  
 6   Sex_male    889 non-null    uint8  
 7   Embarked_C  889 non-null    uint8  
 8   Embarked_Q  889 non-null    uint8  
 9   Embarked_S  889 non-null    uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 46.0 KB


In [248]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=2)

print("the training features have shape {} and the training labels have shape {}".format(X_train.shape, y_train.shape))
print("the test features have shape {} and the test labels have shape {}".format(X_test.shape, y_test.shape))



the training features have shape (711, 10) and the training labels have shape (711,)
the test features have shape (178, 10) and the test labels have shape (178,)


In [298]:
# build pandas dataframe to store the results
results = pd.DataFrame(columns=['name', 'accuracy'])

# loop through the data and fit the model
for name, model in classifiers.items():
    # print("training model {}".format(name))
    accuracy = train_model(name, model, X_train, y_train, X_test, y_test)
    results.loc[-len(results)] = [name, accuracy]

===> using model Nearest Neighbors
the accuracy is 70.22%


===> using model Linear SVM
the accuracy is 80.90%


===> using model RBF SVM
the accuracy is 63.48%


===> using model Gaussian Process
the accuracy is 37.64%


===> using model Decision Tree
the accuracy is 74.16%


===> using model Random Forest
the accuracy is 84.27%


===> using model Neural Net
the accuracy is 79.21%


===> using model AdaBoost
the accuracy is 78.09%


===> using model Naive Bayes
the accuracy is 78.09%


===> using model QDA
the accuracy is 71.35%


===> using model SGDClassifier
the accuracy is 63.48%




In [299]:
# sort and show results
results = results.sort_values(by=['accuracy'], ascending=False)


results

Unnamed: 0,name,accuracy
-5,Random Forest,0.842697
-1,Linear SVM,0.808989
-6,Neural Net,0.792135
-7,AdaBoost,0.780899
-8,Naive Bayes,0.780899
-4,Decision Tree,0.741573
-9,QDA,0.713483
0,Nearest Neighbors,0.702247
-2,RBF SVM,0.634831
-10,SGDClassifier,0.634831


In [287]:
# load the titanic data
# load the training data
titanic_test_df = pd.read_csv("../../../../Datasets/Kaggle/Competitions/Titanic/test.csv")

titanic_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [222]:
titanic_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [288]:
# get the passenger id
titanic_test_passenger = pd.DataFrame(titanic_test_df['PassengerId'])

titanic_test_passenger.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [289]:
# preprocess the test dataset
titanic_test_df = pre_process(titanic_test_df, False)

titanic_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [290]:
# one hot the remaining columns
titanic_test_df = pd.get_dummies(titanic_test_df)

titanic_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Age         418 non-null    float64
 2   SibSp       418 non-null    int64  
 3   Parch       418 non-null    int64  
 4   Fare        418 non-null    float64
 5   Sex_female  418 non-null    uint8  
 6   Sex_male    418 non-null    uint8  
 7   Embarked_C  418 non-null    uint8  
 8   Embarked_Q  418 non-null    uint8  
 9   Embarked_S  418 non-null    uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 18.5 KB


In [259]:
# get the model
model = classifiers['Random Forest']


In [293]:
# predict and store
# calculate predictions

titanic_test_df['Survived'] = model.predict(titanic_test_df)
titanic_test_df['Survived'] = titanic_test_df['Survived'].apply(lambda x: round(x,0)).astype('int')

titanic_test_df.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,3,34.5,0,0,7.8292,0,1,0,1,0,0
1,3,47.0,1,0,7.0,1,0,0,0,1,0
2,2,62.0,0,0,9.6875,0,1,0,1,0,0
3,3,27.0,0,0,8.6625,0,1,0,0,1,0
4,3,22.0,1,1,12.2875,1,0,0,0,1,1


In [294]:
# create the solution dataframe
solution = pd.concat([titanic_test_passenger['PassengerId'], titanic_test_df['Survived']], axis=1)

solution.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [295]:
# save to csv
solution.to_csv("../../../../Datasets/Kaggle/Competitions/Titanic/RandomForest.csv", index=False)

