In [176]:
import numpy as np
import sklearn as sk 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd 
from sklearn.metrics import confusion_matrix

print("the numpy version is {}".format(np.__version__))
print("the sklearn version is {}".format(sk.__version__))
print("the pandas version is {}".format(pd.__version__))



the numpy version is 1.18.1
the sklearn version is 0.22.2.post1
the pandas version is 1.0.3


In [194]:
def train_model(name, model, X_train, y_train, X_test, y_test):
    # log
    print("===> using model {}".format(name))

    # train the model
    model.fit(X_train, y_train)

    # predict the test data
    y_pred = model.predict(X_test)

    # measure effectiveness
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = float(np.trace(confusion))/float(np.sum(confusion))
    print("the accuracy is {0:.2%}".format(accuracy))
    # print("the sum is {}, diagonal sum is {} and elements are {}".format(np.sum(confusion), np.trace(confusion), np.diagonal(confusion)))
    # print("the confusion matrix of type {} is \n{}\n\n".format(type(confusion), confusion))
    print("\n")

    # return
    return accuracy

In [216]:
# build the classifier map
classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVM": SVC(kernel="linear", C=0.025),
    "RBF SVM": SVC(gamma=2, C=1),
    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "Decision Tree": DecisionTreeClassifier(max_depth=4),
    "Random Forest": RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
    "Neural Net": MLPClassifier(alpha=1, max_iter=1000),
    "AdaBoost": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis()
    }

print("classifier map has size {}".format(len(classifiers)))

classifier map has size 10


In [150]:
# load the titanic data
# load the training data
titanic_df = pd.read_csv("../../../../Datasets/Kaggle/Competitions/Titanic/train.csv")

titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [170]:
# get the features/labels data
features = titanic_df.copy()

features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [171]:
# drop the approriate columns
features.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [172]:
# remove na rows
features = features.dropna()

features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Sex       712 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     712 non-null    int64  
 5   Parch     712 non-null    int64  
 6   Fare      712 non-null    float64
 7   Embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 50.1+ KB


In [173]:
# get the labels
labels = features.pop("Survived")

print("the features has shape {} and the labels has shape{}".format(features.shape, labels.shape))

the features has shape (712, 7) and the labels has shape(712,)


In [174]:
# one hot sex and embarked
features = pd.get_dummies(features)

features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      712 non-null    int64  
 1   Age         712 non-null    float64
 2   SibSp       712 non-null    int64  
 3   Parch       712 non-null    int64  
 4   Fare        712 non-null    float64
 5   Sex_female  712 non-null    uint8  
 6   Sex_male    712 non-null    uint8  
 7   Embarked_C  712 non-null    uint8  
 8   Embarked_Q  712 non-null    uint8  
 9   Embarked_S  712 non-null    uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 36.9 KB


In [179]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=2)

print("the training features have shape {} and the training labels have shape {}".format(X_train.shape, y_train.shape))
print("the test features have shape {} and the test labels have shape {}".format(X_test.shape, y_test.shape))



the training features have shape (569, 10) and the training labels have shape (569,)
the test features have shape (143, 10) and the test labels have shape (143,)


In [217]:
# build pandas dataframe to store the results
results = pd.DataFrame(columns=['name', 'accuracy'])

# loop through the data and fit the model
for name, model in classifiers.items():
    print("training model {}".format(name))
    accuracy = train_model(name, model, X_train, y_train, X_test, y_test)
    results.loc[-len(results)] = [name, accuracy]

training model Nearest Neighbors
===> using model Nearest Neighbors
the accuracy is 70.63%


training model Linear SVM
===> using model Linear SVM
the accuracy is 73.43%


training model RBF SVM
===> using model RBF SVM
the accuracy is 62.94%


training model Gaussian Process
===> using model Gaussian Process
the accuracy is 74.83%


training model Decision Tree
===> using model Decision Tree
the accuracy is 74.83%


training model Random Forest
===> using model Random Forest
the accuracy is 72.73%


training model Neural Net
===> using model Neural Net
the accuracy is 72.73%


training model AdaBoost
===> using model AdaBoost
the accuracy is 74.83%


training model Naive Bayes
===> using model Naive Bayes
the accuracy is 74.13%


training model QDA
===> using model QDA
the accuracy is 74.83%




In [218]:
# sort and show results
results = results.sort_values(by=['accuracy'], ascending=False)


results

Unnamed: 0,name,accuracy
-3,Gaussian Process,0.748252
-4,Decision Tree,0.748252
-7,AdaBoost,0.748252
-9,QDA,0.748252
-8,Naive Bayes,0.741259
-1,Linear SVM,0.734266
-5,Random Forest,0.727273
-6,Neural Net,0.727273
0,Nearest Neighbors,0.706294
-2,RBF SVM,0.629371
