In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from vecstack import stacking

In [2]:
df = pd.read_csv("/Users/mohamed/PycharmProjects/titanic/train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# Get Nan values as list
columns_with_nan = df.columns[df.isna().any()].tolist()
print(columns_with_nan)

['Age', 'Cabin', 'Embarked']


In [5]:
# Fill Nan values with average age
df['Age'] = df['Age'].fillna((df['Age'].mean()))

# Fill Nan Categorical values with Unknown age
df['Embarked'] = df['Embarked'].fillna("Unknown")
df['Cabin'] = df['Cabin'].fillna("Unknown")
having_family = [] 
for index, row in df.iterrows():
    if row['SibSp'] > 0 or row['Parch'] > 0 :
        having_family.append(1)
    else:
        having_family.append(0)
        
df = df.drop(['SibSp', 'Parch'], axis=1)
# df.assign(protected=pd.Series(having_family))
df['HavingFamily'] = pd.Series(having_family).values
df.info()
df.describe()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId     891 non-null int64
Survived        891 non-null int64
Pclass          891 non-null int64
Name            891 non-null object
Sex             891 non-null object
Age             891 non-null float64
Ticket          891 non-null object
Fare            891 non-null float64
Cabin           891 non-null object
Embarked        891 non-null object
HavingFamily    891 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,HavingFamily
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,Unknown,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,Unknown,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,Unknown,S,0


In [6]:
# Get Nan values as list
columns_with_nan = df.columns[df.isna().any()].tolist()
print(columns_with_nan)

[]


In [7]:
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,HavingFamily
count,891.0,891.0,891.0,891,891,891.0,891,891.0,891,891,891.0
unique,,,,891,2,,681,,148,4,
top,,,,"Christy, Miss. Julie Rachel",male,,CA. 2343,,Unknown,S,
freq,,,,1,577,,7,,687,644,
mean,446.0,0.383838,2.308642,,,29.699118,,32.204208,,,0.397306
std,257.353842,0.486592,0.836071,,,13.002015,,49.693429,,,0.489615
min,1.0,0.0,1.0,,,0.42,,0.0,,,0.0
25%,223.5,0.0,2.0,,,22.0,,7.9104,,,0.0
50%,446.0,0.0,3.0,,,29.699118,,14.4542,,,0.0
75%,668.5,1.0,3.0,,,35.0,,31.0,,,1.0


In [8]:
labels = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'HavingFamily']
x = df.loc[:, labels].values
y = df.iloc[:, 1].values


In [9]:
label_encoder_x = LabelEncoder()
# change Age from ['Male', 'Female'] to [0, 1] through sklearn label encoder
x[:, 2] = label_encoder_x.fit_transform(x[:, 2])
x[:, -2] = label_encoder_x.fit_transform(x[:, -2])
x[:, -3] = label_encoder_x.fit_transform(x[:, -3])
x[:, 4] = label_encoder_x.fit_transform(x[:, 4])

In [10]:
x[0]

array([1, 3, 1, 22.0, 523, 7.25, 147, 2, 1], dtype=object)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [65]:
models = [
        # RandomForestClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
        # ExtraTreesClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
    AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=0.05, random_state=None),
    xgb.XGBClassifier(seed = 0, n_jobs = -1, learning_rate = 0.1, n_estimators=400, max_depth = 5),
    RandomForestClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
    ExtraTreesClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
]

In [66]:
def stacking_models(models,x_train, y_train):
    models_fit = []
    for model in models:
        models_fit.append(model.fit(x_train, y_train))
    return models_fit

In [67]:
stacking_models_arr = stacking_models(models, x_train, y_train)


In [131]:
def stacking_pred(models_fit, x):
    res_as_arr = []
    for model_fit in models_fit:
        result = model_fit.predict(x)
        res_as_arr.append(result)
    res_as_numpy = np.dstack(res_as_arr)                    
    return res_as_numpy.reshape(len(x), len(models_fit))

In [125]:
s_train = stacking_pred(stacking_models_arr, x_train)

In [128]:
model = xgb.XGBClassifier(seed = 0, n_jobs = -1, learning_rate = 0.1, n_estimators=400, max_depth = 5)
model = model.fit(s_train, y_train)

In [132]:
s_test = stacking_pred(stacking_models_arr, x_test)
print(s_test.shape)
predictions = model.predict(s_test)

(179, 4)


In [133]:
# Print accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
confusion_matrix(y_test, predictions)

Accuracy: 84.36%


array([[100,  10],
       [ 18,  51]])

In [None]:
###########################################################################
# Stacking by hand finished here the below cell it's using stacking via a #
# lib called vecstack the issue with it is that we cant create the s_test #
# for the other test that's why i have created those functions above      #
###########################################################################

In [118]:
models = [
        # RandomForestClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
        # ExtraTreesClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
    AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=0.05, random_state=None),
    xgb.XGBClassifier(seed = 0, n_jobs = -1, learning_rate = 0.1, n_estimators=400, max_depth = 5),
    RandomForestClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
    ExtraTreesClassifier(random_state = 0, n_jobs = -1, n_estimators=400, max_depth = 5),
]

s_train, s_test = stacking(models, x_train, y_train, x_test, 
    regression = False, metric = accuracy_score, n_folds = 4, 
    stratified = True, shuffle = True, random_state = 0, verbose = 2)
print(s_train)
model = xgb.XGBClassifier(seed = 0, n_jobs = -1, learning_rate = 0.1, n_estimators=400, max_depth = 5)
model = model.fit(s_train, y_train)

predictions = model.predict(s_test)

task:       [classification]
n_classes:  [2]
metric:     [accuracy_score]
mode:       [oof_pred_bag]
n_models:   [4]

model 0:    [AdaBoostClassifier]
    fold 0: [0.86592179]
    fold 1: [0.76966292]


    fold 2: [0.75842697]
    fold 3: [0.75141243]
    ----
    MEAN:   [0.78635603] + [0.04639628]
    FULL:   [0.78651685]

model 1:    [XGBClassifier]


    fold 0: [0.82681564]


    fold 1: [0.76966292]


    fold 2: [0.81460674]


    fold 3: [0.80790960]
    ----
    MEAN:   [0.80474873] + [0.02136079]
    FULL:   [0.80477528]

model 2:    [RandomForestClassifier]


    fold 0: [0.83798883]


    fold 1: [0.81460674]


    fold 2: [0.78651685]


    fold 3: [0.82485876]
    ----
    MEAN:   [0.81599279] + [0.01892870]
    FULL:   [0.81601124]

model 3:    [ExtraTreesClassifier]


    fold 0: [0.82122905]


    fold 1: [0.81460674]


    fold 2: [0.78651685]


    fold 3: [0.80225989]
    ----
    MEAN:   [0.80615313] + [0.01322391]
    FULL:   [0.80617978]

[[1 0 1 1]
 [0 0 0 0]
 [0 0 0 0]
 ..., 
 [0 0 0 0]
 [1 0 0 0]
 [0 0 0 0]]


In [119]:
s_train.shape

(712, 4)

In [189]:
# Print accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 84.92%


In [190]:
confusion_matrix(y_test, predictions)

array([[102,   8],
       [ 19,  50]])

In [191]:
# read test.csv file
df2 = pd.read_csv("/Users/mohamed/PycharmProjects/titanic/test.csv")

In [192]:
# Fill Nan values with average age
df2['Age'] = df2['Age'].fillna((df2['Age'].mean()))

# Fill Nan Categorical values with Unknown age
df2['Embarked'] = df2['Embarked'].fillna("Unknown")
df2['Cabin'] = df2['Cabin'].fillna("Unknown")
having_family = [] 
for index, row in df2.iterrows():
    if row['SibSp'] > 0 or row['Parch'] > 0 :
        having_family.append(1)
    else:
        having_family.append(0)
        
df2 = df2.drop(['SibSp', 'Parch'], axis=1)
df2['HavingFamily'] = pd.Series(having_family).values

In [193]:
df2.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,HavingFamily
0,892,3,"Kelly, Mr. James",male,34.5,330911,7.8292,Unknown,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,363272,7.0,Unknown,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,240276,9.6875,Unknown,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,315154,8.6625,Unknown,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,3101298,12.2875,Unknown,S,1


In [194]:
labels = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'HavingFamily']
x2 = df2.loc[:, labels].values
print(x2[0])
# change Age from ['Male', 'Female'] to [0, 1] through sklearn label encoder
x2[:, 2] = label_encoder_x.fit_transform(x2[:, 2])
x2[:, -2] = label_encoder_x.fit_transform(x2[:, -2])
x2[:, -3] = label_encoder_x.fit_transform(x2[:, -3])
x2[:, 4] = label_encoder_x.fit_transform(x2[:, 4])

print(x2[0])

[892 3 'male' 34.5 '330911' 7.8292 'Unknown' 'Q' 0]
[892 3 1 34.5 152 7.8292 76 1 0]


In [195]:
predictions = model.predict(x2)

ValueError: feature_names mismatch: ['f0', 'f1'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8']
training data did not have the following fields: f5, f6, f2, f4, f3, f7, f8

In [19]:
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
submission = pd.DataFrame({ 'PassengerId': df2['PassengerId'],
                            'Survived': predictions })
submission.to_csv("/Users/mohamed/PycharmProjects/titanic/submission.csv", index=False)