In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv("/Users/mohamed/PycharmProjects/titanic/train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# Get Nan values as list
columns_with_nan = df.columns[df.isna().any()].tolist()
print(columns_with_nan)

['Age', 'Cabin', 'Embarked']


In [5]:
# Fill Nan values with average age
df['Age'] = df['Age'].fillna((df['Age'].mean()))

# Fill Nan Categorical values with Unknown age
df['Embarked'] = df['Embarked'].fillna("Unknown")
df['Cabin'] = df['Cabin'].fillna("Unknown")

In [6]:
# Get Nan values as list
columns_with_nan = df.columns[df.isna().any()].tolist()
print(columns_with_nan)

[]


In [7]:
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891.0,891,891
unique,,,,891,2,,,,681.0,,148,4
top,,,,"Hocking, Mrs. Elizabeth (Eliza Needs)",male,,,,347082.0,,Unknown,S
freq,,,,1,577,,,,7.0,,687,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,13.002015,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,22.0,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,29.699118,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,31.0,,


In [8]:
labels = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
x = df.loc[:, labels].values
y = df.iloc[:, 1].values


In [9]:
label_encoder_x = LabelEncoder()
# change Age from ['Male', 'Female'] to [0, 1] through sklearn label encoder
x[:, 2] = label_encoder_x.fit_transform(x[:, 2])
x[:, -1] = label_encoder_x.fit_transform(x[:, -1])
x[:, -2] = label_encoder_x.fit_transform(x[:, -2])
x[:, 6] = label_encoder_x.fit_transform(x[:, 6])

In [10]:
x[0]

array([1, 3, 1, 22.0, 1, 0, 523, 7.25, 147, 2], dtype=object)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [12]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [13]:
# Print accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 85.47%


In [14]:
confusion_matrix(y_test, predictions)

array([[98, 12],
       [14, 55]])

In [15]:
# read test.csv file
df2 = pd.read_csv("/Users/mohamed/PycharmProjects/titanic/test.csv")

In [16]:
# Fill Nan values with average age
df2['Age'] = df2['Age'].fillna((df2['Age'].mean()))

# Fill Nan Categorical values with Unknown age
df2['Embarked'] = df2['Embarked'].fillna("Unknown")
df2['Cabin'] = df2['Cabin'].fillna("Unknown")

In [17]:
labels = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
x2 = df2.loc[:, labels].values
print(x2[0])
# change Age from ['Male', 'Female'] to [0, 1] through sklearn label encoder
x2[:, 2] = label_encoder_x.fit_transform(x2[:, 2])
x2[:, -1] = label_encoder_x.fit_transform(x2[:, -1])
x2[:, -2] = label_encoder_x.fit_transform(x2[:, -2])
x2[:, 6] = label_encoder_x.fit_transform(x2[:, 6])

print(x2[0])

[892 3 'male' 34.5 0 0 '330911' 7.8292 'Unknown' 'Q']
[892 3 1 34.5 0 0 152 7.8292 76 1]


In [18]:
predictions = gbm.predict(x2)

In [19]:
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
submission = pd.DataFrame({ 'PassengerId': df2['PassengerId'],
                            'Survived': predictions })
submission.to_csv("/Users/mohamed/PycharmProjects/titanic/submission.csv", index=False)