In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import train and test data sets for the Titanic
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
# read in the data sets
train_data = pd.read_csv("C:/Users/rmous/CSCI 191T/191T Assignment 2/Data/train.csv")
test_data = pd.read_csv("C:/Users/rmous/CSCI 191T/191T Assignment 2/Data/test.csv")

# check first 10 rows of train data set
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# check for missing data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# check survival data
train_data["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [5]:
# check passenger class data
train_data["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [6]:
# check gender data
train_data["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
# check survival rates by size of family
train_data["Family"] = train_data["SibSp"] + train_data["Parch"]

# Change attributes in test_data to match with train_data
test_data["Family"] = test_data["SibSp"] + test_data["Parch"]

# Check survival rate between number of family members
train_data[["Family", "Survived"]].groupby(['Family']).mean()

Unnamed: 0_level_0,Survived
Family,Unnamed: 1_level_1
0,0.303538
1,0.552795
2,0.578431
3,0.724138
4,0.2
5,0.136364
6,0.333333
7,0.0
10,0.0


In [8]:
# Bin the family sizes
both_data = [train_data, test_data]
for d in both_data:
    d.loc[d["Family"] == 0, "Family"] = 0
    d.loc[(d["Family"] > 0) & (d["Family"] <= 3), "Family"] = 1
    d.loc[(d["Family"] > 3) & (d["Family"] <= 6), "Family"] = 2
    d.loc[d["Family"] > 6, "Family"] = 3
    d["Family"] = d["Family"].astype(int)

# Check the count values of each newly created bin
train_data["Family"].value_counts()

0    537
1    292
2     49
3     13
Name: Family, dtype: int64

In [9]:
# fill NaN values in Age with the mean
both_data = [train_data, test_data]

for d in both_data:
    mean = train_data["Age"].mean()
    temp = d["Age"].copy()
    temp[np.isnan(temp)] = mean
    d["Age"] = temp
    d["Age"] = train_data["Age"].astype(int)

# check if there's any null left
train_data["Age"].isnull().sum()  

0

In [10]:
# check survival rates by size of family
train_data[["Age", "Survived"]].groupby(["Age"]).mean()

Unnamed: 0_level_0,Survived
Age,Unnamed: 1_level_1
0,1.000000
1,0.714286
2,0.300000
3,0.833333
4,0.700000
5,1.000000
6,0.666667
7,0.333333
8,0.500000
9,0.250000


In [11]:
# group up the ages

both_data = [train_data, test_data]

for d in both_data:
    d["Age"] = d["Age"].astype(int)
    d.loc[ d["Age"] <= 8, "Age"] = 0
    d.loc[(d["Age"] > 8) & (d["Age"] <= 18), "Age"] = 1
    d.loc[(d["Age"] > 18) & (d["Age"] <= 22), "Age"] = 2
    d.loc[(d["Age"] > 22) & (d["Age"] <= 27), "Age"] = 3
    d.loc[(d["Age"] > 27) & (d["Age"] <= 47), "Age"] = 4
    d.loc[(d["Age"] > 47) & (d["Age"] <= 63), "Age"] = 5
    d.loc[ d["Age"] > 63, 'Age'] = 6

    # Check value counts of each Age groups
train_data["Age"].value_counts()

4    465
3    106
2     92
1     85
5     76
0     54
6     13
Name: Age, dtype: int64

In [12]:
# check survival rates by place of Embarkment 
train_data[["Embarked", "Survived"]].groupby(["Embarked"]).mean()

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,0.553571
Q,0.38961
S,0.336957


In [13]:
# Check Embarked attribute information to find most common location
train_data['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [14]:
# fill NaN values in Embarked with the most appeared location
mode = "S"
both_data = [train_data, test_data]

for d in both_data:
    d["Embarked"] = d["Embarked"].fillna(mode)

In [15]:
# standardized Embarked locations into numerical values
locations = {"S": 0, "C": 1, "Q": 2}
both_data = [train_data, test_data]

for d in both_data:
    d["Embarked"] = d["Embarked"].map(locations)

In [16]:
# change sex into 0:Male and 1:Female
genders = {"male": 0, "female": 1}
both_data = [train_data, test_data]

for d in both_data:
    d["Sex"] = d["Sex"].map(genders)

In [17]:
# check survival rates by gender
train_data[["Sex", "Survived"]].groupby(["Sex"]).mean()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
0,0.188908
1,0.742038


In [18]:
# check last 50 rows of train data set to look for name prefixes
train_data.tail(50)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
841,842,0,2,"Mudd, Mr. Thomas Charles",0,1,0,0,S.O./P.P. 3,10.5,,0,0
842,843,1,1,"Serepeca, Miss. Augusta",1,4,0,0,113798,31.0,,1,0
843,844,0,3,"Lemberopolous, Mr. Peter L",0,4,0,0,2683,6.4375,,1,0
844,845,0,3,"Culumovic, Mr. Jeso",0,1,0,0,315090,8.6625,,0,0
845,846,0,3,"Abbing, Mr. Anthony",0,4,0,0,C.A. 5547,7.55,,0,0
846,847,0,3,"Sage, Mr. Douglas Bullen",0,4,8,2,CA. 2343,69.55,,0,3
847,848,0,3,"Markoff, Mr. Marin",0,4,0,0,349213,7.8958,,1,0
848,849,0,2,"Harper, Rev. John",0,4,0,1,248727,33.0,,0,1
849,850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",1,4,1,0,17453,89.1042,C92,1,1
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",0,0,4,2,347082,31.275,,0,2


In [19]:
# Find the name prefixes and since the prefixes tells more about a passenger and group noncommon prefixes into "others"
both_data = [train_data, test_data]
prefixes = {"Miss": 1, "Mrs": 2, "Mr": 3, "Master": 4, "Women": 5, "Deck": 6}

for d in both_data:
    d["Prefix"] = d.Name.str.extract(" ([A-Za-z]+)\.", expand = False)
    d["Prefix"] = d["Prefix"].replace(["Capt", "Col", "Major"], "Deck")
    d["Prefix"] = d["Prefix"].replace(["Dr", "Rev", "Sir", "Jonkheer", "Don"], "Mr")
    d["Prefix"] = d["Prefix"].replace(["Lady", "Countess", "Dona"], "Woman")
    d["Prefix"] = d["Prefix"].replace(["Ms", "Mlle"], "Miss")
    d["Prefix"] = d["Prefix"].replace("Mme", "Mrs")
    d["Prefix"] = d["Prefix"].map(prefixes)
    d["Prefix"] = d["Prefix"].fillna(0)

In [20]:
# change fare into type int
both_data = [train_data, test_data]

for d in both_data:
    d["Fare"] = d["Fare"].fillna(0)
    d["Fare"] = d["Fare"].astype(int)

# check survival rates by Fare price
train_data[["Fare", "Survived"]].groupby(["Fare"]).mean()

Unnamed: 0_level_0,Survived
Fare,Unnamed: 1_level_1
0,0.066667
4,0.000000
5,0.000000
6,0.090909
7,0.234742
8,0.142857
9,0.200000
10,0.321429
11,0.555556
12,0.692308


In [21]:
# set fare prices into range groups
both_data = [train_data, test_data]

for d in both_data:
    d.loc[ d["Fare"] <= 10, "Fare"] = 0
    d.loc[(d["Fare"] > 10) & (d["Fare"] <= 15), "Fare"] = 1
    d.loc[(d["Fare"] > 15) & (d["Fare"] <= 31), "Fare"]   = 2
    d.loc[(d["Fare"] > 31) & (d["Fare"] <= 93), "Fare"]   = 3
    d.loc[(d["Fare"] > 93) & (d["Fare"] <= 226), "Fare"]   = 4
    d.loc[ d["Fare"] > 226, "Fare"] = 5
    d["Fare"] = d["Fare"].astype(int)

In [22]:
# Make cabin determine whether a passenger had a cabin on the Titanic, 0 if they had a cabin and 1 if they did not
train_data["Cabin"] = train_data["Cabin"].apply(lambda att: 0 if type(att) == float else 1)
test_data["Cabin"] = test_data["Cabin"].apply(lambda att: 0 if type(att) == float else 1)

In [23]:
# drop attributes out of train_data and test_data sets
drop_attributes = ["Name", "SibSp", "Parch", "Ticket"]
train_data = train_data.drop(drop_attributes, axis = 1)
test_data  = test_data.drop(drop_attributes, axis = 1)

In [24]:
# check train data set again
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Family,Prefix
0,1,0,3,0,2,0,0,0,1,3.0
1,2,1,1,1,4,3,1,1,1,2.0
2,3,1,3,1,3,0,0,0,0,1.0
3,4,1,1,1,4,3,1,0,1,2.0
4,5,0,3,0,4,0,0,0,0,3.0


In [25]:
# check test data set again
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Cabin,Embarked,Family,Prefix
0,892,3,0,2,0,0,2,0,3.0
1,893,3,1,4,0,0,0,1,2.0
2,894,2,0,3,0,0,2,0,3.0
3,895,3,0,4,0,0,0,0,3.0
4,896,3,1,4,1,0,0,1,2.0


In [26]:
# Double check data to make sure there are no more nulls
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null int32
Fare           891 non-null int32
Cabin          891 non-null int64
Embarked       891 non-null int64
Family         891 non-null int32
Prefix         891 non-null float64
dtypes: float64(1), int32(3), int64(6)
memory usage: 59.2 KB


In [27]:
# assign training and test cases
x_train = train_data.drop(["PassengerId", "Survived"], axis = 1)
y_train = train_data["Survived"]
x_test  = test_data.drop("PassengerId", axis = 1).copy()

In [28]:
# Decision Tree with Entropy, max_depth = 14 gives the best training result
decision_tree = DecisionTreeClassifier(criterion = "entropy", max_depth = 14)
decision_tree.fit(x_train, y_train)  
y_pred = decision_tree.predict(x_test)

# Round the results by four decimal places
results = round(decision_tree.score(x_train, y_train) * 100, 4)

In [29]:
# Display Accuracy and Previous High Was 90.0112
print(results)

90.0112


In [30]:
# Add Survived column to test set
test_data["Survived"] = y_pred
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Cabin,Embarked,Family,Prefix,Survived
0,892,3,0,2,0,0,2,0,3.0,0
1,893,3,1,4,0,0,0,1,2.0,0
2,894,2,0,3,0,0,2,0,3.0,0
3,895,3,0,4,0,0,0,0,3.0,0
4,896,3,1,4,1,0,0,1,2.0,0


In [31]:
# Make the PassengerId the index column for the test set
test_data.set_index("PassengerId")

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Cabin,Embarked,Family,Prefix,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,3,0,2,0,0,2,0,3.0,0
893,3,1,4,0,0,0,1,2.0,0
894,2,0,3,0,0,2,0,3.0,0
895,3,0,4,0,0,0,0,3.0,0
896,3,1,4,1,0,0,1,2.0,0
897,3,0,4,0,0,0,0,3.0,0
898,3,1,5,0,0,2,0,1.0,1
899,2,0,0,2,0,0,1,3.0,0
900,3,1,3,0,0,1,0,2.0,1
901,3,0,1,2,0,0,1,3.0,0


In [32]:
# Write to csv with no indexes and only two header columns
test_data.to_csv("submission.csv", index = False, columns = ["PassengerId", "Survived"])