In [16]:
from bs4 import BeautifulSoup

In [17]:
titanic_data = "data/titanicdata.htm"  # If using this file, while opening use Latin-1 encoding 

with open(titanic_data, 'r', encoding="Latin-1") as file:
    print(file)
    soup = BeautifulSoup(file, "html.parser")   


<_io.TextIOWrapper name='data/titanicdata.htm' mode='r' encoding='Latin-1'>


##### PROBLEM:

We mostly can't get the encoding format of a file and we just guess it. Infact tools such as Notepad++ just guesses it as well and not guarantees.

Latin-1 works; it'll work for any file regardless of what the actual encoding of the file is. 
That's because all 256 possible byte values in a file have a Latin-1 codepoint to map to, but that doesn't 
mean you get legible results! If you don't know the encoding, even opening the file in binary mode instead 
might be better than assuming Latin-1


<b> Option for handling the encoding issue  </b>

Convert the file to utf8 encoding by opening the file in Notepad and saving the encoding as UTF8. So now all the codepoints are converted to code points of utf8.

If you dont do this and open the file with encoding="utf8", it gave an error saying it :
""<i>UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 619286: invalid continuation byte""


In [18]:
#titanic_data = "data/titanicdata_utf8.html"

#with open(titanic_data, 'r', encoding="utf8") as file:
#    print(file)
#    soup = BeautifulSoup(file, "html.parser") 


In [19]:
table = soup.find('table')
print(table)

<table class="display" id="manifest" width="100%">
<thead>
<tr>
<th>Name</th>
<th>Age</th>
<th>Class/Dept</th>
<th>Ticket</th>
<th>Joined</th>
<th>Job</th>
<th>Boat [Body]</th>
<th></th>
</tr>
</thead>
<tbody>
<tr id="infinite_scroll">
<td>
<span style="font-style:italic;"><strong><a href="/titanic-survivor/nassef-cassem-albimona.html" itemprop="url"><span class="fn" itemprop="name"><span itemprop="familyName">ABÄ«-AL-MUNÃ </span>, <span itemprop="honorificPrefix">Mr</span> <span itemprop="givenName">NÄsÄ«f QÄsim</span></span></a></strong></span>
</td>
<td>
<a href="/titanic-ages/27.html">27</a></td>
<td><a href="/titanic-third-class-passengers/"><span>3rd Class Passenger</span></a></td>
<td>2699<br/>£18 15s 9d</td>
<td><a href="/titanic-places/cherbourg.html">Cherbourg</a></td>
<td>
 </td>
<td><a href="/titanic-lifeboat-15/"><strong>15</strong></a> </td>
<td><div class="profile_pic"></div></td>
</tr>
<tr id="infinite_scroll">
<td>
<span><strong><a href="/titanic-victim/anthony-abbin

In [20]:
import pandas as pd
data = pd.read_html(str(table), flavor='bs4')[0]  # Only one table in the page.

In [21]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Ticket,Joined,Job,Boat [Body],Unnamed: 7
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27,3rd Class Passenger,2699£18 15s 9d,Cherbourg,,15,
1,"ABBING, Mr Anthony",42,3rd Class Passenger,5547£7 11s,Southampton,Blacksmith,,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39,3rd Class Passenger,CA2673£20 5s,Southampton,,A,
3,"ABBOTT, Mr Rossmore Edward",16,3rd Class Passenger,CA2673£20 5s,Southampton,Jeweller,[190],
4,"ABBOTT, Mr Eugene Joseph",13,3rd Class Passenger,CA2673£20 5s,Southampton,Scholar,,


In [22]:
def cleanup(value):
    return value.encode('ascii', errors='replace').replace("?"," ")

#data['Name']= data['Name'].apply(cleanup)
#data['Boat [Body]']= data['Boat [Body]'].apply(cleanup)
data['Age'] = data['Age'].apply(pd.to_numeric, errors='coerce')
data = data[["Name","Age","Class/Dept","Boat [Body]"]]

data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body]
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190]
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,


In [23]:
def checkPass(class_type):
    if "Passenger" in class_type:
        return "Passenger"
    else:
        return "Crew"
    
data["Crew/Pass"]=data["Class/Dept"].apply(checkPass)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger


In [24]:
def checkClass(class_type):
    if "Passenger" in class_type:
        return class_type.split(" ")[0]
    else:
        return "Crew"
    
data["Class"]=data["Class/Dept"].apply(checkClass)
data.head() 

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd


In [25]:
def checkAdult(age):
    if age>=18:
        return "Adult"
    else:
        return "Child"
    
data["Adult/Child"]=data["Age"].apply(checkAdult)
data.head() 

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Adult/Child
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child


In [26]:
def checkGender(name):
    firstname = name[name.index(",")+2:]
    salutation = firstname.split(" ")[0]
    if salutation in ["Mr","Master"]:
        return "Male"
    else:
        return "Female"

In [27]:
data["Gender"]=data["Name"].apply(checkGender)
data.head() 

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Adult/Child,Gender
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult,Male
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult,Male
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult,Female
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child,Male
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child,Male


In [28]:
def checkSurvival(boat):
    if str(boat).strip()=="" or "[" in str(boat):
        return 0
    else:
        return 1

data["Survival"]=data["Boat [Body]"].apply(checkSurvival)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Adult/Child,Gender,Survival
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult,Male,1
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult,Male,1
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult,Female,1
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child,Male,0
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child,Male,1


In [29]:
data.groupby(['Crew/Pass'])['Survival'].sum()*100/data.groupby(['Crew/Pass'])['Survival'].count()

Crew/Pass
Crew         90.217391
Passenger    90.310651
Name: Survival, dtype: float64

In [30]:
def compare(group,data):
    return data.groupby([group])['Survival'].sum()*100/data.groupby([group])['Survival'].count()

compare("Class",data)

Class
1st     89.714286
2nd     88.395904
3rd     91.396333
Crew    90.217391
Name: Survival, dtype: float64

In [31]:
compare("Gender",data)

Gender
Female    95.840555
Male      88.557743
Name: Survival, dtype: float64

In [32]:
compare("Adult/Child",data)

Adult/Child
Adult    89.699955
Child    95.964126
Name: Survival, dtype: float64

In [33]:
trainingData=data[["Age","Crew/Pass","Class","Adult/Child","Gender","Survival"]]
trainingData.head()

Unnamed: 0,Age,Crew/Pass,Class,Adult/Child,Gender,Survival
0,27.0,Passenger,3rd,Adult,Male,1
1,42.0,Passenger,3rd,Adult,Male,1
2,39.0,Passenger,3rd,Adult,Female,1
3,16.0,Passenger,3rd,Child,Male,0
4,13.0,Passenger,3rd,Child,Male,1


In [34]:
def catToNum(series):
    series = series.astype('category')
    return series.cat.codes

catData=trainingData[["Crew/Pass","Class","Adult/Child","Gender"]].apply(catToNum)
trainingData[["Crew/Pass","Class","Adult/Child","Gender"]]=catData
trainingData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Unnamed: 0,Age,Crew/Pass,Class,Adult/Child,Gender,Survival
0,27.0,1,2,0,1,1
1,42.0,1,2,0,1,1
2,39.0,1,2,0,0,1
3,16.0,1,2,1,1,0
4,13.0,1,2,1,1,1


In [35]:
len(trainingData)

2456

In [36]:
trainingData = trainingData.dropna()
len(trainingData)

2426

In [37]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(trainingData, test_size = 0.2)

In [38]:
train.head()

Unnamed: 0,Age,Crew/Pass,Class,Adult/Child,Gender,Survival
325,18.0,0,3,0,1,1
2338,26.0,0,3,0,1,1
1778,20.0,0,3,0,0,1
1571,27.0,0,3,0,1,1
1972,14.0,1,2,1,0,1


In [44]:
from sklearn.tree import DecisionTreeClassifier
clf=DecisionTreeClassifier(max_leaf_nodes=25)
clf=clf.fit(train[["Age","Crew/Pass","Class","Adult/Child","Gender"]],train["Survival"])

In [40]:
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=25,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [41]:
clf.feature_importances_

array([ 0.72782909,  0.0285952 ,  0.13032494,  0.        ,  0.11325077])

In [46]:
from sklearn.externals.six import StringIO  
import pydotplus 
from sklearn import tree

dotfile = StringIO() 
tree.export_graphviz(clf, feature_names=["Age","Crew/Pass","Class","Adult/Child","Gender"], out_file=dotfile) 
pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png("my_tree.png")

True

In [47]:
predictions = clf.predict(test[["Age","Crew/Pass","Class","Adult/Child","Gender"]])

In [48]:
from sklearn.metrics import accuracy_score

accuracy_score(test["Survival"], predictions)

0.89094650205761317

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=15)

In [None]:
clf

In [None]:
def checkAccuracy(clf):
    clf=clf.fit(train[["Age","Crew/Pass","Class","Adult/Child","Gender"]],train["Survival"])
    predictions = clf.predict(test[["Age","Crew/Pass","Class","Adult/Child","Gender"]])
    return accuracy_score(test["Survival"], predictions)

In [None]:
checkAccuracy(clf)

## Gradient Boosted Trees

##### Steps to install xgboost package is given below:
    
[Download](https://www.lfd.uci.edu/~gohlke/pythonlibs/) the whl file for the python version you have and run the following command:

"<i>pip install xgboost-0.6+20171121-cp36-cp36m-win_amd64.whl</i>"

In [49]:
from xgboost.sklearn import XGBClassifier 

In [50]:
clf = XGBClassifier(n_estimators=1000)

In [51]:
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [52]:
checkAccuracy(clf)

0.89506172839506171

## Hyper-parameter Tuning

In [53]:
from hyperopt import fmin, tpe, hp, STATUS_OK,Trials

In [54]:
space ={
    'n_estimators':hp.quniform('n_estimators',100,1000,1),
    'learning_rate':hp.quniform('learning_rate',0.025,0.5,0.025),
    'max_depth':hp.quniform('max_depth',1,13,1),
    'min_child_weight': hp.quniform('min_child_weight',1,6,1),
    'subsample': hp.quniform('subsample',0.5,1,0.05),
    'gamma':hp.quniform('gamma',0.5,1,0.05),
    'colsample_bytree':hp.quniform('colsample_bytree',0.5,1,0.05),
    'nthread':6,
    'silent':1
}

In [58]:
def score(params):
    params['n_estimators']=int(params['n_estimators'])
    params['max_depth']=int(params['max_depth'])
    clf=XGBClassifier(**params)
    return {'loss':1-checkAccuracy(clf),'status':STATUS_OK}

In [56]:
trials=Trials()

In [59]:
best=fmin(score,space,algo=tpe.suggest,trials=trials,max_evals=250)

In [61]:
print(best)

{'colsample_bytree': 0.75, 'gamma': 0.55, 'learning_rate': 0.1, 'max_depth': 3.0, 'min_child_weight': 4.0, 'n_estimators': 689.0, 'subsample': 0.7000000000000001}


In [62]:
1-score(best)['loss']

0.9032921810699589