## Drzewo decyzyjne - RMS Titanic

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

### Pobieramy dane

In [2]:
url = "https://www.encyclopedia-titanica.org/titanic-passengers-and-crew/"
strona = requests.get(url).text

#### Szukamy tabel na stronie

In [3]:
soup = BeautifulSoup(strona,"html.parser")
table = soup.find('table')

#### Korzystając z silnika bs4 zapisujemy pierwszą tabelę

In [4]:
data = pd.read_html(str(table), flavor = 'bs4')[0]

In [5]:
print(data.shape)

(2456, 8)


In [6]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Ticket,Joined,Job,Boat [Body],Unnamed: 7
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27,3rd Class Passenger,2699£18 15s 9d,Cherbourg,,15,
1,"ABBING, Mr Anthony",42,3rd Class Passenger,5547£7 11s,Southampton,Blacksmith,,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39,3rd Class Passenger,CA2673£20 5s,Southampton,,A,
3,"ABBOTT, Mr Rossmore Edward",16,3rd Class Passenger,CA2673£20 5s,Southampton,Jeweller,[190],
4,"ABBOTT, Mr Eugene Joseph",13,3rd Class Passenger,CA2673£20 5s,Southampton,Scholar,,


#### Usuwamy nieważne dane

In [7]:
data = data[["Name","Age","Class/Dept","Boat [Body]"]]

In [8]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body]
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27,3rd Class Passenger,15
1,"ABBING, Mr Anthony",42,3rd Class Passenger,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39,3rd Class Passenger,A
3,"ABBOTT, Mr Rossmore Edward",16,3rd Class Passenger,[190]
4,"ABBOTT, Mr Eugene Joseph",13,3rd Class Passenger,


In [9]:
data["Boat [Body]"]= data["Boat [Body]"].fillna("")

In [10]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body]
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27,3rd Class Passenger,15
1,"ABBING, Mr Anthony",42,3rd Class Passenger,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39,3rd Class Passenger,A
3,"ABBOTT, Mr Rossmore Edward",16,3rd Class Passenger,[190]
4,"ABBOTT, Mr Eugene Joseph",13,3rd Class Passenger,


### Tworzymy kolumnę przetrwania

In [11]:
def przetrwanie(val):
    if val=="" or "[" in val:
        return 0
    else:
        return 1

In [12]:
data["Przetrwanie"] = data["Boat [Body]"].apply(przetrwanie)

#### Zastępujemy napis liczby liczbą

In [14]:
data["Age"] = data["Age"].apply(pd.to_numeric, errors = "coerce")

In [15]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Przetrwanie
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27.0,3rd Class Passenger,15,1
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,0
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,1
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],0
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,0


#### Tworzymy kolumnę klasa

In [19]:
def jaka_klasa(klasa):
    if "Passenger" in klasa:
        return klasa.split()[0][0]
    else:
        return "Załoga"

In [20]:
data["Klasa"] = data["Class/Dept"].apply(jaka_klasa)

In [21]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Przetrwanie,Klasa
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27.0,3rd Class Passenger,15,1,3
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,0,3
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,1,3
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],0,3
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,0,3


#### Tworzymy kolumnę dorosłości

In [22]:
def jaka_grupa(wiek):
    if wiek<18:
        return "Dziecko"
    else:
        return "Dorosły"

In [23]:
data["Dziecko/Dorosły"] = data["Age"].apply(jaka_grupa)

In [24]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Przetrwanie,Klasa,Dziecko/Dorosły
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27.0,3rd Class Passenger,15,1,3,Dorosły
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,0,3,Dorosły
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,1,3,Dorosły
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],0,3,Dziecko
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,0,3,Dziecko


#### Tworzymy kolumnę płci

In [25]:
def jaka_plec(imie):
    po_przecinku = imie[imie.index(",")+2:].split(" ")
    forma = po_przecinku[0]
    if forma in ["Mr","Master", "Sig.", "Sr."]:
        return "Mężczyzna"
    else: 
        return "Kobieta"

In [26]:
data["Płeć"] = data["Name"].apply(jaka_plec)

In [27]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Przetrwanie,Klasa,Dziecko/Dorosły,Płeć
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27.0,3rd Class Passenger,15,1,3,Dorosły,Mężczyzna
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,0,3,Dorosły,Mężczyzna
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,1,3,Dorosły,Kobieta
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],0,3,Dziecko,Mężczyzna
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,0,3,Dziecko,Mężczyzna


#### Garść informacji o zbiorze

In [28]:
data.groupby(["Płeć"])["Name"].count()

Płeć
Kobieta       533
Mężczyzna    1923
Name: Name, dtype: int64

In [29]:
data.groupby(["Płeć"])["Przetrwanie"].sum()

Płeć
Kobieta      341
Mężczyzna    294
Name: Przetrwanie, dtype: int64

In [30]:
def porownanie_przezywalnosci(grupa):
    return data.groupby([grupa])["Przetrwanie"].sum()/data.groupby([grupa])["Przetrwanie"].count()

In [31]:
porownanie_przezywalnosci("Płeć")

Płeć
Kobieta      0.639775
Mężczyzna    0.152886
Name: Przetrwanie, dtype: float64

In [32]:
porownanie_przezywalnosci("Klasa")

Klasa
1         0.574286
2         0.378840
3         0.242595
Załoga    0.136775
Name: Przetrwanie, dtype: float64

In [33]:
porownanie_przezywalnosci("Dziecko/Dorosły")

Dziecko/Dorosły
Dorosły    0.248343
Dziecko    0.378238
Name: Przetrwanie, dtype: float64

In [34]:
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Przetrwanie,Klasa,Dziecko/Dorosły,Płeć
0,"ABī-AL-MUNà, Mr Nāsīf Qāsim",27.0,3rd Class Passenger,15,1,3,Dorosły,Mężczyzna
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,0,3,Dorosły,Mężczyzna
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,1,3,Dorosły,Kobieta
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],0,3,Dziecko,Mężczyzna
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,0,3,Dziecko,Mężczyzna


In [35]:
train = data[["Płeć","Klasa","Age","Dziecko/Dorosły","Przetrwanie"]]

In [36]:
train.head()

Unnamed: 0,Płeć,Klasa,Age,Dziecko/Dorosły,Przetrwanie
0,Mężczyzna,3,27.0,Dorosły,1
1,Mężczyzna,3,42.0,Dorosły,0
2,Kobieta,3,39.0,Dorosły,1
3,Mężczyzna,3,16.0,Dziecko,0
4,Mężczyzna,3,13.0,Dziecko,0


#### Zamiana wartości kategorycznych na liczby

In [37]:
def toNum(val):
    val = val.astype('category')
    return val.cat.codes
train2 = train[["Klasa","Dziecko/Dorosły","Płeć"]].apply(toNum)
train[["Klasa","Dziecko/Dorosły","Płeć"]] = train2
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Unnamed: 0,Płeć,Klasa,Age,Dziecko/Dorosły,Przetrwanie
0,1,2,27.0,0,1
1,1,2,42.0,0,0
2,0,2,39.0,0,1
3,1,2,16.0,1,0
4,1,2,13.0,1,0


In [38]:
train.isnull().sum()

Płeć                0
Klasa               0
Age                40
Dziecko/Dorosły     0
Przetrwanie         0
dtype: int64

In [39]:
len(train)

2456

In [40]:
train = train.dropna()

In [41]:
len(train)

2416

In [42]:
train.isnull().sum()

Płeć               0
Klasa              0
Age                0
Dziecko/Dorosły    0
Przetrwanie        0
dtype: int64

In [43]:
train.head()

Unnamed: 0,Płeć,Klasa,Age,Dziecko/Dorosły,Przetrwanie
0,1,2,27.0,0,1
1,1,2,42.0,0,0
2,0,2,39.0,0,1
3,1,2,16.0,1,0
4,1,2,13.0,1,0


In [44]:
def corrplot(df):
    return df.corr()
corrplot(train)

Unnamed: 0,Płeć,Klasa,Age,Dziecko/Dorosły,Przetrwanie
Płeć,1.0,0.427418,0.033126,-0.146984,-0.468146
Klasa,0.427418,1.0,-0.12486,-0.080689,-0.352164
Age,0.033126,-0.12486,1.0,-0.506012,-0.05535
Dziecko/Dorosły,-0.146984,-0.080689,-0.506012,1.0,0.080117
Przetrwanie,-0.468146,-0.352164,-0.05535,0.080117,1.0


## Nauka modelu

In [48]:
from sklearn.model_selection import train_test_split

### Dzielimy zbiór na treningowy oraz testownik

In [49]:
train,test = train_test_split(train, test_size = 0.2)

In [50]:
train.head()

Unnamed: 0,Płeć,Klasa,Age,Dziecko/Dorosły,Przetrwanie
2393,0,1,30.0,0,1
984,1,3,28.0,0,1
115,1,1,18.0,0,0
2047,1,0,51.0,0,0
2150,1,3,20.0,0,0


In [51]:
len(train)+len(test)

2416

## Importujemy i tworzymy [...]

In [52]:
from sklearn.tree import DecisionTreeClassifier

In [61]:
drzewko = DecisionTreeClassifier(max_leaf_nodes=10)

In [62]:
drzewko

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [63]:
drzewko = drzewko.fit(train[["Klasa","Age","Dziecko/Dorosły","Płeć"]], train["Przetrwanie"])

In [64]:
dict(zip(["Klasa","Age","Dziecko/Dorosły","Płeć"],list(drzewko.feature_importances_)))

{'Age': 0.088324038125327783,
 'Dziecko/Dorosły': 0.0,
 'Klasa': 0.2003748631509486,
 'Płeć': 0.71130109872372371}

### Tworzymy graf drzewa decyzyjnego

In [68]:
from sklearn import tree
with open("titanic.dot", "w") as f:
    f = tree.export_graphviz(drzewko, feature_names = ["Klasa","Age","Dziecko/Dorosły","Płeć"], out_file = f)

In [65]:
predictions = drzewko.predict(test[["Klasa","Age","Dziecko/Dorosły","Płeć"]])

In [66]:
from sklearn.metrics import accuracy_score

### Sprawdzamy dokładność predykcji

In [67]:
accuracy_score(test["Przetrwanie"], predictions)

0.8223140495867769

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [None]:
rf

In [None]:
def jaka_skutecznosc(rf):
    rf = rf.fit(train[["Klasa","Age","Dziecko/Dorosły","Płeć"]], train["Przetrwanie"])
    predictions = rf.predict(test[["Klasa","Age","Dziecko/Dorosły","Płeć"]])
    return accuracy_score(test["Przetrwanie"],predictions)

In [None]:
jaka_skutecznosc(rf)

# The Koniec