## Wprowadzenie
Skrypt pokazuje, jak użyć pakietu scikit-learn do klasyfikacji danych. Rozważane są dwa przykłady: zestaw danych IRIS oraz zestaw danych TITANIC (do ściągnięcia z https://www.kaggle.com/c/titanic; dokładniej: potrzebny jest plik https://www.kaggle.com/c/titanic/download/train.csv).

In [2]:
% matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import tree

from L5.validation import test_training_indices, cross_validation

## 1. Pierwszy zestaw danych
Dane IRIS

In [54]:
# Load the Iris data set that ships with scikit-learn.
from sklearn import datasets
iris = datasets.load_iris()

# One column per measurement, plus a categorical 'species' column decoded
# from the integer targets.
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
data['species'] = pd.Categorical.from_codes(codes=iris.target,
                                            categories=iris.target_names)
data.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [55]:
# Separate the flower measurements (X) from the class label (y).
X = data.drop(['species'], axis='columns')
y = data['species']

# Build the classification tree on the complete data set.
t = tree.DecisionTreeClassifier().fit(X, y)

In [56]:
# Save the classification tree to a .dot file.
# The file can be converted to .pdf with graphviz:
#   dot -Tpdf iris.dot -o iris.pdf

with open("L5/trees/iris.dot", "w") as f:
    # Pass a plain list: older scikit-learn releases document feature_names as a list of str.
    tree.export_graphviz(t, out_file=f, feature_names=list(X.columns))

In [57]:
# Score the classifier on the training data.
t.score(X, y)

1.0

In [6]:
# It is fairer to score the classifier on data that was not used to build it,
# so the data set is split into a training part and a test part.

# A dedicated generator makes the split reproducible without touching
# NumPy's global random state.
rng = np.random.RandomState(0)
data['train'] = rng.uniform(0, 1, len(data))

data_train = data[data['train'] <= 0.65]
data_test = data[data['train'] > 0.65]

# 'train' is only the random split key and must not reach the model as a
# feature, so it is dropped together with the label.
non_feature_columns = ['species', 'train']

X_train = data_train.drop(non_feature_columns, axis=1)
y_train = data_train['species']

t = tree.DecisionTreeClassifier()
t = t.fit(X_train, y_train)

# Score on the training part.
print(t.score(X_train, y_train))

X_test = data_test.drop(non_feature_columns, axis=1)
y_test = data_test['species']

# Score on the test part.
print(t.score(X_test, y_test))

1.0
0.976744186047


## 2. Drugi zestaw danych
Dane TITANIC (do ściągnięcia z https://www.kaggle.com/c/titanic; dokładniej: potrzebny jest plik https://www.kaggle.com/c/titanic/download/train.csv, który notebook wczytuje ze ścieżki `L5/data/titanic.csv`).

In [21]:
# Read the Titanic data set from the CSV file and preview the first rows.
data = pd.read_csv(filepath_or_buffer="L5/data/titanic.csv")
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
# Drop the attributes treated as irrelevant for classification, then drop
# every row that still has a missing value.
data = (
    data
    .drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns')
    .dropna()
)
data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [23]:
# Replace the nominal attributes with their integer category codes.
for nominal_column in ('Sex', 'Embarked'):
    data[nominal_column] = pd.Categorical(data[nominal_column]).codes
data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [24]:
# Separate the passenger attributes (titanic_data) from the class label
# (titanic_labels).
titanic_data = data.drop(['Survived'], axis='columns')
titanic_labels = data['Survived']

# Build the classification tree on the complete data set.
decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(titanic_data, titanic_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [26]:
# Save the classification tree to a .dot file.
# The file can be converted to .pdf with graphviz:
#   dot -Tpdf titanic.dot -o titanic.pdf

with open("L5/trees/titanic.dot", "w") as f:
    # Pass a plain list: older scikit-learn releases document feature_names as a list of str.
    tree.export_graphviz(decision_tree, out_file=f, feature_names=list(titanic_data.columns))

In [27]:
# Score the classifier on the training data.
decision_tree.score(titanic_data, titanic_labels)

0.9859550561797753

In [28]:
# It is fairer to score the classifier on data that was not used to build it.
# The data set should therefore be split into two parts: training data and
# test data (exercise list 5, task 2b).

In [29]:
# Number of labelled rows available for training and testing.
print(titanic_labels.shape[0])

712


In [30]:
# Fit on the rows selected by training_indices and score on the rows selected
# by test_indices. data_size is taken from the data instead of being hard-coded,
# so the cell stays correct if preprocessing keeps a different number of rows.
training_indices, test_indices = test_training_indices(data_size=len(titanic_labels), train_size=476)
decision_tree = tree.DecisionTreeClassifier()
# NOTE(review): plain [] indexing selects rows only if the helper returns
# boolean masks - confirm against L5.validation.
decision_tree.fit(titanic_data[training_indices], titanic_labels[training_indices])
decision_tree.score(titanic_data[test_indices], titanic_labels[test_indices])

0.76271186440677963

In [31]:
# Hand a fresh tree with default settings to the project's cross_validation
# helper and print its result.
decision_tree = tree.DecisionTreeClassifier()
cross_validation_results = cross_validation(titanic_data, titanic_labels, decision_tree)
print(cross_validation_results)

[0.6901408450704225, 0.76056338028169013, 0.73239436619718312, 0.78873239436619713, 0.79166666666666663, 0.83098591549295775, 0.77464788732394363, 0.71830985915492962, 0.80281690140845074, 0.84722222222222221]


In [32]:
# Fit a tree with the entropy criterion on the complete Titanic data, write it
# to a .dot file and score it on the same data it was fitted on.
decision_tree = tree.DecisionTreeClassifier(criterion="entropy")
decision_tree.fit(titanic_data, titanic_labels)
with open("L5/trees/titanic_entropy.dot", "w") as f:
    # Pass a plain list: older scikit-learn releases document feature_names as a list of str.
    tree.export_graphviz(decision_tree, out_file=f, feature_names=list(titanic_data.columns))
decision_tree.score(titanic_data, titanic_labels)

0.9859550561797753

In [33]:
# Fit an entropy-criterion tree on the rows selected by training_indices and
# score it on the rows selected by test_indices. data_size is taken from the
# data instead of being hard-coded.
training_indices, test_indices = test_training_indices(data_size=len(titanic_labels), train_size=476)
decision_tree = tree.DecisionTreeClassifier(criterion="entropy")
# NOTE(review): plain [] indexing selects rows only if the helper returns
# boolean masks - confirm against L5.validation.
decision_tree.fit(titanic_data[training_indices], titanic_labels[training_indices])
decision_tree.score(titanic_data[test_indices], titanic_labels[test_indices])

0.75847457627118642

In [34]:
# Hand a fresh entropy-criterion tree to the project's cross_validation helper
# and print its result.
decision_tree = tree.DecisionTreeClassifier(criterion="entropy")
cross_validation_results = cross_validation(titanic_data, titanic_labels, decision_tree)
print(cross_validation_results)

[0.70422535211267601, 0.78873239436619713, 0.71830985915492962, 0.78873239436619713, 0.83333333333333337, 0.77464788732394363, 0.80281690140845074, 0.6901408450704225, 0.74647887323943662, 0.80555555555555558]


In [39]:
# Fit a tree limited to max_depth=4 on the complete Titanic data, write it to a
# .dot file and score it on the same data it was fitted on.
decision_tree = tree.DecisionTreeClassifier(max_depth=4)
decision_tree.fit(titanic_data, titanic_labels)
with open("L5/trees/titanic_depth_4.dot", "w") as f:
    # Pass a plain list: older scikit-learn releases document feature_names as a list of str.
    tree.export_graphviz(decision_tree, out_file=f, feature_names=list(titanic_data.columns))
decision_tree.score(titanic_data, titanic_labels)

0.8300561797752809

In [40]:
# Fit a max_depth=4 tree on the rows selected by training_indices and score it
# on the rows selected by test_indices. data_size is taken from the data
# instead of being hard-coded.
training_indices, test_indices = test_training_indices(data_size=len(titanic_labels), train_size=476)
decision_tree = tree.DecisionTreeClassifier(max_depth=4)
# NOTE(review): plain [] indexing selects rows only if the helper returns
# boolean masks - confirm against L5.validation.
decision_tree.fit(titanic_data[training_indices], titanic_labels[training_indices])
decision_tree.score(titanic_data[test_indices], titanic_labels[test_indices])

0.8347457627118644

In [41]:
# Hand a fresh max_depth=4 tree to the project's cross_validation helper and
# print its result.
decision_tree = tree.DecisionTreeClassifier(max_depth=4)
cross_validation_results = cross_validation(titanic_data, titanic_labels, decision_tree)
print(cross_validation_results)

[0.80281690140845074, 0.84507042253521125, 0.76056338028169013, 0.74647887323943662, 0.86111111111111116, 0.77464788732394363, 0.76056338028169013, 0.78873239436619713, 0.85915492957746475, 0.79166666666666663]


In [46]:
# Fit a tree with min_samples_leaf=20 on the complete Titanic data, write it to
# a .dot file and score it on the same data it was fitted on.
decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=20)
decision_tree.fit(titanic_data, titanic_labels)
with open("L5/trees/titanic_prunned.dot", "w") as f:
    # Pass a plain list: older scikit-learn releases document feature_names as a list of str.
    tree.export_graphviz(decision_tree, out_file=f, feature_names=list(titanic_data.columns))
decision_tree.score(titanic_data, titanic_labels)

0.8286516853932584

In [47]:
# Fit a min_samples_leaf=20 tree on the rows selected by training_indices and
# score it on the rows selected by test_indices. data_size is taken from the
# data instead of being hard-coded.
training_indices, test_indices = test_training_indices(data_size=len(titanic_labels), train_size=476)
decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=20)
# NOTE(review): plain [] indexing selects rows only if the helper returns
# boolean masks - confirm against L5.validation.
decision_tree.fit(titanic_data[training_indices], titanic_labels[training_indices])
decision_tree.score(titanic_data[test_indices], titanic_labels[test_indices])

0.77542372881355937

In [50]:
# Hand a fresh min_samples_leaf=20 tree to the project's cross_validation
# helper and print its result.
decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=20)
cross_validation_results = cross_validation(titanic_data, titanic_labels, decision_tree)
print(cross_validation_results)

[0.73239436619718312, 0.80281690140845074, 0.76056338028169013, 0.77464788732394363, 0.86111111111111116, 0.81690140845070425, 0.76056338028169013, 0.80281690140845074, 0.83098591549295775, 0.77777777777777779]


In [48]:
# Fit a tree with max_leaf_nodes=20 on the complete Titanic data, write it to a
# .dot file and score it on the same data it was fitted on.
decision_tree = tree.DecisionTreeClassifier(max_leaf_nodes=20)
decision_tree.fit(titanic_data, titanic_labels)
with open("L5/trees/titanic_prunned_2.dot", "w") as f:
    # Pass a plain list: older scikit-learn releases document feature_names as a list of str.
    tree.export_graphviz(decision_tree, out_file=f, feature_names=list(titanic_data.columns))
decision_tree.score(titanic_data, titanic_labels)

0.8665730337078652

In [49]:
# Fit a max_leaf_nodes=20 tree on the rows selected by training_indices and
# score it on the rows selected by test_indices. data_size is taken from the
# data instead of being hard-coded.
training_indices, test_indices = test_training_indices(data_size=len(titanic_labels), train_size=476)
decision_tree = tree.DecisionTreeClassifier(max_leaf_nodes=20)
# NOTE(review): plain [] indexing selects rows only if the helper returns
# boolean masks - confirm against L5.validation.
decision_tree.fit(titanic_data[training_indices], titanic_labels[training_indices])
decision_tree.score(titanic_data[test_indices], titanic_labels[test_indices])

0.81779661016949157

In [51]:
# Hand a fresh max_leaf_nodes=20 tree to the project's cross_validation helper
# and print its result.
decision_tree = tree.DecisionTreeClassifier(max_leaf_nodes=20)
cross_validation_results = cross_validation(titanic_data, titanic_labels, decision_tree)
print(cross_validation_results)

[0.70422535211267601, 0.81690140845070425, 0.76056338028169013, 0.81690140845070425, 0.88888888888888884, 0.80281690140845074, 0.81690140845070425, 0.78873239436619713, 0.88732394366197187, 0.83333333333333337]
