In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv"

--2020-11-18 22:37:08--  https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46183 (45K) [application/x-httpd-php]
Saving to: ‘hcvdat0.csv.1’


2020-11-18 22:37:08 (348 KB/s) - ‘hcvdat0.csv.1’ saved [46183/46183]



In [None]:
# Importo pandas para lectura
import pandas as pd

# Importo el encoder
from sklearn.preprocessing import LabelEncoder

# Importo el divisor de training y tests
from sklearn.model_selection import train_test_split

# Lectua del archivo
data = pd.read_csv("hcvdat0.csv")

# Eliminacion del indice
data.pop('Unnamed: 0')

print(f'Cantidad de tuplas: {data.shape[0]}')
print(f'Cantidad de columnas: {data.shape[1]}')
print(f'Cantidad de NaN\s: {data.isnull().sum().sum()}')
print(f'\nNaN\s por columna:\n{data.isnull().sum()}')

data.head()


Cantidad de tuplas: 615
Cantidad de columnas: 13
Cantidad de NaN\s: 31

NaN\s por columna:
Category     0
Age          0
Sex          0
ALB          1
ALP         18
ALT          1
AST          0
BIL          0
CHE          0
CHOL        10
CREA         0
GGT          0
PROT         1
dtype: int64


Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [None]:
# Pre-procesamiento sobre columna Sex para discretizar variable de tipo String
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])

# Imputacion por hot-deck sobre las columnas 
# data.fillna(method='ffill', inplace=True)

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2, weights="uniform")

missing_cols = ['ALB', 'ALP', 'ALT', 'CHOL', 'PROT']

for col in missing_cols:
  data[col] = imputer.fit_transform(data[[col]])

data.head()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,32,1,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,32,1,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0=Blood Donor,32,1,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0=Blood Donor,32,1,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,32,1,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [96]:
# Importo numpy 
import numpy as np

# Importo las metricas para testear el modelo
from sklearn import metrics

# Defino target
target = data.pop('Category')
target_name = target.name

# Defino features
features = data
features_names = data.columns

# Divido el dataset 80-20 como 
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=0, test_size=0.2)

from sklearn import tree
# Instancio el arbol por entropia
t = tree.DecisionTreeClassifier(criterion='entropy')

# Lo entreno
t.fit(features_train, target_train)

# Hago la prediccion
prediccion = t.predict(features_test)

# Convierto a array para recorrer
target_test = np.asarray(target_test)

# Muestro prediccion vs original
header = '{:<4}{:<2}{:<25}{:<2}{:<25}{}'.format('#', '|', 'Predicción', '|', 'Original','|')
print(header)
print('='*58, end='|')

sep = '|'

for i in range(10):
    print(f"\n{str(i).ljust(4, ' ')}{sep.ljust(2, ' ')}{prediccion[i].ljust(25, ' ')}{sep.ljust(2, ' ')}{target_test[i].ljust(25, ' ')}{sep}\n", end='-'*58+'|')


# Veo qué tan acertado estuvo
print(f'\n\nPrecisión del modelo: {metrics.accuracy_score(target_test, prediccion)}', end='\n\n')

# Muestro un reporte de clasificación con diferentes métricas sobre cada feature 
print(f'Reporte de clasificación: \n{metrics.classification_report(target_test, prediccion, target_names=["Donante", "Sospechoso donante", "Hepatitis", "Fibrosis", "Cirrosis"])}')

#   | Predicción               | Original                 |
0   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
1   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
2   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
3   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
4   | 2=Fibrosis               | 0=Blood Donor            |
----------------------------------------------------------|
5   | 3=Cirrhosis              | 1=Hepatitis              |
----------------------------------------------------------|
6   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
7   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from io import StringIO 
import pydotplus 
from google.colab import files 
import graphviz

# Graficamos el arbol
tree_export = tree.export_graphviz(t, out_file=None, feature_names=features_names, class_names=["Donante", "Sospechoso donante", "Hepatitis", "Fibrosis", "Cirrosis"], filled=True, rounded=True)

graph = graphviz.Source(tree_export)
graph

dot_data = StringIO() 

tree.export_graphviz(t, out_file=dot_data, feature_names=features_names, class_names=["Donante", "Sospechoso donante", "Hepatitis", "Fibrosis", "Cirrosis"], filled=True, rounded=True, special_characters=True) 
# Con el string del dot lo paso a un gráfico 

graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 

# Genero png y lo descargo 
graph.write_png('hcv_model.png') 
files.download('hcv_model.png') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [97]:
from sklearn import tree
# Instancio el arbol por entropia
t2 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)

# Lo entreno
t2.fit(features_train, target_train)

# Hago la prediccion
prediccion2 = t2.predict(features_test)

# Convierto a array para recorrer
target_test = np.asarray(target_test)

# Muestro prediccion vs original
header = '{:<4}{:<2}{:<25}{:<2}{:<25}{}'.format('#', '|', 'Predicción', '|', 'Original','|')
print(header)
print('='*58, end='|')

sep = '|'

for i in range(10):
    print(f"\n{str(i).ljust(4, ' ')}{sep.ljust(2, ' ')}{prediccion2[i].ljust(25, ' ')}{sep.ljust(2, ' ')}{target_test[i].ljust(25, ' ')}{sep}\n", end='-'*58+'|')

# Veo qué tan acertado estuvo
print(f'\n\nPrecisión del modelo: {metrics.accuracy_score(target_test, prediccion2)}', end='\n\n')

# Muestro un reporte de clasificación con diferentes métricas sobre cada feature 
print(f'Reporte de clasificación: \n{metrics.classification_report(target_test, prediccion2, target_names=["Donante", "Sospechoso donante", "Hepatitis", "Fibrosis", "Cirrosis"])}')

#   | Predicción               | Original                 |
0   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
1   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
2   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
3   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
4   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
5   | 3=Cirrhosis              | 1=Hepatitis              |
----------------------------------------------------------|
6   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------------------------|
7   | 0=Blood Donor            | 0=Blood Donor            |
----------------------------------------

  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
from io import StringIO 
import pydotplus 
from google.colab import files 
import graphviz

# Graficamos el arbol
tree_export = tree.export_graphviz(t2, out_file=None, feature_names=features_names, class_names=["Donante", "Sospechoso donante", "Hepatitis", "Fibrosis", "Cirrosis"], filled=True, rounded=True)

graph = graphviz.Source(tree_export)
graph

dot_data = StringIO() 

tree.export_graphviz(t2, out_file=dot_data, feature_names=features_names, class_names=["Donante", "Sospechoso donante", "Hepatitis", "Fibrosis", "Cirrosis"], filled=True, rounded=True, special_characters=True) 
# Con el string del dot lo paso a un gráfico 

graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 

# Genero png y lo descargo 
graph.write_png('poda_hcv_model.png') 
files.download('poda_hcv_model.png') 

<pydotplus.graphviz.Dot at 0x7fedad85e3c8>