In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

import warnings
# Install a global 'ignore' filter: from here on no warning of any category is
# shown for the rest of the session.
# NOTE(review): this also hides warnings raised by the modelling cells below;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read the header-less data file and label its 17 columns explicitly:
# the first one is 'class', the remaining 16 are the individual votes.
df = pd.read_csv(
    'house-votes-84.data',
    header=None,
    names=[
        'class',
        'h-infants',
        'wproject-cost-sharing',
        'adoption-of-the-budget-resolution',
        'physician-fee-freeze',
        'el-salvador-aid',
        'religious-in-schools',
        'anti-satellite',
        'aid-to-nicaraguan',
        'mx-missile',
        'immigration',
        'synfuels-cutback',
        'education-spending',
        'superfund-right-to-sue',
        'crime',
        'duty-free',
        'export-south-africa',
    ],
)
# Preview the first three rows.
df.head(3)

In [None]:
# Recode the answers: 'n' -> 0, 'y' -> 1 and '?' -> NaN.
for vote_label, vote_value in (('n', 0), ('y', 1)):
    # Boolean-frame assignment overwrites every matching cell in place.
    df[df == vote_label] = vote_value
# Keep only the cells that differ from '?'; the others become NaN.
df = df.where(df != '?')

In [None]:
# The author's note: the decision tree does not work with NaN, so every
# missing entry is replaced by the most frequent value of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# The imputer hands back a bare array. Re-attach the original column labels
# and index so the frame keeps its meaningful names instead of 0..16.
df = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Show the first three rows of the imputed frame.
df.head(n=3)

In [None]:
# Feature matrix: every column after the first one.
X = df.iloc[:, 1:].values

# Target: the first column (loaded as 'class') as a 1-D array.
# Slicing with `:1` would produce an (n, 1) column vector, which scikit-learn
# reports with a DataConversionWarning and has to flatten internally.
y = df.iloc[:, 0].values

In [None]:
# Split into training and test sets: 30% of the samples are held out for
# testing, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=6,
)

In [None]:
# Decision tree limited to depth 3.
# random_state is set because scikit-learn randomly permutes the features at
# every split; without a seed, ties between equally good splits may be
# resolved differently on each run. 6 matches the seed of the data split.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=6)
tree_clf.fit(X_train, y_train)

In [None]:
# Draw the fitted tree on an explicit, larger figure so the node text is
# readable, label each node with its majority class and colour nodes by class.
fig, ax = plt.subplots(figsize=(14, 7))
tree.plot_tree(
    tree_clf,
    class_names=[str(label) for label in tree_clf.classes_],
    filled=True,
    ax=ax,
)
ax.set_title('Decision tree (max_depth=3)')
# plt.show() renders the figure without echoing the list of annotation
# objects that plot_tree returns.
plt.show()

In [None]:
# Mean accuracy of the fitted tree on the held-out test split.
tree_test_accuracy = tree_clf.score(X_test, y_test)
print(tree_test_accuracy)

In [None]:
# Using grid search

In [None]:
# Registry of the best cross-validated score of each model, keyed by model name.
cross_valid_scores = dict()

In [None]:
# Candidate tree depths explored by the grid search.
parameters = {
    "max_depth": [2, 3, 5, 7, 9, 11],
}

# 5-fold cross-validated search over the depths, scored by accuracy.
# The estimator is seeded so that repeated runs pick the same best depth;
# an unseeded tree can break ties between splits differently on each run.
# NOTE: the 'desicion' spelling of the variable and of the dictionary key is
# kept on purpose, since both may be referenced elsewhere in the notebook.
model_desicion_tree = GridSearchCV(
    DecisionTreeClassifier(class_weight='balanced', random_state=6),
    parameters,
    cv=5,
    scoring='accuracy',
)

model_desicion_tree.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_desicion_tree.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_desicion_tree.best_score_:.3f}'
)
# Record the best mean CV accuracy under this model's key.
cross_valid_scores['desicion_tree'] = model_desicion_tree.best_score_
print('-----')

In [None]:
# Random Forest

In [None]:
# Grid Search

In [None]:
# Search space for the random forest: number of trees, maximum depth and
# maximum number of leaves per tree.
parameters = dict(
    n_estimators=list(range(5, 30, 5)),    # 5, 10, 15, 20, 25
    max_depth=list(range(3, 14, 2)),       # 3, 5, 7, 9, 11, 13
    max_leaf_nodes=list(range(3, 14, 2)),  # 3, 5, 7, 9, 11, 13
)

In [None]:
# 5-fold cross-validated grid search over the random-forest search space,
# scored by accuracy.
# The forest is seeded: bootstrap sampling and feature sub-sampling are random,
# so without a fixed random_state the best parameters and the reported score
# change from run to run.
model_random_forest = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=6),
    parameters,
    cv=5,
    scoring='accuracy',
)

model_random_forest.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_random_forest.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_random_forest.best_score_:.3f}'
)
# Record the best mean CV accuracy under this model's key.
cross_valid_scores['random_forest'] = model_random_forest.best_score_
print('-----')

In [None]:
# Final random forest trained on the full training split, using all CPU cores.
# random_state makes the fitted forest (and its test score) reproducible.
# NOTE(review): the hyper-parameters are hard-coded rather than taken from
# model_random_forest.best_params_, and class_weight='balanced' (used during
# the grid search) is not set here; confirm both are intentional.
rnd_clf = RandomForestClassifier(
    n_estimators=20,
    max_depth=7,
    max_leaf_nodes=7,
    n_jobs=-1,
    random_state=6,
)
rnd_clf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted random forest on the held-out test split.
forest_test_accuracy = rnd_clf.score(X_test, y_test)
print(forest_test_accuracy)