In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows',2500)
np.random.seed(42)

In [2]:
data = pd.read_excel("NewDataset.xlsx")

# Remoção de features por haver poucos países com tais informações:
* Pf_rol_procedural = Procedural Justice
* pf_rol_civil = Civil justice
* pf_rol_criminal = Criminal justice

In [3]:
test = data.sort_values('countries')
test = test.drop(['pf_rol_procedural','pf_rol_civil','pf_rol_criminal'],axis=1)
test = test.fillna(method='backfill',axis = 0)

In [4]:
test.sort_values('PIB',inplace=True,ascending=False)

In [5]:
def classifier(PIB):
    PIB_third = len(PIB)/3
    r = []
    for i in range(len(PIB)):
        if(i < PIB_third * 1):
            r.append('0')
        elif(i < PIB_third *2):
            r.append('1')
        else:
            r.append('2')
    return r

# 2 ultimos anos da Venezuela vazios
# Dropando países: Taiwan, Macedonia, Syria, Swaziland

In [6]:
test.dropna(how='any',subset=['PIB'],inplace=True)

Criando as Labels e classificando os países como a 0, 1 e 2

In [7]:
test['Label'] = classifier(test['PIB'].values)
final = test.drop(test.columns[:5],axis = 1)
final.drop("PIB",axis = 1,inplace=True)
final = round(final,2)
final = final.fillna(method='ffill')

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x = final.drop("Label",axis = 1)
y = final["Label"]
x_train,X_val,y_train,Y_val = train_test_split(x,y,random_state = 42)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest

In [11]:
RFC = RandomForestClassifier(verbose=1)

In [12]:
RFC.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [13]:
estimator = RFC.estimators_[5]

from sklearn.tree import export_graphviz
from g
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = x_train.columns.values,
                class_names = y_train,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
Source.from_file('figures/iris_tree.dot')

ModuleNotFoundError: No module named 'graphviz'

In [13]:
RFC.score(X_val,Y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0.9232876712328767

In [19]:
l = []
for i in range(len(X_val.columns.values)):
    l.append(X_val.columns.values[i] + " : " + str(RFC.feature_importances_[i]))
    
print("Importância de features:")
l.sort()
for i in range(10):
    print(l[i])

Importância de features:
ef_government : 0.0183579338968995
ef_government_consumption : 0.021201143967088335
ef_government_enterprises : 0.0024493211276137997
ef_government_tax : 0.00936483726095152
ef_government_tax_income : 0.0031527528957838785
ef_government_tax_payroll : 0.006737683015061943
ef_government_transfers : 0.0774030279241006
ef_legal : 0.015244119783057583
ef_legal_courts : 0.004458387307496393
ef_legal_crime : 0.007929657966956744


# Cross Validation com Múltiplos Classificadores

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [21]:
import scipy.stats as st
classifiers = [LogisticRegression(solver='lbfgs',multi_class="multinomial",random_state=42,n_jobs=-1),KNeighborsClassifier(n_jobs=-1),RandomForestClassifier(n_estimators=100,verbose=0,n_jobs=-1)]
names = ["Logistic Regression","KNN","Random Forest"]
def train_and_test(clf, name):
    scores = cross_val_score(clf, x_train, y_train, cv=10)
    print('Acurácia do %s = %.2f [+/- %.2f]' % (name, scores.mean(), scores.std()))

for name, clf in zip(names, classifiers):
    train_and_test(clf, name);

Acurácia do Logistic Regression = 0.83 [+/- 0.04]
Acurácia do KNN = 0.88 [+/- 0.03]
Acurácia do Random Forest = 0.94 [+/- 0.02]


* Utilizaremos o que tem melhores resultados que é o Random Forest Classifier

# Randomized Search com Hiper Parâmetros

In [24]:
from sklearn.model_selection import RandomizedSearchCV

In [25]:
hyper_param = {
    "n_estimators": np.random.randint(10,100,size=5),
    "criterion"  :["gini","entropy"],
    "max_depth"  :np.random.randint(1,100,size=5)
}

In [26]:
rsc = RandomizedSearchCV(RFC,hyper_param, verbose=1, random_state=42,n_jobs=-1,cv=10)

In [27]:
rsc.fit(x_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:    0.1s finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': array([19, 65, 39, 14, 42]), 'criterion': ['gini', 'entropy'], 'max_depth': array([65, 18, 96, 49, 11])},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [28]:
rfc_h_pred = rsc.predict(X_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:    0.0s finished


In [29]:
rsc.best_score_

0.9423604757548033

In [30]:
rsc.best_params_

{'n_estimators': 39, 'max_depth': 49, 'criterion': 'gini'}

# Matriz de Confusão

In [32]:
from sklearn.metrics import confusion_matrix

In [33]:
print(confusion_matrix(Y_val, rfc_h_pred))

[[110   9   0]
 [  2 106   5]
 [  2   8 123]]


# Precision/Recall

In [34]:
AxA, AxB, AxC, BxA, BxB, BxC, CxA, CxB, CxC = confusion_matrix(Y_val, rfc_h_pred).ravel()
print(AxA)
print(AxB)
print(AxC)
print(BxA)
print(BxB)
print(BxC)
print(CxA)
print(CxB)
print(CxC)

110
9
0
2
106
5
2
8
123


 # Recall

In [35]:
#axo q tá certo
tprA = AxA/(AxB + AxC + AxA)
tprA

0.9243697478991597

In [36]:
tprB = BxB/(BxA + BxB + BxC)
tprB

0.9380530973451328

In [37]:
tprC = CxC/(CxA + CxB + CxC)
tprC

0.924812030075188

#  Precision

In [38]:
precA = AxA/(AxA + BxA +CxA)
precA

0.9649122807017544

In [39]:
precB = BxB/(AxB + BxB + CxB)
precB

0.8617886178861789

In [40]:
precC = CxC/(AxC + BxC +CxC)
precC

0.9609375

In [41]:
(precA + precB + precC)/3

0.929212799529311