In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows',2500)
np.random.seed(42)

In [2]:
data = pd.read_excel("NewDataset.xlsx")
f = open("Features",'r')
data_feature_names = []
for i in f.readlines():
    data_feature_names.append(i.split()[1])

f.close()

# Remoção de features por haver poucos países com tais informações:
* Pf_rol_procedural = Procedural Justice
* pf_rol_civil = Civil justice
* pf_rol_criminal = Criminal justice

In [3]:
test = data.sort_values('countries')
test = test.drop(['pf_rol_procedural','pf_rol_civil','pf_rol_criminal'],axis=1)
test = test.fillna(method='backfill',axis = 0)

In [4]:
test.sort_values('PIB',inplace=True,ascending=False)

In [5]:
def classifier(PIB):
    PIB_third = len(PIB)/3
    r = []
    for i in range(len(PIB)):
        if(i < PIB_third * 1):
            r.append('0')
        elif(i < PIB_third *2):
            r.append('1')
        else:
            r.append('2')
    return r

# 2 ultimos anos da Venezuela vazios
# Dropando países: Taiwan, Macedonia, Syria, Swaziland

In [6]:
test.dropna(how='any',subset=['PIB'],inplace=True)

Criando as Labels e classificando os países como a 0, 1 e 2

In [7]:
test['Label'] = classifier(test['PIB'].values)
final = test.drop(test.columns[:5],axis = 1)
final.drop("PIB",axis = 1,inplace=True)
final = round(final,2)
final = final.fillna(method='ffill')

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x = final.drop("Label",axis = 1)
y = final["Label"]
x_train,X_val,y_train,Y_val = train_test_split(x,y,random_state = 42)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest

In [11]:
RFC = RandomForestClassifier(verbose=1)

In [12]:
RFC.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [14]:
estimator = RFC.estimators_[5]

from sklearn.tree import export_graphviz
#from 
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = x_train.columns.values,
                class_names = y_train,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
Source.from_file('figures/iris_tree.dot')

NameError: name 'Source' is not defined

In [13]:
RFC.score(X_val,Y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0.9205479452054794

In [18]:
l = []
for i in range(len(X_val.columns.values)):
    l.append([RFC.feature_importances_[i],data_feature_names[i]])
    
print("Importância de features:")
l.sort(reverse=True)
for i in range(10):
    print("%s : %0.5f"%(l[i][1],l[i][0]))

Importância de features:
Personal_Freedom_(score) : 0.07740
Procedural_justice : 0.06834
Standard_deviation_of_tariffs_rates : 0.05806
Divorce : 0.05279
Freedom_to_own_foreign_currency_bank_account : 0.03658
Judicial_independence : 0.03156
Civil_justice : 0.02491
Capital_controls : 0.02292
Access_to_cable/satellite : 0.02271
Freedom_to_associate_and_assemble_with_peaceful_individuals_or_organizations : 0.02197


# Cross Validation com Múltiplos Classificadores

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [20]:
import scipy.stats as st
classifiers = [LogisticRegression(solver='lbfgs',multi_class="multinomial",random_state=42,n_jobs=-1),KNeighborsClassifier(n_jobs=-1),RandomForestClassifier(n_estimators=100,verbose=0,n_jobs=-1)]
names = ["Logistic Regression","KNN","Random Forest"]
def train_and_test(clf, name):
    scores = cross_val_score(clf, x_train, y_train, cv=10)
    print('Acurácia do %s = %.2f [+/- %.2f]' % (name, scores.mean(), scores.std()))

for name, clf in zip(names, classifiers):
    train_and_test(clf, name);

Acurácia do Logistic Regression = 0.83 [+/- 0.04]
Acurácia do KNN = 0.88 [+/- 0.03]
Acurácia do Random Forest = 0.94 [+/- 0.02]


* Utilizaremos o que tem melhores resultados que é o Random Forest Classifier

# Randomized Search com Hiper Parâmetros

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
hyper_param = {
    "n_estimators": np.random.randint(10,100,size=5),
    "criterion"  :["gini","entropy"],
    "max_depth"  :np.random.randint(1,100,size=5)
}

In [23]:
rsc = RandomizedSearchCV(RFC,hyper_param, verbose=1, random_state=42,n_jobs=-1,cv=10)

In [24]:
rsc.fit(x_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed:    0.2s finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': array([38, 22, 55, 44, 15]), 'criterion': ['gini', 'entropy'], 'max_depth': array([82, 69, 47, 25, 66])},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [25]:
rfc_h_pred = rsc.predict(X_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed:    0.0s finished


In [26]:
rsc.best_score_

0.9414455626715462

In [27]:
rsc.best_params_

{'n_estimators': 38, 'max_depth': 82, 'criterion': 'entropy'}

# Matriz de Confusão

In [28]:
from sklearn.metrics import confusion_matrix

In [29]:
print(confusion_matrix(Y_val, rfc_h_pred))

[[113   6   0]
 [  1 106   6]
 [  2   4 127]]


# Precision/Recall

In [30]:
AxA, AxB, AxC, BxA, BxB, BxC, CxA, CxB, CxC = confusion_matrix(Y_val, rfc_h_pred).ravel()
print(AxA)
print(AxB)
print(AxC)
print(BxA)
print(BxB)
print(BxC)
print(CxA)
print(CxB)
print(CxC)

113
6
0
1
106
6
2
4
127


 # Recall

In [31]:
#axo q tá certo
tprA = AxA/(AxB + AxC + AxA)
tprA

0.9495798319327731

In [32]:
tprB = BxB/(BxA + BxB + BxC)
tprB

0.9380530973451328

In [33]:
tprC = CxC/(CxA + CxB + CxC)
tprC

0.9548872180451128

#  Precision

In [34]:
precA = AxA/(AxA + BxA +CxA)
precA

0.9741379310344828

In [35]:
precB = BxB/(AxB + BxB + CxB)
precB

0.9137931034482759

In [36]:
precC = CxC/(AxC + BxC +CxC)
precC

0.9548872180451128

In [37]:
(precA + precB + precC)/3

0.9476060841759572