In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score


In [2]:
dados = pd.read_csv('../datasets/15.csv')

# desordenamos os dados pensando que na frente serão utilizados para treinamento e teste
# 
dados = dados.sample(frac=1)
dados.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
106,Tunisia,Middle East and Northern Africa,107,4.739,0.03589,0.88113,0.60429,0.73793,0.26268,0.06358,0.06431,2.12466
44,Slovakia,Central and Eastern Europe,45,5.995,0.04267,1.16891,1.26999,0.78902,0.31751,0.03431,0.16893,2.24639
56,Nicaragua,Latin America and Caribbean,57,5.828,0.05371,0.59325,1.14184,0.74314,0.55475,0.19317,0.27815,2.32407
92,Macedonia,Central and Eastern Europe,93,5.007,0.05376,0.91851,1.00232,0.73545,0.33457,0.05327,0.22359,1.73933
67,Algeria,Middle East and Northern Africa,68,5.605,0.05099,0.93929,1.07772,0.61766,0.28579,0.17383,0.07822,2.43209


In [4]:
# "Region" vai ser o atributo alvo 
# tentaremos treinar o clasificador para terntar identificar a regiao onde o pais 
# se localiza a partir dos indicadores

Y = np.array(dados["Region"])
#len(Y) = 158

# pegamos todos os dados exeito a posicao no ranking e o nome do pais 
# pois não sao relevantes

X = np.array(dados.iloc[: , 5:])

# representação em dataFrame do X

dados.iloc[:,5:].head(2)

Unnamed: 0,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
106,0.88113,0.60429,0.73793,0.26268,0.06358,0.06431,2.12466
44,1.16891,1.26999,0.78902,0.31751,0.03431,0.16893,2.24639


In [5]:
type(X)

numpy.ndarray

###  Fazendo uma analise da quantidade presente de cada uma das regioes vemos que nos casos de america do norte e Australia o a quantidade de instanciaspresentes é bem pequena, contexto que não é recomendado para fazer uma analise do tipo cross-validation.
Por tanto vamos remove-las

In [6]:
# primeiro lista as ocorrencias de cada regiao

unique, counts = np.unique(Y, return_counts=True)
print (np.asarray((unique, counts)).T)

[['Australia and New Zealand' 2]
 ['Central and Eastern Europe' 29]
 ['Eastern Asia' 6]
 ['Latin America and Caribbean' 22]
 ['Middle East and Northern Africa' 20]
 ['North America' 2]
 ['Southeastern Asia' 9]
 ['Southern Asia' 7]
 ['Sub-Saharan Africa' 40]
 ['Western Europe' 21]]


In [7]:
# Para conseguir manipular mais facilmente o conteudo do array que do atributo alvo (em formato de string) 
# vamos converter para DataFrame e tirar assim as regioes que se encontram presentes somente em 2 casos

regioes = pd.DataFrame(data=Y)
regioes.rename(index=str, columns={0: "Regioes"}, inplace=True)

regioes.groupby("Regioes").size()

Regioes
Australia and New Zealand           2
Central and Eastern Europe         29
Eastern Asia                        6
Latin America and Caribbean        22
Middle East and Northern Africa    20
North America                       2
Southeastern Asia                   9
Southern Asia                       7
Sub-Saharan Africa                 40
Western Europe                     21
dtype: int64

In [8]:
# filtrando as regioes

regioes = regioes[regioes.Regioes != 'Australia and New Zealand']
regioes = regioes[regioes.Regioes != 'North America']

regioes.groupby("Regioes").size()

Regioes
Central and Eastern Europe         29
Eastern Asia                        6
Latin America and Caribbean        22
Middle East and Northern Africa    20
Southeastern Asia                   9
Southern Asia                       7
Sub-Saharan Africa                 40
Western Europe                     21
dtype: int64

** O atributo alvo vai ser atualizado com os registros sem as regioes que possuim poucas ocorrencias **

In [9]:
regioes.head()

Unnamed: 0,Regioes
0,Middle East and Northern Africa
1,Central and Eastern Europe
2,Latin America and Caribbean
3,Central and Eastern Europe
4,Middle East and Northern Africa


In [10]:
Y = regioes['Regioes'].values

** E tambem deverão ser removidos dentre os atributos de testes as entradas correspondentes às regiões que acabamosde retirar **

In [11]:
dados[dados.Region == ('Australia and New Zealand')]

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
9,Australia,Australia and New Zealand,10,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646
8,New Zealand,Australia and New Zealand,9,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425


In [12]:
dados[dados.Region == ("North America")]

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
14,United States,North America,15,7.119,0.03839,1.39451,1.24711,0.86179,0.54604,0.1589,0.40105,2.51011
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [13]:
indice = 0
for i in X[:,0:1]:
    
    if ((i > 1.3945 and i < 1.39452)or
        (i > 1.3262 and i < 1.3263)or
        (i > 1.3335 and i < 1.33359)or
        (i > 1.2501 and i < 1.25019)): 
        print ("indice {} = {} removido".format(indice, i))
        X = np.delete(X, 1, 0)
        
    indice = indice + 1

indice 68 = [ 1.33358] removido
indice 76 = [ 1.39451] removido
indice 110 = [ 1.25018] removido
indice 122 = [ 1.32629] removido


In [14]:

print ("Formato dos atributos \t\t(linhas, colunas) \t= {},  \nFormato do atributo alvo \t(linhas, colunas)\t= {}".format(X.shape, Y.shape))

Formato dos atributos 		(linhas, colunas) 	= (154, 7),  
Formato do atributo alvo 	(linhas, colunas)	= (154,)


In [15]:
# classificador Naive Bayes
clf = MultinomialNB()


In [15]:
scoring = ['precision_macro', 'recall_macro']

scores = cross_validate(clf, X, Y, scoring=scoring, cv=5, return_train_score=True)

sorted(scores.keys())

#clf.fit(X[:140], Y[:140])
#predict = clf.predict(X[-15:])

predicted_nb = cross_val_predict(clf, X, Y, cv=5)

data = {'Classificacao Naive Bayes': predicted_nb, "target": Y}

dfNV = pd.DataFrame(data=data)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


NameError: name 'cross_val_predict' is not defined


## Classes indicadas pelo NaiveBayes

In [402]:
print("K-Fold cross validation. K = 5")
print("Classificação com Naive Bayes")
print("- Acuracia \t= {}".format(metrics.accuracy_score(Y, predicted_3)))
print("- Recall \t= {}".format(scores['test_recall_macro']))
print("- Precisão \t= {}".format(scores['test_precision_macro']))

predicted_6 = cross_val_predict(clf, X, Y, cv=6)
predicted_3 = cross_val_predict(clf, X, Y, cv=3)

print ("\nAcuracia para \n 6-fold = {} \n 3-Fold = {}".format(metrics.accuracy_score(Y, predicted_6),
metrics.accuracy_score(Y, predicted_3)))

dfNV.groupby('Classificacao Naive Bayes').size()


K-Fold cross validation. K = 5
Classificação com Naive Bayes
- Acuracia 	= 0.2597402597402597
- Recall 	= [ 0.125  0.125  0.125  0.125  0.125]
- Precisão 	= [ 0.02941176  0.03125     0.03333333  0.03333333  0.03571429]

Acuracia para 
 6-fold = 0.2597402597402597 
 3-Fold = 0.2597402597402597


Classificacao Naive Bayes
Sub-Saharan Africa    154
dtype: int64

Vemos que para todas as entradas do banco foi indicada o mesmo atributo alvo, no caso pertence a região de "africa" que, com 40 entradas é a moda.


In [16]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

predicted = cross_val_predict(clf, X, Y, cv=6)
metrics.accuracy_score(Y, predicted)



0.25974025974025972

In [17]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
loo.get_n_splits(X)



154

##  Classificando com KNN

In [18]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
scaled_features = scaler.transform(X)

In [20]:
X_train, y_train = X[:142], Y[:142]
X_test = X[142:]

In [21]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)


In [22]:
scoring = ['precision_macro', 'recall_macro']

scores = cross_validate(knn, X, Y, scoring=scoring,
                        cv=3, return_train_score=True)
sorted(scores.keys())

data = {'Classificação-KNN': predicted, "target": Y}

dfknn = pd.DataFrame(data=data)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [23]:
print("===== K fold cross validation  K = 3 ===== ")
print("===== Tentativa de classificação do KNN k= 5 ===== ")

predicted_knn_3 = cross_val_predict(knn, X, Y, cv=3)
predicted_knn_5 = cross_val_predict(knn, X, Y, cv=5)
predicted_knn_6 = cross_val_predict(knn, X, Y, cv=6)

print("- Acuracia \t= {}".format(metrics.accuracy_score(Y, predicted_knn_5)))
print("- Recall \t= {}".format(scores['test_recall_macro']))
print("- Precisão \t= {}".format(scores['test_precision_macro']))


print ("\nAcuracia para \n 6-fold = {} \n 3-Fold = {}".format(metrics.accuracy_score(Y, predicted_knn_6),
metrics.accuracy_score(Y, predicted_knn_3)))


dfknn.groupby('Classificação-KNN').size()

===== K fold cross validation  K = 3 ===== 
===== Tentativa de classificação do KNN k= 5 ===== 
- Acuracia 	= 0.18181818181818182
- Recall 	= [ 0.16979167  0.12953297  0.11874237]
- Precisão 	= [ 0.23913399  0.11123512  0.08106061]

Acuracia para 
 6-fold = 0.18181818181818182 
 3-Fold = 0.19480519480519481


Classificação-KNN
Sub-Saharan Africa    154
dtype: int64

In [27]:
predicted_10 = cross_val_predict(knn, X, Y, cv=10)
predicted_5 = cross_val_predict(knn, X, Y, cv=5)

print ("Acuracia para \n 10-fold = {} \n 5-Fold = {}".format(metrics.accuracy_score(Y, predicted_10),
metrics.accuracy_score(Y, predicted_5)))



Acuracia para 
 10-fold = 0.5443037974683544 
 5-Fold = 0.569620253164557


