In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score


In [2]:
dados = pd.read_csv('15.csv')

# desordenamos os dados pensando que na frente serão utilizados para treinamento e teste
# 
dados = dados.sample(frac=1)
dados.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
35,Spain,Western Europe,36,6.329,0.03468,1.23011,1.31379,0.95562,0.45951,0.06398,0.18227,2.12367
72,Estonia,Central and Eastern Europe,73,5.429,0.04013,1.15174,1.22791,0.77361,0.44888,0.15184,0.0868,1.58782
13,Mexico,Latin America and Caribbean,14,7.187,0.04176,1.02054,0.91451,0.81444,0.48181,0.21312,0.14074,3.60214
134,Egypt,Middle East and Northern Africa,135,4.194,0.0326,0.8818,0.747,0.61712,0.17288,0.06324,0.11291,1.59927
7,Sweden,Western Europe,8,7.364,0.03157,1.33171,1.28907,0.91087,0.6598,0.43844,0.36262,2.37119


In [3]:
# "Region" vai ser o atributo alvo 
# tentaremos treinar o clasificador para terntar identificar a regiao onde o pais 
# se localiza a partir dos indicadores

Y = np.array(dados["Region"])
#len(Y) = 158

# pegamos todos os dados exeito a posicao no ranking e o nome do pais 
# pois não sao relevantes

X = np.array(dados.iloc[: , 5:])

# representação em dataFrame do X

dados.iloc[:,5:].head(2)

Unnamed: 0,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
35,1.23011,1.31379,0.95562,0.45951,0.06398,0.18227,2.12367
72,1.15174,1.22791,0.77361,0.44888,0.15184,0.0868,1.58782


In [4]:
type(X)

numpy.ndarray

###  Fazendo uma analise da quantidade presente de cada uma das regioes vemos que nos casos de america do norte e Australia o a quantidade de instanciaspresentes é bem pequena, contexto que não é recomendado para fazer uma analise do tipo cross-validation.
Por tanto vamos remove-las

In [5]:
# primeiro lista as ocorrencias de cada regiao

unique, counts = np.unique(Y, return_counts=True)
print (np.asarray((unique, counts)).T)

[['Australia and New Zealand' 2]
 ['Central and Eastern Europe' 29]
 ['Eastern Asia' 6]
 ['Latin America and Caribbean' 22]
 ['Middle East and Northern Africa' 20]
 ['North America' 2]
 ['Southeastern Asia' 9]
 ['Southern Asia' 7]
 ['Sub-Saharan Africa' 40]
 ['Western Europe' 21]]


In [6]:
# Para conseguir manipular mais facilmente o conteudo do array que do atributo alvo (em formato de string) 
# vamos converter para DataFrame e tirar assim as regioes que se encontram presentes somente em 2 casos

regioes = pd.DataFrame(data=Y)
regioes.rename(index=str, columns={0: "Regioes"}, inplace=True)

regioes.groupby("Regioes").size()

Regioes
Australia and New Zealand           2
Central and Eastern Europe         29
Eastern Asia                        6
Latin America and Caribbean        22
Middle East and Northern Africa    20
North America                       2
Southeastern Asia                   9
Southern Asia                       7
Sub-Saharan Africa                 40
Western Europe                     21
dtype: int64

In [7]:
# filtrando as regioes

regioes = regioes[regioes.Regioes != 'Australia and New Zealand']
regioes = regioes[regioes.Regioes != 'North America']

regioes.groupby("Regioes").size()

Regioes
Central and Eastern Europe         29
Eastern Asia                        6
Latin America and Caribbean        22
Middle East and Northern Africa    20
Southeastern Asia                   9
Southern Asia                       7
Sub-Saharan Africa                 40
Western Europe                     21
dtype: int64

** O atributo alvo vai ser atualizado com os registros sem as regioes que possuim poucas ocorrencias **

In [8]:
regioes.head()

Unnamed: 0,Regioes
0,Western Europe
1,Central and Eastern Europe
2,Latin America and Caribbean
3,Middle East and Northern Africa
4,Western Europe


In [9]:
Y = regioes['Regioes'].values

** E tambem deverão ser removidos dentre os atributos de testes as entradas correspondentes às regiões que acabamosde retirar **

In [10]:
dados[dados.Region == ('Australia and New Zealand')]

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
8,New Zealand,Australia and New Zealand,9,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425
9,Australia,Australia and New Zealand,10,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646


In [11]:
dados[dados.Region == ("North America")]

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
14,United States,North America,15,7.119,0.03839,1.39451,1.24711,0.86179,0.54604,0.1589,0.40105,2.51011
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [12]:
indice = 0
for i in X[:,0:1]:
    
    if ((i > 1.3945 and i < 1.39452)or
        (i > 1.3262 and i < 1.3263)or
        (i > 1.3335 and i < 1.33359)or
        (i > 1.2501 and i < 1.25019)): 
        print ("indice {} = {} removido".format(indice, i))
        X = np.delete(X, 1, 0)
        
    indice = indice + 1

indice 13 = [ 1.39451] removido
indice 44 = [ 1.25018] removido
indice 97 = [ 1.32629] removido
indice 112 = [ 1.33358] removido


In [13]:

print ("Formato dos atributos \t\t(linhas, colunas) \t= {},  \nFormato do atributo alvo \t(linhas, colunas)\t= {}".format(X.shape, Y.shape))

Formato dos atributos 		(linhas, colunas) 	= (154, 7),  
Formato do atributo alvo 	(linhas, colunas)	= (154,)


In [14]:
# classificador Naive Bayes
clf = MultinomialNB()


In [15]:
scoring = ['precision_macro', 'recall_macro']

scores = cross_validate(clf, X, Y, scoring=scoring, cv=5, return_train_score=True)

sorted(scores.keys())

#clf.fit(X[:140], Y[:140])
#predict = clf.predict(X[-15:])

predicted_nb = cross_val_predict(clf, X, Y, cv=5)

data = {'Classificacao Naive Bayes': predicted_nb, "target": Y}

dfNV = pd.DataFrame(data=data)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


NameError: name 'cross_val_predict' is not defined


## Classes indicadas pelo NaiveBayes

In [402]:
print("K-Fold cross validation. K = 5")
print("Classificação com Naive Bayes")
print("- Acuracia \t= {}".format(metrics.accuracy_score(Y, predicted_3)))
print("- Recall \t= {}".format(scores['test_recall_macro']))
print("- Precisão \t= {}".format(scores['test_precision_macro']))

predicted_6 = cross_val_predict(clf, X, Y, cv=6)
predicted_3 = cross_val_predict(clf, X, Y, cv=3)

print ("\nAcuracia para \n 6-fold = {} \n 3-Fold = {}".format(metrics.accuracy_score(Y, predicted_6),
metrics.accuracy_score(Y, predicted_3)))

dfNV.groupby('Classificacao Naive Bayes').size()


K-Fold cross validation. K = 5
Classificação com Naive Bayes
- Acuracia 	= 0.2597402597402597
- Recall 	= [ 0.125  0.125  0.125  0.125  0.125]
- Precisão 	= [ 0.02941176  0.03125     0.03333333  0.03333333  0.03571429]

Acuracia para 
 6-fold = 0.2597402597402597 
 3-Fold = 0.2597402597402597


Classificacao Naive Bayes
Sub-Saharan Africa    154
dtype: int64

Vemos que para todas as entradas do banco foi indicada o mesmo atributo alvo, no caso pertence a região de "africa" que, com 40 entradas é a moda.


In [23]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

predicted = cross_val_predict(clf, X, Y, cv=6)
metrics.accuracy_score(Y, predicted)



0.25324675324675322

In [24]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
loo.get_n_splits(X)



154

##  Classificando com KNN

In [25]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [26]:
scaled_features = scaler.transform(X)

In [27]:
X_train, y_train = X[:142], Y[:142]
X_test = X[142:]

In [28]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)


In [29]:
scoring = ['precision_macro', 'recall_macro']

scores = cross_validate(knn, X, Y, scoring=scoring,
                        cv=3, return_train_score=True)
sorted(scores.keys())

data = {'Classificação-KNN': predicted, "target": Y}

dfknn = pd.DataFrame(data=data)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [31]:
print("===== K fold cross validation  K = 3 ===== ")
print("===== Tentativa de classificação do KNN k= 5 ===== ")

predicted_knn_3 = cross_val_predict(knn, X, Y, cv=3)
predicted_knn_5 = cross_val_predict(knn, X, Y, cv=5)
predicted_knn_6 = cross_val_predict(knn, X, Y, cv=6)

print("- Acuracia \t= {}".format(metrics.accuracy_score(Y, predicted_knn_5)))
print("- Recall \t= {}".format(scores['test_recall_macro']))
print("- Precisão \t= {}".format(scores['test_precision_macro']))


print ("\nAcuracia para \n 6-fold = {} \n 3-Fold = {}".format(metrics.accuracy_score(Y, predicted_knn_6),
metrics.accuracy_score(Y, predicted_knn_3)))


dfknn.groupby('Classificação-KNN').size()

===== K fold cross validation  K = 3 ===== 
===== Tentativa de classificação do KNN k= 5 ===== 
- Acuracia 	= 0.24675324675324675
- Recall 	= [ 0.10133929  0.16295788  0.18070818]
- Precisão 	= [ 0.07904595  0.1758658   0.15502137]

Acuracia para 
 6-fold = 0.24675324675324675 
 3-Fold = 0.22077922077922077


Classificação-KNN
Central and Eastern Europe      2
Sub-Saharan Africa            151
Western Europe                  1
dtype: int64

In [27]:
predicted_10 = cross_val_predict(knn, X, Y, cv=10)
predicted_5 = cross_val_predict(knn, X, Y, cv=5)

print ("Acuracia para \n 10-fold = {} \n 5-Fold = {}".format(metrics.accuracy_score(Y, predicted_10),
metrics.accuracy_score(Y, predicted_5)))



Acuracia para 
 10-fold = 0.5443037974683544 
 5-Fold = 0.569620253164557


