# Avaliação de algoritmos de Machine Learning

In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy.stats import stats
import math
import statistics

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

In [4]:
df_credit = pd.read_csv('Bases de dados/credit_data.csv')
print(df_credit.shape)
df_credit.head()

(2000, 5)


Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
df_credit.dropna(inplace=True)
df_credit.shape

(1997, 5)

In [6]:
X = df_credit.iloc[:, 1:4].values
y = df_credit.iloc[:, 4].values
X.shape, y.shape

((1997, 3), (1997,))

In [7]:
resultados_naive_bayes = []
resultados_logistic = []
resultados_forest = []
for i in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=i)
    
    # Naive Bayes
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    resultados_naive_bayes.append(accuracy_score(y_test, naive_bayes.predict(X_test)))
    
    # Logisti Regression
    logistic = LogisticRegression()
    logistic.fit(X_train, y_train)
    resultados_logistic.append(accuracy_score(y_test, logistic.predict(X_test)))
    
    # Random Forest
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    resultados_forest.append(accuracy_score(y_test, forest.predict(X_test)))

In [8]:
print(resultados_naive_bayes)

[0.925, 0.925, 0.9325, 0.925, 0.92, 0.905, 0.9175, 0.9175, 0.9125, 0.9325, 0.9225, 0.9125, 0.935, 0.9175, 0.925, 0.9175, 0.9275, 0.92, 0.9325, 0.92, 0.93, 0.905, 0.9175, 0.9325, 0.9425, 0.9375, 0.94, 0.92, 0.935, 0.925]


In [9]:
print(resultados_logistic)

[0.9325, 0.91, 0.9125, 0.9225, 0.9075, 0.89, 0.91, 0.9075, 0.8775, 0.915, 0.9175, 0.9, 0.925, 0.9175, 0.9025, 0.9125, 0.9525, 0.91, 0.9225, 0.9075, 0.925, 0.905, 0.9075, 0.945, 0.9225, 0.9275, 0.9225, 0.9175, 0.91, 0.9]


In [10]:
print(resultados_forest)

[0.975, 0.99, 0.9825, 0.99, 0.9775, 0.98, 0.9775, 0.985, 0.9775, 0.9825, 0.9875, 0.9775, 0.975, 0.9875, 0.98, 0.975, 0.9875, 0.985, 0.99, 0.985, 0.9825, 0.9725, 0.9775, 0.985, 0.9875, 0.995, 0.9925, 0.9825, 0.99, 0.9925]


## Média

In [11]:
resultados_naive_bayes = np.array(resultados_naive_bayes)
resultados_logistic = np.array(resultados_logistic)
resultados_forest = np.array(resultados_forest)

In [12]:
print(f'Naive Bayes: {resultados_naive_bayes.mean()}')
print(f'Logistic Regression: {resultados_logistic.mean()}')
print(f'Random Forest:{resultados_forest.mean()}')

Naive Bayes: 0.92425
Logistic Regression: 0.9145
Random Forest:0.9834999999999999


## Moda

In [13]:
print(f'Naive Bayes: {statistics.mode(resultados_naive_bayes)}')
print(f'Logistic Regression: {statistics.mode(resultados_logistic)}')
print(f'Random Forest:{statistics.mode(resultados_forest)}')

Naive Bayes: 0.925
Logistic Regression: 0.91
Random Forest:0.9775


## Mediana

In [14]:
print(f'Naive Bayes: {np.median(resultados_naive_bayes)}')
print(f'Logistic Regression: {np.median(resultados_logistic)}')
print(f'Random Forest:{np.median(resultados_forest)}')

Naive Bayes: 0.925
Logistic Regression: 0.9125
Random Forest:0.98375


## Variância

In [15]:
print(f'Naive Bayes: {statistics.variance(resultados_naive_bayes)}')
print(f'Logistic Regression: {statistics.variance(resultados_logistic)}')
print(f'Random Forest:{statistics.variance(resultados_forest)}')

Naive Bayes: 9.058189655172414e-05
Logistic Regression: 0.00021655172413793106
Random Forest:3.732758620689658e-05


## Desvio Padrão

In [16]:
print(f'Naive Bayes: {statistics.stdev(resultados_naive_bayes)}')
print(f'Logistic Regression: {statistics.stdev(resultados_logistic)}')
print(f'Random Forest:{statistics.stdev(resultados_forest)}')

Naive Bayes: 0.009517452209059111
Logistic Regression: 0.014715696522350923
Random Forest:0.006109630611329672


## Coeficiente de variação

In [17]:
print(f'Naive Bayes: {stats.variation(resultados_naive_bayes)*100} %')
print(f'Logistic Regression: {stats.variation(resultados_logistic)*100} %')
print(f'Random Forest:{stats.variation(resultados_forest)*100} %')

Naive Bayes: 1.0124407504578252 %
Logistic Regression: 1.5821056616884057 %
Random Forest:0.6107717773577396 %


## EXERCICIO - Validação cruzada

In [18]:
resultados_naive_bayes_cv = []
resultados_logistic_cv = []
resultados_forest_cv = []
for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)
    
    # Naive Bayes
    naive_bayes = GaussianNB()
    scores = cross_val_score(naive_bayes, X, y, cv=kfold)
    resultados_naive_bayes_cv.append(scores.mean())
    
    # Logisti Regression
    logistic = LogisticRegression()
    scores = cross_val_score(logistic, X, y, cv=kfold)
    resultados_logistic_cv.append(scores.mean())
    
    # Random Forest
    forest = RandomForestClassifier()
    scores = cross_val_score(forest, X, y, cv=kfold)
    resultados_forest_cv.append(scores.mean())

## Média

In [19]:
resultados_naive_bayes_cv = np.array(resultados_naive_bayes_cv)
resultados_logistic_cv = np.array(resultados_logistic_cv)
resultados_forest_cv = np.array(resultados_forest_cv)

In [20]:
print(f'Naive Bayes: {resultados_naive_bayes_cv.mean()}')
print(f'Logistic Regression: {resultados_logistic_cv.mean()}')
print(f'Random Forest:{resultados_forest_cv.mean()}')

Naive Bayes: 0.9249359296482411
Logistic Regression: 0.9137817420435511
Random Forest:0.9867640703517588


## Moda

In [21]:
print(f'Naive Bayes: {statistics.mode(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {statistics.mode(resultados_logistic_cv)}')
print(f'Random Forest:{statistics.mode(resultados_forest_cv)}')

Naive Bayes: 0.9248894472361808
Logistic Regression: 0.9113718592964826
Random Forest:0.9874748743718593


## Mediana

In [22]:
print(f'Naive Bayes: {np.median(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {np.median(resultados_logistic_cv)}')
print(f'Random Forest:{np.median(resultados_forest_cv)}')

Naive Bayes: 0.9248919597989949
Logistic Regression: 0.9130979899497487
Random Forest:0.9872273869346735


## Variância

In [23]:
print(f'Naive Bayes: {statistics.variance(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {statistics.variance(resultados_logistic_cv)}')
print(f'Random Forest:{statistics.variance(resultados_forest_cv)}')

Naive Bayes: 6.608178433320306e-07
Logistic Regression: 1.3004529114700778e-05
Random Forest:2.019993116683719e-06


## Desvio Padrão

In [24]:
print(f'Naive Bayes: {statistics.stdev(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {statistics.stdev(resultados_logistic_cv)}')
print(f'Random Forest:{statistics.stdev(resultados_forest_cv)}')

Naive Bayes: 0.0008129070324040939
Logistic Regression: 0.003606179295972509
Random Forest:0.0014212646188109092


## Coeficiente de variação

In [25]:
print(f'Naive Bayes: {stats.variation(resultados_naive_bayes_cv)*100} %')
print(f'Logistic Regression: {stats.variation(resultados_logistic_cv)*100} %')
print(f'Random Forest:{stats.variation(resultados_forest_cv)*100} %')

Naive Bayes: 0.08641071566366061 %
Logistic Regression: 0.38801026116292653 %
Random Forest:0.14161197789219637 %


## Seleção de atributos utilizando variância

In [26]:
base_selecao = {'a': np.random.rand(20), 
                'b': np.array([0.5] * 20), 
                'classe': np.random.randint(0, 2, size=20)}
base_selecao = pd.DataFrame(base_selecao)
base_selecao.head()

Unnamed: 0,a,b,classe
0,0.086426,0.5,1
1,0.249144,0.5,1
2,0.808004,0.5,0
3,0.424204,0.5,0
4,0.536505,0.5,1


In [27]:
base_selecao.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
a,20.0,0.415997,0.283576,0.078285,0.217089,0.34051,0.580445,0.967914
b,20.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5
classe,20.0,0.35,0.48936,0.0,0.0,0.0,1.0,1.0


In [28]:
np.var(base_selecao['a']), np.var(base_selecao['b'])

(0.0763943990873169, 0.0)

In [29]:
math.sqrt(np.var(base_selecao['a'])), math.sqrt(np.var(base_selecao['b']))

(0.2763953673405488, 0.0)

In [30]:
X = base_selecao.iloc[:, 0:2].values
y = base_selecao.iloc[:, 2].values
X.shape, y.shape

((20, 2), (20,))

In [31]:
selecao = VarianceThreshold(threshold=0.05)
X_novo = selecao.fit_transform(X)
X_novo.shape

(20, 1)

In [32]:
selecao.variances_

array([0.0763944, 0.       ])

In [33]:
indices = np.where(selecao.variances_ > 0.05)
indices

(array([0], dtype=int64),)

Registros com variancia zero pode ser excluido da base de dados!

## EXERCICIO

In [34]:
base_credit = pd.read_csv('Bases de dados/credit_data.csv')
base_credit.dropna(inplace=True)
print(base_credit.shape)
base_credit.head()

(1997, 5)


Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [35]:
base_credit.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
i#clientid,1997.0,1001.956935,576.702206,1.0,503.0,1002.0,1501.0,2000.0
income,1997.0,45333.864334,14325.131177,20014.48947,32804.904487,45788.7471,57787.565659,69995.685578
age,1997.0,40.807559,13.624469,-52.42328,28.990415,41.317159,52.58704,63.971796
loan,1997.0,4445.487716,3046.792457,1.37763,1936.813257,3977.287432,6440.861434,13766.051239
c#default,1997.0,0.141713,0.348842,0.0,0.0,0.0,0.0,1.0


In [36]:
print(f"income: {np.var(base_credit['income'])}")
print(f"age: {np.var(base_credit['age'])}")
print(f"loan: {np.var(base_credit['loan'])}")

income: 205106624.40660834
age: 185.53321543372624
loan: 9278295.832931679


In [53]:
X = base_credit.iloc[:, :4].values
y = base_credit.iloc[:, 4].values
X.shape, y.shape

((1997, 4), (1997,))

In [54]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X.shape

(1997, 4)

In [55]:
np.var(X[0]), np.var(X[1]), np.var(X[2])

(0.147779699597757, 0.09822573781519718, 0.13328359014656763)

In [58]:
selecao_credit = VarianceThreshold(threshold=0.08) # olhar a variancia dos atributos
X_novo = selecao_credit.fit_transform(X)
X_novo.shape

(1997, 2)

In [59]:
naive_sem_selecao = GaussianNB()
naive_sem_selecao.fit(X, y)
previsoes = naive_sem_selecao.predict(X)
accuracy_score(previsoes, y)

0.927391086629945

In [60]:
naive_com_selecao = GaussianNB()
naive_com_selecao.fit(X_novo, y)
previsoes_com = naive_com_selecao.predict(X_novo)
accuracy_score(previsoes_com, y)

0.8582874311467201

## Valores faltantes com média e moda

In [73]:
df_credit = pd.read_csv('Bases de dados/credit_data.csv')
df_credit.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [74]:
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   i#clientid  2000 non-null   int64  
 1   income      2000 non-null   float64
 2   age         1997 non-null   float64
 3   loan        2000 non-null   float64
 4   c#default   2000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 78.2 KB


In [75]:
df_credit.isnull().sum()

i#clientid    0
income        0
age           3
loan          0
c#default     0
dtype: int64

In [None]:
nulos = df_Credit

In [65]:
df_autos = pd.read_csv('Bases de dados/autos.csv', encoding='ISO-8859-1')
print(df_autos.shape)
df_autos.head()

(371528, 20)


Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [69]:
df_autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   dateCrawled          371528 non-null  object
 1   name                 371528 non-null  object
 2   seller               371528 non-null  object
 3   offerType            371528 non-null  object
 4   price                371528 non-null  int64 
 5   abtest               371528 non-null  object
 6   vehicleType          333659 non-null  object
 7   yearOfRegistration   371528 non-null  int64 
 8   gearbox              351319 non-null  object
 9   powerPS              371528 non-null  int64 
 10  model                351044 non-null  object
 11  kilometer            371528 non-null  int64 
 12  monthOfRegistration  371528 non-null  int64 
 13  fuelType             338142 non-null  object
 14  brand                371528 non-null  object
 15  notRepairedDamage    299468 non-nu

In [70]:
df_autos.isnull().sum()

dateCrawled                0
name                       0
seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType            37869
yearOfRegistration         0
gearbox                20209
powerPS                    0
model                  20484
kilometer                  0
monthOfRegistration        0
fuelType               33386
brand                      0
notRepairedDamage      72060
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
dtype: int64