# Modelos ML e distribuições

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as srn
srn.set()
%matplotlib inline

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Naïve Bayes e distribuições

### Bernoulli

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Read the census CSV, print its (rows, columns) size and preview the first rows.
census_csv = 'Bases de dados/census.csv'
df_census = pd.read_csv(census_csv)
print(df_census.shape)
df_census.head()

In [None]:
# Count how many rows fall into each distinct value of the 'sex' column.
df_census['sex'].value_counts()

Variável X:

In [None]:
# Raw categorical labels taken from the 'sex' column.
sex_labels = df_census['sex'].values
print(np.unique(sex_labels, return_counts=True))

# Encode the labels as integer codes.
label_encoder = LabelEncoder()
sex_codes = label_encoder.fit_transform(sex_labels)
print(np.unique(sex_codes, return_counts=True), end='\n\n')

# scikit-learn estimators expect a 2-D feature matrix, so the 1-D vector of
# codes is turned into a single-column matrix.
print(sex_codes.shape)  # vector
X = sex_codes.reshape(-1, 1)
print(X.shape)  # matrix

Variável y:

In [None]:
# Target vector: the 'income' column as an array; the last line displays its shape.
y = df_census['income'].values
y.shape

In [None]:
# Histogram of X without the KDE curve; the trailing semicolon hides the
# notebook's textual repr of the returned object.
srn.histplot(X, kde=False);

(isso é tão errado, mas tudo bem né, só estamos 'testando')

In [None]:
# Fit a Bernoulli Naive Bayes model on X/y (fit() returns the estimator itself)
# and predict on the very same samples used for fitting.
bernoulli_nb = BernoulliNB().fit(X, y)
previsoes = bernoulli_nb.predict(X)
previsoes.shape

In [None]:
# Accuracy of the predictions against y. The predictions were made on the
# same data the model was fitted on, so this is a training-set accuracy.
accuracy_score(y, previsoes)

### Multinomial

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Names of the columns at these positions (the same positions are selected
# as features in the next cell).
df_census.columns.values[[1, 3, 5, 6, 7, 8, 13]]

In [None]:
# Positional indices of the feature columns and of the target column.
feature_positions = [1, 3, 5, 6, 7, 8, 13]
target_position = 14

X_census = df_census.iloc[:, feature_positions].values
y_census = df_census.iloc[:, target_position].values
X_census.shape, y_census.shape

In [None]:
# One LabelEncoder per feature column, each kept under its own name.
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_country = LabelEncoder()

# Encoders listed in the same order as the columns of X_census.
encoders = (
    label_encoder_workclass,
    label_encoder_education,
    label_encoder_marital,
    label_encoder_occupation,
    label_encoder_relationship,
    label_encoder_race,
    label_encoder_country,
)

# Replace every column, in place, with its integer-encoded version.
for col, enc in enumerate(encoders):
    X_census[:, col] = enc.fit_transform(X_census[:, col])

In [None]:
# Fit a Multinomial Naive Bayes model on the encoded census features
# (fit() returns the estimator itself) and predict on the same samples.
multinomial_nb = MultinomialNB().fit(X_census, y_census)
previsoes = multinomial_nb.predict(X_census)
previsoes.shape

In [None]:
# Accuracy of the predictions against y_census. The predictions were made on
# the same data the model was fitted on, so this is a training-set accuracy.
accuracy_score(y_census, previsoes)

## Algoritmo k-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

### Sem padronização

In [None]:
# Load the credit dataset, discard rows with missing values, then print the
# resulting (rows, columns) size and preview the first rows.
df_credit = pd.read_csv('Bases de dados/credit_data.csv').dropna()
print(df_credit.shape)
df_credit.head()

In [None]:
# Features: columns at positions 1-3; target: column at position 4.
# NOTE(review): the column meanings are not visible here — confirm against the CSV header.
X = df_credit.iloc[:,1:4].values
y = df_credit.iloc[:,4].values
X.shape, y.shape

In [None]:
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Report the shapes of both splits.
print(f'Treino: \n X: {X_train.shape} y: {y_train.shape}\n')
print(f'Teste: \n X: {X_test.shape} y: {y_test.shape}\n')

In [None]:
# Mean, median and standard deviation of the first feature column.
# `X[:, 0]` selects one column across all samples; `X[0]` would instead pick
# the first sample (a single row mixing every feature), which says nothing
# about the scale of a feature.
print(f'Treino: {np.mean(X_train[:, 0]), np.median(X_train[:, 0]), np.std(X_train[:, 0])}')
print(f'Teste: {np.mean(X_test[:, 0]), np.median(X_test[:, 0]), np.std(X_test[:, 0])}')

In [None]:
# k-NN with default settings, fitted on the unscaled training split
# (fit() returns the estimator itself) and evaluated on the test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
previsoes = knn.predict(X_test)
previsoes.shape

In [None]:
# Test-set accuracy of the k-NN model trained on the unscaled features.
accuracy_score(y_test, previsoes)

### Com padronização

In [None]:
# Standardize the features with z-scores. The scaler is fitted on the training
# split only and then reused to transform the test split: fitting a second
# scaler on the test data would leak test-set statistics into preprocessing
# and put the two splits on different scales.
z_score_train = StandardScaler()
# Kept as an alias so any code referring to `z_score_test` keeps working.
z_score_test = z_score_train

X_train_p = z_score_train.fit_transform(X_train)
X_test_p = z_score_test.transform(X_test)

X_train_p.shape, X_test_p.shape

In [None]:
# Mean, median and standard deviation - standardized data.
# No axis is given, so each statistic is computed over all values of the
# matrix (every feature pooled together).
print(f'Treino: {np.mean(X_train_p), np.median(X_train_p), np.std(X_train_p)}')
print(f'Teste: {np.mean(X_test_p), np.median(X_test_p), np.std(X_test_p)}')

In [None]:
# Same default k-NN, now fitted on the standardized training split
# (fit() returns the estimator itself) and evaluated on the standardized test split.
knn = KNeighborsClassifier().fit(X_train_p, y_train)
previsoes_p = knn.predict(X_test_p)
previsoes_p.shape

In [None]:
# Test-set accuracy of the k-NN model trained on the standardized features.
accuracy_score(y_test, previsoes_p)

Bem melhor!

## Dados enviesados e ML
- Utilizando Regressão Linear

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Read the house-prices CSV, print its (rows, columns) size and preview the first rows.
houses_csv = 'Bases de dados/house_prices.csv'
df_houses = pd.read_csv(houses_csv)
print(df_houses.shape)
df_houses.head()

In [None]:
# Histogram of the 'price' column with a KDE curve overlaid.
srn.histplot(df_houses['price'], kde=True);

In [None]:
# Histogram of the 'sqft_living' column with a KDE curve overlaid.
srn.histplot(df_houses['sqft_living'], kde=True);

In [None]:
# Histogram of the 'sqft_lot' column with a KDE curve overlaid.
srn.histplot(df_houses['sqft_lot'], kde=True);

In [None]:
# Selecting with a list of column names yields a DataFrame, whose values are
# already a 2-D (n_samples, 1) matrix as scikit-learn expects.
X = df_houses[['sqft_living']].values  # matrix
y = df_houses['price'].values
X.shape, y.shape

In [None]:
# Ordinary least-squares fit of y on X (fit() returns the estimator itself),
# followed by predictions for the same samples used in the fit.
regressor = LinearRegression().fit(X, y)
previsoes = regressor.predict(X)
previsoes.shape

In [None]:
# Histogram of the model's predictions with a KDE curve overlaid.
srn.histplot(previsoes, kde=True);

In [None]:
# Mean absolute error between y and the predictions, in the units of y.
mean_absolute_error(y, previsoes)

In [None]:
r2_score(y, previsoes) # the closer to 1, the better the result

### Com tratamento

In [None]:
# Natural-log transform of both the feature matrix and the target.
# NOTE(review): np.log gives -inf/NaN for values <= 0 — assumes every value
# is strictly positive; confirm for this dataset.
X_novo = np.log(X)
y_novo = np.log(y)
X_novo.shape, y_novo.shape

In [None]:
# Histogram of the log-transformed feature with a KDE curve overlaid.
srn.histplot(X_novo, kde=True);  # NOTE(review): expected to look closer to a normal distribution after the log — confirm on the plot

In [None]:
# Linear regression fitted on the log-transformed feature and target
# (fit() returns the estimator itself); predictions are on the log scale.
regressor_novo = LinearRegression().fit(X_novo, y_novo)
previsoes_novo = regressor_novo.predict(X_novo)
previsoes_novo.shape

In [None]:
# MAE between the log-transformed target and the log-scale predictions.
# y_novo is np.log(y), so this value is in log units and is not directly
# comparable to an MAE computed on the untransformed target.
mean_absolute_error(y_novo, previsoes_novo)

In [None]:
# R² of the regression fitted on the log-transformed data.
r2_score(y_novo, previsoes_novo)

## Redes Neurais