# Dados desbalanceados

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as srn

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Preparação dos dados

In [None]:
# Load the census dataset from the project's data folder.
# A forward slash keeps the relative path portable: the previous
# backslash form relied on the invalid escape sequence '\c' and only
# resolved on Windows.
base_census = pd.read_csv('dados/census.csv')
print(base_census.shape)
base_census.head()

In [None]:
# Class distribution of the target: unique 'income' labels and how many rows carry each.
np.unique(base_census['income'], return_counts=True)

Temos dados desbalanceados, pois há muito mais '<=50K' do que '>50K', cerca de 3x.

In [None]:
# Bar chart with the number of rows per 'income' class.
srn.countplot(data=base_census, x='income')

In [None]:
# Features are the columns at positions 0-13; the target is the column at position 14.
X_census, y_census = (
    base_census.iloc[:, :14].values,
    base_census.iloc[:, 14].values,
)
X_census.shape, y_census.shape

In [None]:
# Inspect the feature array before the label encoding applied in the next cell.
X_census

In [None]:
# Label-encode the categorical attributes, using one dedicated encoder per column.

label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

# (column position, encoder) pairs: each listed column of X_census is
# replaced in place by the integer codes produced by its encoder.
for column_index, encoder in (
    (1, label_encoder_workclass),
    (3, label_encoder_education),
    (5, label_encoder_marital),
    (6, label_encoder_occupation),
    (7, label_encoder_relationship),
    (8, label_encoder_race),
    (9, label_encoder_sex),
    (13, label_encoder_country),
):
    X_census[:, column_index] = encoder.fit_transform(X_census[:, column_index])

In [None]:
# Inspect the feature array again, now that the label encoders have been applied.
X_census

## Subamostragem com TomekLinks

In [None]:
# Dependency note: the `imblearn` imports used below are provided by the
# imbalanced-learn distribution. If it is missing from the environment:
# %pip install imbalanced-learn

In [None]:
from imblearn.under_sampling import TomekLinks

In [None]:
# Undersample the majority class with Tomek links.
# fit_resample is the current imbalanced-learn sampler API; the older
# fit_sample alias is no longer available in current releases and would
# raise AttributeError here.
tl = TomekLinks(sampling_strategy='majority')
X_under, y_under = tl.fit_resample(X_census, y_census)
X_under.shape, y_under.shape

In [None]:
# Class counts in the original target, for comparison with the resampled one.
np.unique(y_census, return_counts=True)

In [None]:
# Class counts after the TomekLinks undersampling.
np.unique(y_under, return_counts=True)

In [None]:
# One-hot encode the categorical columns of the undersampled features.
# The encoder is fitted on X_under (not X_census) so that the matrix split
# in the next cell is the encoded one, and X_census keeps its label-encoded
# form for the oversampling section further down.
# NOTE: this rebinds X_under, so re-run the TomekLinks cell before re-running
# this one; otherwise already-encoded columns would be encoded again.
onehotencoder = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1,3,5,6,7,8,9,13])], remainder='passthrough')
X_under = onehotencoder.fit_transform(X_under).toarray()
X_under.shape

In [None]:
# Train/test split of the undersampled data: 15% held out, fixed seed.
(
    X_census_train_under,
    X_census_test_under,
    y_census_train_under,
    y_census_test_under,
) = train_test_split(X_under, y_under, test_size=0.15, random_state=0)
print('train:', X_census_train_under.shape, y_census_train_under.shape)
print('test:', X_census_test_under.shape, y_census_test_under.shape)

In [None]:
# Random forest fitted on the undersampled training split.
random_forest_census = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=0,
)
random_forest_census.fit(X_census_train_under, y_census_train_under)

In [None]:
# Predict the held-out undersampled split and report its accuracy.
previsoes = random_forest_census.predict(X_census_test_under)
accuracy_score(y_true=y_census_test_under, y_pred=previsoes)

## Sobreamostragem com SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Shapes of the feature matrix and target that are passed to SMOTE in the next cell.
X_census.shape, y_census.shape

In [None]:
# Oversample the minority class with SMOTE.
# fit_resample is the current imbalanced-learn sampler API (the fit_sample
# alias is no longer available), and random_state pins the synthetic samples
# so the cell gives the same result on every run, matching the seed used by
# the split and the classifier.
# NOTE(review): SMOTE builds new rows by interpolating between neighbours, so
# encoded categorical columns can receive in-between values; SMOTENC may be
# the better fit for this data - confirm.
smote = SMOTE(sampling_strategy='minority', random_state=0)
X_over, y_over = smote.fit_resample(X_census, y_census)
X_over.shape, y_over.shape

In [None]:
# Class counts in the original target, for comparison with the oversampled one.
np.unique(y_census, return_counts=True)

In [None]:
# Class counts after the SMOTE oversampling.
np.unique(y_over, return_counts=True)

In [None]:
# One-hot encode the listed column positions of the oversampled features.
# NOTE(review): the encoded matrix is stored in X_census (overwriting it),
# while the train/test split in the next cell reads X_over - so this result
# does not appear to reach the model. Confirm which matrix is meant to be
# trained on.
# NOTE(review): X_over is produced by SMOTE, so these columns may no longer
# hold the original category codes - verify before relying on this encoding.
onehotencoder = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1,3,5,6,7,8,9,13])], remainder='passthrough')
X_census = onehotencoder.fit_transform(X_over).toarray()
X_census.shape

In [None]:
# Train/test split of the oversampled data: 15% held out, fixed seed.
(
    X_census_train_over,
    X_census_test_over,
    y_census_train_over,
    y_census_test_over,
) = train_test_split(X_over, y_over, test_size=0.15, random_state=0)
print('train:', X_census_train_over.shape, y_census_train_over.shape)
print('test:', X_census_test_over.shape, y_census_test_over.shape)

In [None]:
# Random forest fitted on the oversampled training split.
# The fit call uses the *_train_over arrays produced by the split cell; the
# previous *_train_cover names were never defined and raised NameError.
random_forest_census = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=0)
random_forest_census.fit(X_census_train_over, y_census_train_over)

In [None]:
# Predict the held-out oversampled split and report its accuracy.
previsoes = random_forest_census.predict(X_census_test_over)
accuracy_score(y_true=y_census_test_over, y_pred=previsoes)