# Bibliotecas

In [None]:
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

filterwarnings('ignore')

# Base de Dados

In [None]:
# Column names applied to the CSV, in file order; 'class' is the prediction target.
columns_name = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'class',
]
df = pd.read_csv(
    "Dados/adult.data",
    names=columns_name,
    index_col=False,
)

# Análise Exploratória de Dados

Explorar a base de dados para identificar outliers e o nível de separabilidade dos dados em relação às classes (gráfico de dispersão).

In [None]:
# Show the first row of the freshly loaded frame.
df.head(1)

## Tipos dos dados

In [None]:
# Column dtypes as inferred by read_csv, before any explicit cast.
df.dtypes

In [None]:
# Cast the columns listed below to pandas' 'category' dtype with a single
# astype call, then display the resulting dtypes.
colunas_categoricas = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'class',
]
df = df.astype({nome: 'category' for nome in colunas_categoricas})
df.dtypes

## Descrição dos dados

In [None]:
# Summary statistics produced by DataFrame.describe().
df.describe()

## Dados duplicados

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [None]:
# Rows still flagged as duplicates of an earlier row.
df[df.duplicated()]

## Preenchendo dados faltantes

In [None]:
# Print the name and the row count of every column that holds the " ?" placeholder.
for coluna in columns_name:
    mascara_faltantes = df[coluna] == " ?"
    n_faltantes = int(mascara_faltantes.sum())
    if n_faltantes > 0:
        print(coluna)
        print(n_faltantes)

Lista das categorias em ordem alfabética

In [None]:
# List the 'workclass' categories in their stored order.
# Read them straight from the categorical dtype: summing a whole grouped frame
# only to get its index fails on recent pandas, where sum() no longer skips
# the non-numeric (categorical) columns.
df["workclass"].astype("category").cat.categories.tolist()

Para cada atributo com dados faltantes, vamos preencher os valores utilizando interpolação; para isso, convertemos o atributo para numérico antes.

In [None]:
# For every attribute that contains the " ?" placeholder, build a numeric
# "<attribute>-num" column in which the placeholder rows are filled with the
# code of the nearest non-missing row.
MISSING_TOKEN = " ?"
atr_faltantes = ["workclass", "occupation", "native-country"]
for atr in atr_faltantes:
    # Take the category list from the dtype; groupby(...).sum() over the whole
    # frame breaks on recent pandas because of the categorical columns.
    categorias_atr = df[atr].astype("category").cat.categories.tolist()
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(categorias_atr)
    codigos = pd.Series(
        label_encoder.transform(df[atr]),
        index=df.index,
        dtype="float64",
    )
    # Blank the placeholder rows by value instead of assuming " ?" is code 0.
    codigos = codigos.mask(df[atr] == MISSING_TOKEN)
    # 'nearest' does not extrapolate, so gaps at the very start or end of the
    # column would stay NaN; ffill/bfill close them.
    df[f"{atr}-num"] = codigos.interpolate(method="nearest").ffill().bfill()

In [None]:
# Show the first rows, now including the "<attribute>-num" columns added above.
df.head()

## Checando outliers

In [None]:
# Box plot of 'hours-per-week'; titled and labelled so the figure stands alone.
ax = df['hours-per-week'].plot.box()
ax.set(title="Box plot: hours-per-week", ylabel="hours-per-week");

In [None]:
# Histogram of 'hours-per-week'; titled and labelled so the figure stands alone.
ax = df['hours-per-week'].hist()
ax.set(title="Histogram: hours-per-week", xlabel="hours-per-week", ylabel="rows");

In [None]:
# Box plot of 'capital-gain'; titled and labelled so the figure stands alone.
ax = df['capital-gain'].plot.box()
ax.set(title="Box plot: capital-gain", ylabel="capital-gain");

In [None]:
# Histogram of 'capital-gain'; titled and labelled so the figure stands alone.
ax = df['capital-gain'].hist()
ax.set(title="Histogram: capital-gain", xlabel="capital-gain", ylabel="rows");

In [None]:
# Box plot of 'capital-loss'; titled and labelled so the figure stands alone.
ax = df['capital-loss'].plot.box()
ax.set(title="Box plot: capital-loss", ylabel="capital-loss");

In [None]:
# Histogram of 'capital-loss'; titled and labelled so the figure stands alone.
ax = df['capital-loss'].hist()
ax.set(title="Histogram: capital-loss", xlabel="capital-loss", ylabel="rows");

In [None]:
#q1 = dados['idade_log'].quantile(q=0.25)
#q3 = dados['idade_log'].quantile(q=0.75)
#iqr = q3 - q1
#print(iqr)

## Colunas redundantes

In [None]:
# Show the first rows of the frame.
df.head()

education e education-num significam a mesma coisa, vamos utilizar education-num e dropar education (education-num já é a codificação ordinal de education)

In [None]:
# Number of rows per distinct 'education' value.
df['education'].value_counts()

In [None]:
# Number of rows per distinct 'education-num' value, for comparison with 'education'.
df['education-num'].value_counts()

## TO DO: Plotar região

## Frequência das variáveis categóricas (Value counts)

In [None]:
# Number of rows per distinct 'workclass' value.
df['workclass'].value_counts()

In [None]:
# Number of rows per distinct 'education' value.
df['education'].value_counts()

In [None]:
# Number of rows per distinct 'marital-status' value.
df['marital-status'].value_counts()

In [None]:
# Number of rows per distinct 'occupation' value.
df['occupation'].value_counts()

In [None]:
# Number of rows per distinct 'relationship' value.
df['relationship'].value_counts()

In [None]:
# Number of rows per distinct 'race' value.
df['race'].value_counts()

In [None]:
# Number of rows per distinct 'sex' value.
df['sex'].value_counts()

In [None]:
# Number of rows per distinct 'native-country' value.
df['native-country'].value_counts()

In [None]:
# Number of rows per target class.
df['class'].value_counts()

# Carregando conjunto de teste

Devemos tratar a base de teste? Devemos remover as interrogações (" ?")? Sim, mas confirmar com o professor.

In [None]:
# Read the test file with the same column names as the training frame,
# skipping its first line, and preview the first rows.
df_test = pd.read_csv(
    "Dados/adult.test",
    names=columns_name,
    index_col=False,
    skiprows=1,
)
df_test.head()

In [None]:
# Print, for every column of the test frame, how many rows hold the " ?" placeholder.
# The matching rows are only counted here; nothing is removed.
for coluna in columns_name:
    mascara_faltantes = df_test[coluna] == " ?"
    print(coluna)
    print(int(mascara_faltantes.sum()))

## Codificação das variáveis categóricas (variáveis nominais, faremos One Hot Encoder)

In [None]:
# Show the first rows of the frame.
df.head()

In [None]:
# TODO: decide how the " ?" placeholders in 'workclass' should be filled.
# The original cell was an unfinished `fillna` stub that did not parse and
# stopped "Run All"; until the strategy is chosen, just show how many rows
# carry the placeholder.
(df['workclass'] == " ?").sum()

In [None]:
#from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
#"workclass","marital-status", "occupation", "relationship", "race", "sex", "native-country"


In [None]:
# creating instance of one-hot-encoder
from sklearn.preprocessing import OneHotEncoder
#enc = OneHotEncoder(handle_unknown='ignore')

In [None]:
# passing bridge-types-cat column (label encoded values of bridge_types)
#enc_df = pd.DataFrame(enc.fit_transform(df[['workclass']]).toarray())

In [None]:
# merge with main df bridge_df on key values
#bridge_df = df.join(enc_df)
#bridge_df

In [None]:
# One-hot encode the nominal columns and append the indicator columns to df.
colunas_cat = ["workclass", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

for coluna in colunas_cat:
    df_coluna = pd.get_dummies(df[coluna], prefix=coluna)
    # join() raises on overlapping column names, so only append indicators
    # that are not in df yet; this keeps the cell safe to re-run.
    colunas_novas = [nome for nome in df_coluna.columns if nome not in df.columns]
    df = df.join(df_coluna[colunas_novas])

# NOTE(review): the raw categorical columns stay in df next to their indicators,
# and df_test receives no equivalent encoding in this cell.
df.head()

In [None]:
# Number of rows per 'native-country' value. The column is named explicitly
# instead of relying on the loop variable left over from the encoding cell,
# whose value depends on which cell happened to run last.
df["native-country"].value_counts()

## Separando dados

In [None]:
# Separate features and target for the train and test frames.
# drop() takes the labels as ONE list-like; passing two positional labels
# together with axis=1 raises a TypeError, so use columns=[...] instead.
colunas_descartadas = ["class", "education"]

X_train = df.drop(columns=colunas_descartadas).to_numpy()
y_train = df["class"].values
# NOTE(review): df_test has not received the "-num" and one-hot columns added
# to df, so X_train and X_test do not share a column layout yet.
X_test = df_test.drop(columns=colunas_descartadas).to_numpy()
y_test = df_test["class"].values

# Codificando e normalizando

- Como devemos tratar os dados? Temos variáveis contínuas e categóricas.
- Como tratar de forma igual o conjunto de treino e o de teste? Pode ocorrer erro na hora de codificar as variáveis categóricas.
- Como codificar as variáveis categóricas? Com One Hot Encoding ou com Label Encoder?

In [None]:
# Encode the target as integers. The encoder is fitted on the training labels
# only and then reused for the test labels, so both splits are guaranteed to
# share one label -> integer mapping (refitting on the test labels would build
# an independent mapping that only matches by coincidence).
label_encoder = preprocessing.LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
# NOTE(review): the UCI adult.test file is known to end each label with a '.';
# stripping it is a no-op if the local copy has none - confirm against the file.
y_test_limpo = pd.Series(y_test).astype(str).str.rstrip(".")
y_test = label_encoder.transform(y_test_limpo)

In [None]:
# Min-max scale the features. The scaler learns each column's min/max from the
# training matrix only and applies those same bounds to the test matrix;
# scaling each split with its own min/max would map equal raw values to
# different scaled values in train and test.
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# KNN (Livy)

# Árvore de decisão simples (Priscilla)

# Random Forest (Lucas)

In [None]:
from sklearn.model_selection import GridSearchCV

# Seed shared by every RandomForestClassifier below so runs are reproducible.
# It was referenced but never assigned anywhere in the visible notebook.
SEED = 42

# Hyper-parameters to search and their candidate values.
# "auto" was dropped from max_features: for classifiers it was an alias of
# "sqrt" (so it only duplicated work) and current scikit-learn rejects it.
parameters = {
    "n_estimators": range(10, 301, 10),
    "criterion": ["gini", "entropy"],
    "max_features": ["sqrt", "log2"],
}

# Note that this search takes considerably longer than fitting a single model.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=SEED),
    parameters,
    scoring="f1_weighted",
    n_jobs=-1,
    verbose=4,
    cv=3,  # an integer cv means that many stratified folds for a classifier
)

grid_search.fit(X_train, y_train)


In [None]:
# Parameter combination that obtained the best score in the grid search.
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Compare the tuned forest against the default one on the test split.
# The helpers `score_model` and `f1_weighted` are not defined anywhere in the
# visible notebook, so the scores are computed with scikit-learn directly.
model = RandomForestClassifier(**best_params, random_state = SEED)

# NOTE(review): assumes `score_model` was meant to be a cross-validated
# weighted F1 on the training data, matching the grid search (3 folds) - confirm.
cv_scores = model_selection.cross_val_score(
    model, X_train, y_train, scoring="f1_weighted", cv=3
)
print("Train >> ", cv_scores.mean())
model.fit(X_train, y_train)
print("Test score pós-validação: ", f1_score(y_test, model.predict(X_test), average="weighted"))

# Baseline: same seed, default hyper-parameters.
model = RandomForestClassifier(random_state = SEED)
model.fit(X_train, y_train)
print("Test score PRÉ-validação: ", f1_score(y_test, model.predict(X_test), average="weighted"))

# Rede neural MLP (Mari)

# Comitê de Redes Neurais (Laianna)