# Projeto Final - Modelos preditivos - Dataset do Censo

## Grupo:
- Lucas Natan Correia Couri
- Mariama Celi Serafim de Oliveira
- Laianna Lana Virginio da Silva
- Priscilla Amarante de Lima
- Liviany Reis Rodrigues

# Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from scipy import stats
from sklearn import model_selection
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

# Base de Dados

In [None]:
# Seed reused by the searches and models further down so runs are repeatable.
SEED = 6138
# Column names are supplied explicitly via `names=` when reading the CSV files.
columns_name = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
# Training split.
df = pd.read_csv("Dados/adult.data", names=columns_name, index_col=False)

In [None]:
df['native-country'].value_counts()

# Análise Exploratória de Dados

Explorar a base de dados para identificar outliers e avaliar o nível de separabilidade dos dados em relação às classes (gráfico de dispersão).

In [None]:
df.head(1)

## Tipos dos dados

In [None]:
df.dtypes

In [None]:
# Cast every nominal attribute (and the target) to pandas' categorical dtype.
colunas_categoricas = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'class',
]
for nome_coluna in colunas_categoricas:
    df[nome_coluna] = df[nome_coluna].astype('category')
# Show the resulting dtypes as the cell output.
df.dtypes

## Resolvendo o problema da Holanda

No dataset de treino há apenas uma observação com native-country=" Holand-Netherlands". Diante do tamanho do dataset (mais de 30 mil linhas), optou-se por remover essa única linha, de forma a evitar problemas de ausência desse valor no dataset de teste.

In [None]:
df = df[df['native-country']!=" Holand-Netherlands"]

## Descrição dos dados

In [None]:
df.describe()

## Dados duplicados

In [None]:
df.drop_duplicates(inplace = True)

In [None]:
df[df.duplicated()]

## Preenchendo dados faltantes

In [None]:
def tratamento_faltantes(df, columns_name, missing_token=" ?"):
    """Report and impute the missing-value marker of the categorical columns.

    For every column in ``columns_name`` that contains ``missing_token`` the
    column name and the number of affected rows are printed.

    Then, for "workclass", "occupation" and "native-country", a numeric
    ``"<column>-num"`` column is added to ``df`` in place:

    * each distinct value present in the column gets an integer code, in
      sorted order of the values (codes are stored as floats);
    * rows holding ``missing_token`` become NaN and are filled with the code
      of the nearest row that has a value (``interpolate(method='nearest')``);
    * missing rows at the very start/end, which nearest interpolation leaves
      untouched, are closed with a backward/forward fill.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; must contain the three columns above.
    columns_name : iterable of str
        Columns to scan for the missing marker in the printed report.
    missing_token : str, default " ?"
        String that marks a missing value.

    Returns
    -------
    None
    """
    # Report the columns that contain the missing marker and how many rows.
    for coluna in columns_name:
        n_faltantes = int((df[coluna] == missing_token).sum())
        if n_faltantes > 0:
            print(coluna)
            print(n_faltantes)

    atr_faltantes = ["workclass", "occupation", "native-country"]
    for atr in atr_faltantes:
        valores = df[atr].to_numpy(dtype=object)
        # Codes come from the values actually present, so unused categorical
        # levels do not consume a code.
        _, codigos = np.unique(valores, return_inverse=True)
        codigos = codigos.astype(float)
        # Blank out only the rows that really hold the marker, instead of
        # whatever value happens to be encoded as 0.
        codigos[valores == missing_token] = np.nan

        codificado = pd.Series(codigos, index=df.index)
        if codificado.isna().any():
            codificado = codificado.interpolate(method='nearest').ffill().bfill()
        df[f"{atr}-num"] = codificado

In [None]:
tratamento_faltantes(df, columns_name)

In [None]:
#for coluna in columns_name:
#    if len(df[df[coluna] == " ?"]) > 0:
#        print(coluna)
#        print(len(df[df[coluna] == " ?"]))

Para cada atributo que tem dados faltantes, vamos preencher os valores utilizando interpolação; para isso, convertemos o atributo para numérico antes.

In [None]:
#atr_faltantes = ["workclass", "occupation", "native-country"]
#for atr in atr_faltantes:
#    categorias_atr = df.groupby(atr).sum().index.tolist()
#    label_encoder = preprocessing.LabelEncoder()
#    label_encoder.fit(categorias_atr)
#    df[f"{atr}-num"] = label_encoder.transform(df[atr])
#    df[f"{atr}-num"] = df[f"{atr}-num"].replace(0, np.nan)
#    df[f"{atr}-num"] = df[f"{atr}-num"].interpolate(method='nearest')

In [None]:
df.head()

## Checando outliers

In [None]:
df['hours-per-week'].plot.box()

In [None]:
df['hours-per-week'].hist()

In [None]:
df['capital-gain'].plot.box()

In [None]:
df['capital-gain'].hist()

In [None]:
df['capital-loss'].plot.box()

In [None]:
df['capital-loss'].hist()

In [None]:
#q1 = dados['idade_log'].quantile(q=0.25)
#q3 = dados['idade_log'].quantile(q=0.75)
#iqr = q3 - q1
#print(iqr)

## Colunas redundantes

In [None]:
df.head()

education e education-num significam a mesma coisa, vamos utilizar education-num e dropar education (education-num já é a codificação ordinal de education)

In [None]:
df['education'].value_counts()

In [None]:
df['education-num'].value_counts()

## TO DO: Plotar região

## Frequência das variáveis categóricas (Value counts)

In [None]:
df['workclass'].value_counts()

In [None]:
df['education'].value_counts()

In [None]:
df['marital-status'].value_counts()

In [None]:
df['occupation'].value_counts()

In [None]:
df['relationship'].value_counts()

In [None]:
df['race'].value_counts()

In [None]:
df['sex'].value_counts()

In [None]:
df['native-country'].value_counts()

In [None]:
df['class'].value_counts()

In [None]:
# df_test['class'].value_counts()

# Carregando e processando conjunto de teste

In [None]:
# Load the test split with the same column names as the training data;
# skiprows=1 discards the first line of the file before parsing.
df_test = pd.read_csv("Dados/adult.test", names=columns_name, index_col=False, skiprows=1)
df_test.head()

In [None]:
tratamento_faltantes(df_test, columns_name)

## Codificação das variáveis categóricas (variáveis nominais, faremos One Hot Encoder)

In [None]:
def onehot_encoder(df):
    """Return a copy of ``df`` extended with one-hot columns.

    Each nominal attribute listed below is expanded with ``pd.get_dummies``
    (new columns are named ``"<attribute>_<value>"``) and joined to the frame.
    The name of each attribute is printed as it is processed. The source
    columns are kept and the input frame is not modified.
    """
    nominais = (
        "workclass-num",
        "marital-status",
        "occupation-num",
        "relationship",
        "race",
        "sex",
        "native-country-num",
    )
    codificado = df
    for nome in nominais:
        print(nome)
        indicadores = pd.get_dummies(codificado[nome], prefix=nome)
        codificado = codificado.join(indicadores)
    return codificado

In [None]:
df = onehot_encoder(df)
df.head()

In [None]:
df_test = onehot_encoder(df_test)
df_test.head()

## Normalizando variáveis contínuas

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Continuous attributes that are min-max scaled.
colunas_continuas = ["age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week", "education-num"]

normalize = MinMaxScaler()
# Learn the per-column min/max from the training data only ...
df[colunas_continuas] = normalize.fit_transform(df[colunas_continuas])
# ... and apply that same mapping to the test data. Re-fitting on the test set
# would scale it with its own min/max, so train and test features would differ.
df_test[colunas_continuas] = normalize.transform(df_test[colunas_continuas])

In [None]:
df[["age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week", "education-num"]]

## Dividindo conjuntos de dados

In [None]:
# Columns kept out of the feature matrices: the target plus every raw or
# intermediate categorical column.
colunas_descartadas = [
    "class",
    "education",
    "workclass",
    "workclass-num",
    "marital-status",
    "occupation",
    "occupation-num",
    "relationship",
    "race",
    "sex",
    "native-country",
    "native-country-num",
]

# Feature matrices as plain NumPy arrays; targets taken from the "class" column.
X_train = df.drop(columns=colunas_descartadas).to_numpy()
y_train = df["class"].values
X_test = df_test.drop(columns=colunas_descartadas).to_numpy()
y_test = df_test["class"].values

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

# Fit the label -> integer mapping on the training labels only. Reading from
# the frames (rather than from y_train / y_test) keeps this cell re-runnable.
y_train = label_encoder.fit_transform(df["class"].astype(str))

# Reuse the training mapping for the test labels instead of re-fitting, so the
# same integer always means the same class and `label_encoder.classes_` keeps
# the training labels.
# NOTE(review): the UCI "adult.test" file is known to suffix each label with a
# "." (e.g. " >50K."); the suffix is stripped so the labels match the training
# ones. Confirm against the actual file; rstrip is a no-op if no dot is present.
y_test = label_encoder.transform(df_test["class"].astype(str).str.rstrip("."))

In [None]:
df.drop(["class", "education", "workclass", "workclass-num","marital-status", "occupation", "occupation-num", "relationship", "race", "sex", "native-country", "native-country-num"], axis = 1)

# Testando validação

In [None]:
# Step 7: run a grid/random search to find the best hyper-parameters of each model.
# Estimator to tune.
decisionTree = DecisionTreeClassifier()
#svc = SVC()

# Evaluation protocol: 10-fold stratified cross-validation.
cv = model_selection.StratifiedKFold(n_splits=10)

# Search space for the decision tree.
space = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 5, 7],
    'max_depth': [3, 5, 6, 7, 9, 11, 13, 15, 17, 19],
    'min_samples_leaf': [2, 3],
}

# Randomized search: 50 sampled configurations scored by accuracy.
search = RandomizedSearchCV(
    decisionTree,
    space,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
    random_state=SEED,
)

# Run the search on the training data.
result_tree = search.fit(X_train, y_train)

# Report the best configuration found.
print('=========Random Search Results for TREE==========')
print(f'Best Score: {result_tree.best_score_}')
print(f'Best Hyperparameters: {result_tree.best_params_}')

In [None]:
# Rebuild the tree with the best hyper-parameters found by the search above.
decisionTree = DecisionTreeClassifier(**result_tree.best_params_, random_state=SEED)

# NOTE: this rebinds `result_tree` from the search result to the fitted tree.
result_tree = decisionTree.fit(X_train, y_train)

# Per-class precision/recall/F1 on the test set.
print(classification_report(y_test, decisionTree.predict(X_test)))

# KNN (Livy)

# Árvore de decisão simples (Priscilla)

In [None]:
from sklearn.model_selection import GridSearchCV
# Estimator to tune.
decisionTree = DecisionTreeClassifier()

# Evaluation protocol: 10-fold stratified cross-validation.
cv = model_selection.StratifiedKFold(n_splits=10)

# Search space for the decision tree.
space = dict()
space['criterion'] = ['gini', 'entropy']
space['min_samples_split'] = [2,15,5,22]
space['max_depth'] = range(1,60)
space['min_samples_leaf'] = [2, 6]


# Exhaustive grid search over the space above, scored by accuracy.
search = GridSearchCV(decisionTree, space, scoring='accuracy', n_jobs=-1, cv=cv, verbose=4)


# Run the search on the training data.
result_tree = search.fit(X_train, y_train)

# Report the best configuration found. The heading now names the search that
# was actually run (it previously said "Random Search").
print('=========Grid Search Results for TREE==========')
print('Best Score: %s' % result_tree.best_score_)
print('Best Hyperparameters: %s' % result_tree.best_params_)

In [None]:
# Instantiate the tree with hyper-parameters written out by hand (they are not
# read from `result_tree.best_params_`) and fit it on the training data.
tree_classifier = DecisionTreeClassifier(criterion='entropy', max_depth= 11, min_samples_leaf= 3, min_samples_split= 7, random_state=SEED)
model = tree_classifier.fit(X_train, y_train)

In [None]:
text_representation = tree.export_text(tree_classifier)
print(text_representation)

In [None]:
# Plot the best tree.
# Feature names are recovered by dropping the same columns that were dropped
# when X_train was built.
feature_names=df.drop(["class", "education", "workclass", "workclass-num","marital-status", "occupation", "occupation-num", "relationship", "race", "sex", "native-country", "native-country-num"], axis = 1).columns

fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(tree_classifier, 
                   feature_names=feature_names,  
                  #  class_names=list(label_encoder.classes_),
                   filled=True)

In [None]:
# Illustrated decision tree.
from dtreeviz.trees import dtreeviz # remember to load the package

# The call previously referenced `clf`, `X`, `y` and `features`, none of which
# are defined in the visible cells; it now uses the fitted tree, the training
# arrays and the feature names built in the cells above.
viz = dtreeviz(tree_classifier, X_train, y_train,
                target_name="target",
                fontname="Arial",
                title="Árvore de decisão ilustrada",
                title_fontsize=16,
                feature_names=list(feature_names),
                orientation='LR',
                scale=1.2,
                class_names=list(label_encoder.classes_))

viz

# Random Forest (Lucas)

In [None]:
from sklearn.model_selection import GridSearchCV

def search_rf(parameters, cv, X_train, y_train, SEED):
    """Grid-search a random forest and return its best parameters.

    A ``RandomForestClassifier`` seeded with ``SEED`` is tuned over
    ``parameters`` with ``GridSearchCV`` (accuracy scoring, all cores, folds
    given by ``cv``). The best parameter dict and the best cross-validated
    score are printed, in that order, and the parameter dict is returned.
    """
    floresta = RandomForestClassifier(random_state=SEED)
    busca = GridSearchCV(
        floresta,
        parameters,
        scoring="accuracy",
        n_jobs=-1,
        verbose=4,
        cv=cv,
    )
    busca.fit(X_train, y_train)

    melhores_parametros = busca.best_params_
    print(melhores_parametros)
    print(busca.best_score_)
    return melhores_parametros


## Tentativa 1 (18min)

In [None]:
# Attempt 1: coarse sweep over the number of trees (10 to 290 in steps of 20).
parameters = {
    "n_estimators": range(10, 301, 20),
    "criterion": ["gini", "entropy"],
    # "auto" was removed: for classifiers it is an alias of "sqrt", so it only
    # duplicated fits, and recent scikit-learn releases no longer accept it.
    "max_features": ["sqrt", "log2"],
}

rf_try1 = search_rf(parameters, cv, X_train, y_train, SEED)

## Tentativa 2 (47min)

In [None]:
# Attempt 2: finer sweep over the number of trees (300 to 450 in steps of 10).
parameters = {
    "n_estimators": range(300, 451, 10),
    "criterion": ["gini", "entropy"],
    # "auto" was removed: for classifiers it is an alias of "sqrt", so it only
    # duplicated fits, and recent scikit-learn releases no longer accept it.
    "max_features": ["sqrt", "log2"],
}

rf_try2 = search_rf(parameters, cv, X_train, y_train, SEED)

## Tentativa 3 (17min)

In [None]:
# Attempt 3: unit-step sweep over the number of trees (408 to 412).
parameters = {
    "n_estimators": range(408, 413, 1),
    "criterion": ["gini", "entropy"],
    # "auto" was removed: for classifiers it is an alias of "sqrt", so it only
    # duplicated fits, and recent scikit-learn releases no longer accept it.
    "max_features": ["sqrt", "log2"],
}

rf_try3 = search_rf(parameters, cv, X_train, y_train, SEED)

## Tentativa 4

In [None]:
# Attempt 4: three hand-picked tree counts combined with leaf/split size limits.
parameters = {
    "n_estimators": [190, 440, 412],
    "criterion": ["gini", "entropy"],
    # "auto" was removed: for classifiers it is an alias of "sqrt", so it only
    # duplicated fits, and recent scikit-learn releases no longer accept it.
    "max_features": ["sqrt", "log2"],
    'min_samples_leaf': [1, 4],
    'min_samples_split': [2, 10],
    # max_depth was left out of this search.
}

rf_try4 = search_rf(parameters, cv, X_train, y_train, SEED)

## Desempenho no teste

In [None]:
# Best parameters recorded from an earlier run of attempt 4:
#{'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 440}
# Refit a forest with the parameters returned by attempt 4 on the full training set.
best_rf = RandomForestClassifier(**rf_try4, random_state = SEED)
best_rf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
# Accuracy on the test set (the printed label means "Test score").
print("Score de teste:", accuracy_score(y_test, best_rf.predict(X_test)))

# Rede neural MLP (Mari)

In [None]:
cv = model_selection.StratifiedKFold(n_splits=10)

def gridsearch_mlp(X_train, y_train, parameters, metric, seed = SEED):
    """Run an exhaustive grid search over ``parameters`` for an MLP.

    The MLP is seeded with ``seed``, candidates are scored with ``metric``
    using the module-level ``cv`` splitter, and train scores are kept in the
    results. The summary is printed via ``print_result`` and the fitted
    search object is returned.
    """
    rede = MLPClassifier(random_state = seed)
    busca = GridSearchCV(
        rede,
        parameters,
        scoring=metric,
        n_jobs=-1,
        cv=cv,
        return_train_score=True,
        verbose=10,
    )
    ajustada = busca.fit(X_train, y_train)
    print_result(ajustada)
    return ajustada


def randomsearch_mlp(X_train, y_train, parameters, metric, seed = SEED):
    """Run a randomized search (100 sampled candidates) over ``parameters`` for an MLP.

    Both the MLP and the sampler are seeded with ``seed``. Candidates are
    scored with ``metric`` using the module-level ``cv`` splitter, and train
    scores are kept in the results. The summary is printed via
    ``print_result`` and the fitted search object is returned.
    """
    rede = MLPClassifier(random_state = seed)
    busca = RandomizedSearchCV(
        rede,
        parameters,
        n_iter=100,
        n_jobs=-1,
        scoring=metric,
        cv=cv,
        random_state=seed,
        return_train_score=True,
        verbose=10,
    )
    ajustada = busca.fit(X_train, y_train)
    print_result(ajustada)
    return ajustada


def print_result(result):
    """Print the best score and best hyper-parameters of a fitted search.

    ``result`` must expose ``best_score_`` and ``best_params_``.
    """
    linhas = (
        '=========Random Search Results for MLP==========',
        f'Best Score: {result.best_score_}',
        f'Best Hyperparameters: {result.best_params_}',
    )
    for linha in linhas:
        print(linha)


### Tentativa 1 (55 min)

In [None]:
# Search space for the MLP: layer layouts, activation functions and solvers.
space = {
    "hidden_layer_sizes": [(4,4), (20,15), (50,50), (100,50), (50,100), (100, 250), (4,10,4), (20,10,5), (250, 100, 50)],
    "activation": ["logistic", "tanh", "relu", "identity"],
    "solver": ["lbfgs", "sgd", "adam"],
}

# Randomized search scored by accuracy.
result1 = randomsearch_mlp(X_train, y_train, space, "accuracy")


In [None]:
# Cross-validation results of attempt 1, best-ranked configurations first.
results = pd.DataFrame(result1.cv_results_).sort_values(by='rank_test_score')
results.head(10)

### Tentativa 2

In [None]:
# Adds learning_rate to the space and removes some activation/solver options.
space = {
    "hidden_layer_sizes": [(4,4), (20,15), (50,50), (100,50), (50,100), (100, 250), (4,10,4), (20,10,5), (250, 100, 50)],
    "activation": ["logistic", "tanh", "relu"],
    "solver": ["lbfgs", "adam"],
    "learning_rate": ["constant", "invscaling", "adaptive"],
}

# Exhaustive grid search scored by accuracy.
result2 = gridsearch_mlp(X_train, y_train, space, "accuracy")


In [None]:
# Cross-validation results of attempt 2, best-ranked configurations first.
results = pd.DataFrame(result2.cv_results_).sort_values(by='rank_test_score')
results.head(10)

### Tentativa 3

In [None]:
# Varies the layer layouts and narrows activation/solver to a single option each.
space = {
    "hidden_layer_sizes": [(50,30,20,10), (20,15), (20,20,20,20), (100, 250), (100, 250, 300,450), (250, 100, 50)],
    "activation": ["tanh"],
    "solver": ["lbfgs"],
    "learning_rate": ["constant", "invscaling", "adaptive"],
}

# Exhaustive grid search scored by accuracy.
result3 = gridsearch_mlp(X_train, y_train, space, "accuracy")

### Tentativa 4

In [None]:
# Varies the layer layouts again, this time with four- and five-layer networks.
space = {
    "hidden_layer_sizes": [(5,5,5,5), (10,10,10,10), (50,50,50,50), (10,10,10,10,10)],
    "activation": ["logistic", "tanh"],
    "solver": ["lbfgs"],
    "learning_rate": ["constant", "invscaling", "adaptive"],
}

# Exhaustive grid search scored by accuracy.
result4 = gridsearch_mlp(X_train, y_train, space, "accuracy")

# Comitê de Redes Neurais

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Baseline: a single MLP with default hyper-parameters (only the seed is set),
# trained on the training set and evaluated on the test set.
mlp = MLPClassifier(random_state = SEED)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
# Per-class precision/recall/F1 followed by the confusion matrix.
print(classification_report(y_test, y_pred))
print("\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
def val_bgc(X_test, y_test, X_train, y_train, SEED, num_e, hidden_l, max_iterations, max_samples):
    """Grid-search a bagging ensemble of MLPs.

    Parameters
    ----------
    X_test, y_test
        Unused; kept so existing calls keep working.
    X_train, y_train
        Training data the grid search is fitted on.
    SEED : int
        Seed given to both the base MLP and the bagging ensemble.
    num_e : list
        Candidate values for the ensemble's ``n_estimators``.
    hidden_l : tuple
        ``hidden_layer_sizes`` of every MLP in the ensemble.
    max_iterations : int
        ``max_iter`` of every MLP in the ensemble.
    max_samples : list
        Candidate values for the ensemble's ``max_samples``.

    Returns
    -------
    tuple
        ``(best_params, gs_bgc_mlp)``: the best parameter dict and the fitted
        ``GridSearchCV`` object.
    """
    parameters = {
        "n_estimators": num_e,
        "max_samples": max_samples
    }

    mlp_base = MLPClassifier(hidden_layer_sizes = hidden_l,
                             max_iter = max_iterations,
                             random_state = SEED)

    # The ensemble is seeded too, so repeated runs give the same result.
    # No `cv` is passed, so GridSearchCV uses its default splitting.
    gs_bgc_mlp = GridSearchCV(BaggingClassifier(mlp_base, random_state = SEED),
                              parameters,
                              scoring = "accuracy",
                              n_jobs= -1
                             )

    # Fit once; the second, identical fit only doubled the run time.
    gs_bgc_mlp.fit(X_train, y_train)

    best_params = gs_bgc_mlp.best_params_

    return best_params, gs_bgc_mlp

In [None]:
# Configuration of the MLP committee search.
hidden_l = (10, 10)        # hidden_layer_sizes of each MLP in the ensemble
max_iterations = 200       # max_iter of each MLP
num_e = [10, 20]           # candidate n_estimators values for the ensemble
max_samples = [1000, 50]   # candidate max_samples values for the ensemble

best_params, gs_bgc_mlp = val_bgc(X_test, y_test, X_train, y_train, SEED, num_e, hidden_l, max_iterations, max_samples)

In [None]:
# Best configuration, then mean accuracy (+/- two standard deviations) of
# every configuration that was evaluated.
print(best_params)
resultados_cv = gs_bgc_mlp.cv_results_
means = resultados_cv["mean_test_score"]
stds = resultados_cv["std_test_score"]

for posicao, params in enumerate(resultados_cv["params"]):
    mean = means[posicao]
    std = stds[posicao]
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
y_pred = gs_bgc_mlp.predict(X_test)

print(classification_report(y_test, y_pred))

# Comitê Heterogêneo