## Bruno Andrade Schiavone
## Barbara Eduarda Rocha
## Pedro Bernardi Alves

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics  as sts
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

## 1) Resumo geral da base de dados

In [None]:
# Load the dataset from 'adult.csv' into a DataFrame and display it.
df=pd.read_csv('adult.csv')
df

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
# Number of rows per value of 'income' (the column later used as target y).
df['income'].value_counts()

In [None]:
# (number of rows, number of columns).
df.shape

## 2) Pré-processamento

In [None]:
# List the column names before preprocessing.
df.columns

In [None]:
# The file marks unknown values with the literal string '?'; turn every such
# cell into NaN so pandas' missing-value tools can see it.
is_placeholder = df.eq('?')
df[is_placeholder] = np.nan

In [None]:
# Build a per-column report of missing values: absolute count and percentage
# of rows, both sorted from most to least affected.
null_counts = df.isnull().sum()
row_counts = df.isnull().count()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / row_counts * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

In [None]:
# Impute the missing categorical values with each column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df[col] selection: that chained in-place
# form is deprecated in pandas 2.x and, with Copy-on-Write enabled, modifies a
# temporary object and leaves df unchanged.
for col in ['workclass', 'occupation', 'native.country']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Count the remaining missing values per column.
df.isnull().sum()

### Encoding Categorical Features

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Inspect the 'income' values before they are label-encoded.
df['income'].value_counts()

In [None]:
# Inspect the 'sex' values before they are label-encoded.
df['sex'].value_counts()

In [None]:
# Replace the string categories of income, sex and education by integer codes.
# apply() calls fit_transform once per column, so the single encoder object is
# refitted each time and ends up holding only the classes of the last column.
labelencoder = LabelEncoder()
label_encoded_columns = ["income", "sex", "education"]
df[label_encoded_columns] = df[label_encoded_columns].apply(labelencoder.fit_transform)

In [None]:
# Re-check the column dtypes after label-encoding income, sex and education.
df.info()

In [None]:
# Remaining categorical columns are expanded into one indicator column per
# category (one-hot encoding); the original columns are dropped by get_dummies.
categorical = [
    'workclass',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'native.country',
]
df = pd.get_dummies(data=df, columns=categorical)

In [None]:
# Display the frame after one-hot encoding.
df

### Normalização dos dados

In [None]:
# Snapshot of the encoded frame taken before the numeric columns are
# standardized; the matplotlib cluster plot further down reads age/education
# from this copy.
df_copy = df.copy()

In [None]:
# Standardize the numeric columns (and the label-encoded 'education') to zero
# mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test or cross-validation split, so test rows influence the scaling
# statistics — consider fitting it on the training portion only.
std = StandardScaler()
columns = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
    'education',
]
scaled_values = std.fit_transform(df[columns])
df[columns] = scaled_values

In [None]:
# Summary statistics again, to check the effect of the standardization.
df.describe()

## 3) Algoritmos de Classificação
* Árvore de Decisão
* KNN
* Redes Neurais
* K-Means (algoritmo de agrupamento, não supervisionado)

In [None]:
# Registry of the classifiers to compare, as [display name, estimator] pairs.
models = [
    ['Decision Tree', DecisionTreeClassifier()],
    ['KNN', KNeighborsClassifier(n_neighbors=5)],
    [
        'MLP',
        MLPClassifier(
            hidden_layer_sizes=(10, 5),
            activation='relu',
            solver='sgd',
            max_iter=800,
            tol=0.0001,
            random_state=3,
        ),
    ],
]

In [None]:
# Feature matrix: every column except the target. Target vector: 'income'.
feature_frame = df.drop(columns=['income'])
X = feature_frame.values
y = df['income'].values

In [None]:
# Hold out 30% of the rows for testing. stratify=y keeps the income class
# proportions the same in both partitions, and random_state makes the split
# repeatable between notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1
)

In [None]:
# Fit every registered model on the training split and report its accuracy on
# both splits plus the test confusion matrix. One [name, train, test] row per
# model is collected in lst_1.
lst_1 = []

for label, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    cm = confusion_matrix(y_test, y_pred)

    print(label, ':')
    print('Training Score', train_accuracy)
    print('Testing Score \n', test_accuracy)
    print(cm)
    print('-----------------------------------')

    lst_1.append([label, train_accuracy, test_accuracy])

In [None]:
# Tabulate the hold-out results collected in lst_1.
holdout_columns = ['Model', 'Training Score', 'Testing Score']
df_compare = pd.DataFrame(data=lst_1, columns=holdout_columns)
df_compare

### K-Means

**Soma dos erros quadráticos intra-cluster (WCSS) — método do cotovelo**

In [None]:
# Clustering is unsupervised: work on a copy of the data without the target.
df_K = df.drop(columns='income')

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means on (age, education) for k = 1..14 and record the
# within-cluster sum of squares of each fit.
wcss = []
for i in range(1, 15):
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        random_state=5,
        max_iter=300,
    )
    kmeans.fit(df_K[['age', 'education']])
    # inertia_ is the WCSS of the fitted model
    wcss.append(kmeans.inertia_)

In [None]:
# Display the WCSS values collected for k = 1..14.
wcss

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot WCSS against the number of clusters; the "elbow" of the curve suggests
# a reasonable k.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(1, 15), wcss, marker='o')
ax.set_title('The Elbow Method')
ax.set_xlabel('Número de clusters')
ax.set_ylabel('WCSS')
plt.show()

**Agrupamento**

In [None]:
# Final clustering with k = 4 on the age and education columns of df_K.
# fit() returns the estimator itself, so kmeans1 and kmeans are the same object.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    random_state=5,
    max_iter=300,
)
kmeans1 = kmeans.fit(df_K[['age', 'education']])

In [None]:
import plotly.express as px

# Cluster centres and the cluster label assigned to every row.
centroids, classification = kmeans1.cluster_centers_, kmeans1.labels_

# Interactive scatter of age vs. education coloured by cluster label.
graph = px.scatter(
    x=df_K['age'],
    y=df_K['education'],
    color=classification,
)
graph.show()

In [None]:
# age x education scatter with the K-Means centroids overlaid.
import plotly.graph_objects as go

graf1 = px.scatter(x=df_K['age'], y=df_K['education'], color=classification)
# One marker size per centroid, so the overlay still works if the number of
# clusters is changed (px.scatter needs `size` to match the length of x).
graf2 = px.scatter(x=centroids[:, 0], y=centroids[:, 1], size=[10] * len(centroids))
# Merge the traces of both figures into a single figure.
graf3 = go.Figure(data=graf1.data + graf2.data)
graf3.update_layout(width=800, height=500, title_text='Agrupamento K-Means')
graf3.update_xaxes(title='x')
graf3.update_yaxes(title='y')
graf3.show()

In [None]:
# Adiciona a coluna cluster
df_copy['cluster'] = kmeans1.labels_
df_copy.head()

In [None]:
colors = ['r', 'g', 'b', 'c']

# Scatter plot of age vs. education with one colour per cluster. The orange
# marker of each cluster is the mean of that cluster's rows in df_copy (it is
# computed here, not read from the fitted model's cluster_centers_).
plt.figure(figsize=(8, 6))
for i, color in enumerate(colors):
    cluster_data = df_copy[df_copy['cluster'] == i]
    plt.scatter(cluster_data['age'], cluster_data['education'], c=color, label=f'Cluster {i+1}')
    # Label only the last centroid marker so the legend gets a single
    # 'Cluster centroid' entry, listed after the cluster entries.
    centroid_label = 'Cluster centroid' if i == len(colors) - 1 else None
    plt.scatter(
        cluster_data['age'].mean(),
        cluster_data['education'].mean(),
        s=200,
        c='orange',
        marker='o',
        label=centroid_label,
    )

plt.xlabel('age')
plt.ylabel('education')
plt.title('Scatter Plot with Clusters')
plt.legend()
plt.grid(True)
plt.show()

## 4) Validação cruzada

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 10-fold stratified cross-validation of every registered model. For each
# model the per-fold accuracies are printed together with their max, min,
# mean (all in %) and standard deviation, and one summary row is stored in
# lst_1.
lst_1 = []

for label, model in models:
    # Shuffled, seeded folds that keep the class ratio of y in every fold.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    lst_accu_stratified = []

    for train_index, test_index in skf.split(X, y):
        model.fit(X[train_index], y[train_index])
        fold_accuracy = model.score(X[test_index], y[test_index])
        lst_accu_stratified.append(fold_accuracy)

    # Summary statistics of the per-fold accuracies.
    highest_pct = max(lst_accu_stratified) * 100
    lowest_pct = min(lst_accu_stratified) * 100
    mean_pct = sts.mean(lst_accu_stratified) * 100
    std_dev = sts.stdev(lst_accu_stratified)

    print(label, ':')
    print('\nLista de ACC:', lst_accu_stratified)
    print('\nMaior ACC:', highest_pct, '%')
    print('\nMenor ACC:', lowest_pct, '%')
    print('\nMédia ACC:', mean_pct, '%')
    print('\nDesvio Padrão:', std_dev)
    print('-----------------------------------')

    lst_1.append([label, highest_pct, lowest_pct, mean_pct, std_dev])


In [None]:
# Tabulate the cross-validation summary rows collected in lst_1.
cv_columns = ['Model', 'Maior ACC', 'Menor ACC', 'Média ACC', 'Desvio Padrão']
df_compare = pd.DataFrame(data=lst_1, columns=cv_columns)
df_compare

## 5) Balanceamento das classes
- https://medium.com/analytics-vidhya/undersampling-and-oversampling-an-old-and-a-new-approach-4f984a0e8392
* Abordagem SMOTE

In [None]:
# Rows per (label-encoded) income class, to inspect the class balance before
# oversampling.
df['income'].value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# 10-fold stratified cross-validation with SMOTE oversampling. SMOTE is fitted
# on the training part of each fold only; the validation part keeps its
# original class distribution, so the accuracies are not inflated by
# synthetic samples.
lst_1 = []

for model_name, model in models:
    # Shuffled, seeded folds that keep the class ratio of y in every fold.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    lst_accu_stratified = []

    for train_index, test_index in skf.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        # Seeded so the synthetic samples (and therefore the reported
        # accuracies) are reproducible, like the seeded fold split above.
        sm = SMOTE(random_state=1)
        x_train_oversampled, y_train_oversampled = sm.fit_resample(x_train_fold, y_train_fold)
        model.fit(x_train_oversampled, y_train_oversampled)
        lst_accu_stratified.append(model.score(x_test_fold, y_test_fold))

    # Compute each summary statistic once and reuse it for printing and storing.
    highest_pct = max(lst_accu_stratified) * 100
    lowest_pct = min(lst_accu_stratified) * 100
    mean_pct = sts.mean(lst_accu_stratified) * 100
    std_dev = sts.stdev(lst_accu_stratified)

    print(model_name, ':')
    print('\nLista de ACC:', lst_accu_stratified)
    print('\nMaior ACC:', highest_pct, '%')
    print('\nMenor ACC:', lowest_pct, '%')
    print('\nMédia ACC:', mean_pct, '%')
    print('\nDesvio Padrão:', std_dev)
    print('-----------------------------------')

    lst_1.append([model_name, highest_pct, lowest_pct, mean_pct, std_dev])

In [None]:
# Tabulate the SMOTE cross-validation summary rows collected in lst_1.
smote_cv_columns = ['Model', 'Maior ACC', 'Menor ACC', 'Média ACC', 'Desvio Padrão']
df_compare = pd.DataFrame(data=lst_1, columns=smote_cv_columns)
df_compare

## 6) Técnicas de ajuste de hiperparâmetros
- https://scikit-learn.org/stable/modules/grid_search.html#
- Abordagens disponíveis no scikit-learn:
    - GridSearchCV: considera exaustivamente todas as combinações de parâmetros;
    - RandomizedSearchCV: pesquisa aleatória de parâmetros, em que cada configuração é amostrada a partir de uma distribuição de possíveis valores de parâmetro.

In [None]:
# Leave the MLP out of the hyper-parameter search. Filtering by name instead
# of models.pop(2) keeps this cell idempotent: re-running it can no longer
# remove a second model from the list.
models = [entry for entry in models if entry[0] != 'MLP']

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def grid_search(estimator, param_grid, cv, return_train_score):
    """Build an unfitted exhaustive grid search over ``param_grid``.

    Args:
        estimator: The estimator to tune.
        param_grid: Mapping of parameter names to the values to try.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        return_train_score: Forwarded to ``GridSearchCV``.

    Returns:
        A ``GridSearchCV`` configured with ``refit=True`` and ``n_jobs=-1``.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        refit=True,
        cv=cv,
        return_train_score=return_train_score,
        n_jobs=-1,
    )
    return search

In [None]:
def randomized_search(estimator, param_grid, cv, return_train_score, n_iter=10, random_state=None):
    """Build an unfitted randomized search over ``param_grid``.

    Args:
        estimator: The estimator to tune.
        param_grid: Mapping of parameter names to candidate values or
            distributions; passed to ``RandomizedSearchCV`` as
            ``param_distributions``.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        return_train_score: Forwarded to ``RandomizedSearchCV``.
        n_iter: Number of parameter settings to sample. Defaults to 10, the
            value that was previously hard-coded.
        random_state: Seed for the parameter sampling. The default ``None``
            keeps the previous unseeded behaviour; pass an int to make the
            sampled candidates reproducible.

    Returns:
        A ``RandomizedSearchCV`` configured with ``refit=True`` and
        ``n_jobs=-1``.
    """
    return RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid,
        refit=True,
        n_iter=n_iter,
        cv=cv,
        return_train_score=return_train_score,
        n_jobs=-1,
        random_state=random_state,
    )

In [None]:
# Search strategies to run for every model, as [display name, factory] pairs.
estimators = [
    ['GridSearchCV', grid_search],
    ['RandomizedSearchCV', randomized_search],
]

In [None]:
# Hyper-parameter candidates, keyed by the model's display name in `models`.
# An entry is only consulted when a model with that name is present in `models`.
models_param_grid = {
    'Decision Tree': {
        'criterion': ['gini', 'entropy', 'log_loss'],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'metric': ['euclidean', 'manhattan', 'cosine', 'minkowski'],
    },
    'MLP': {
        'hidden_layer_sizes': [(10, 30, 10), (20,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant', 'adaptive'],
    },
}

In [None]:
# Hyper-parameter tuning of every remaining model with each search strategy.
#
# SMOTE is placed inside an imblearn Pipeline instead of being applied to the
# whole training set up front. The search then refits the sampler on the
# training part of every CV fold only, so the validation folds contain no
# synthetic samples and best_score_ is not optimistically biased. imblearn
# pipelines apply samplers during fit only, so scoring on X_test uses the
# untouched test data.
from imblearn.pipeline import Pipeline as ImbPipeline

lst_1 = []

for model_name, base_model in models:
    print(model_name, ':')
    # Parameters of a pipeline step are addressed as '<step>__<param>'.
    step_grid = {
        f'model__{param}': values
        for param, values in models_param_grid[model_name].items()
    }
    for search_name, build_search in estimators:
        # Every search starts from the untouched base model; the search clones
        # the pipeline internally, so base_model itself is never fitted here.
        pipeline = ImbPipeline([('smote', SMOTE()), ('model', base_model)])
        e_search = build_search(pipeline, step_grid, 10, False)
        e_search.fit(X_train, y_train)
        # Report the winning parameters under their original, unprefixed names.
        best_params = {
            key.split('__', 1)[1]: value
            for key, value in e_search.best_params_.items()
        }
        print('-------')
        print(search_name, ':')
        print(best_params)
        print(e_search.best_score_)
        print(e_search.best_index_)
        print(e_search.cv_results_.keys())
        g_results = pd.DataFrame(e_search.cv_results_)
        # Mean accuracy over the 10 validation folds for the best candidate
        # (the same quantity as best_score_).
        mean_test_score = g_results.loc[e_search.best_index_, 'mean_test_score']
        print(mean_test_score)
        # Evaluate the best pipeline on the held-out test set. best_estimator_
        # is only available because the searches are built with refit=True.
        best_model = e_search.best_estimator_
        score = best_model.score(X_test, y_test)
        print(score)
        print('-------')
        lst_1.append([
            f'{model_name}({search_name})',
            best_params,
            e_search.best_score_,
            e_search.best_index_,
            mean_test_score,
            score,
        ])
    print('-----------------------------------')

In [None]:
# Tabulate the hyper-parameter search results collected in lst_1.
search_columns = [
    'Model',
    'best_params_',
    'best_score_',
    'best_index_',
    'mean_test_score',
    'score',
]
df_compare = pd.DataFrame(data=lst_1, columns=search_columns)
df_compare