# Avaliação e validação de experimentos


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pathlib import Path
import json

In [None]:
# Random seed, so that the experiments are reproducible.

SEED = 17

In [None]:
# Sanity check: evaluates to True when the raw-data directory is reachable
# from the current working directory.
Path("../data/raw/").exists()

In [None]:
# Load the raw dataset; the literal string "Unknown" is parsed as a missing
# value (NaN) at read time.
df_raw = pd.read_csv("../data/raw/bots_vs_users.csv", na_values=["Unknown"])


# Print the column dtypes and non-null counts.
df_raw.info()


In [None]:
# Show the frame transposed: one row per column of the dataset.
df_raw.T

In [None]:
# Heatmap of the missing values: one image row per record, one image column
# per feature.

from matplotlib.colors import ListedColormap

# Treat the literal "Unknown" as missing as well, so the map shows
# "Unknown" and NaN together.
df_map = df_raw.replace("Unknown", np.nan)

# Boolean frame: True wherever a value is missing.
missing_mask = df_map.isna()

# Two-colour map: wheat for present values, black for missing ones
# (assuming the mask contains both True and False).
colors = ListedColormap(['#f5deb3', 'black'])

plt.figure(figsize=(16, 8))
plt.imshow(missing_mask, aspect='auto', cmap=colors, interpolation='none')

# plt.title("Total Missing Values Map (unknown + NaN)")

# X axis: one tick per column, labelled with the column name.
plt.xticks(ticks=np.arange(df_map.shape[1]), labels=df_map.columns, rotation=90)

# Y axis: 10 evenly spaced ticks, fewer than one per row for readability.
# The last valid row index is shape[0] - 1; a tick at shape[0] lies outside
# the image and makes matplotlib extend the axis past the data.
last_row = max(df_map.shape[0] - 1, 0)
plt.yticks(ticks=np.linspace(0, last_row, 10).astype(int))

plt.tight_layout()
plt.show()


In [None]:

# df_interm = df_raw[df_raw["has_domain"] != "Unknown"]

# Cleaning steps, applied in order:
#   1. drop "city": too laborious to clean and of little relevance to the result;
#   2. turn any remaining "Unknown" into NaN (the SimpleImputer deals with it later);
#   3. drop the columns that, per the original author's note, hold a single
#      repeated value.
df_interm = (
    df_raw
    .drop(columns=["city"])
    .replace("Unknown", np.nan)
    .drop(columns=["has_domain", "has_short_name", "has_first_name", "has_last_name"])
)


# FIGURE 2: comment out the code below
# FIGURE 3: keep the code below

# # Columns with many (> 50%) "Unknown" or NaN values
# mask_un = df_raw.apply(lambda col: (col == "Unknown").mean() > 0.5)
# colunas_com_muito_un = df_raw.columns[mask_un]

# mask_na = df_raw.isna().mean() > 0.5
# colunas_com_muito_nan = df_raw.columns[mask_na]

# colunas_indesejadas = list(set(colunas_com_muito_nan).union(colunas_com_muito_un))

# df_interm = df_interm.drop(columns=colunas_indesejadas)



In [None]:
# Draw a smaller, stratified sample of the data: 1000 rows that keep the
# class proportions of "target". The other part of the split is discarded.
df = train_test_split(
    df_interm,
    train_size=1000,
    stratify=df_interm["target"],
    random_state=SEED,
)[0]

# NOTE(review): this path is relative to the working directory, unlike the
# "../data/..." paths used for reading -- confirm that this is intended.
df.to_csv("data/dados_modelos.csv", index=False)

# Two targets from the same column: the raw value and a boolean "target > 0".
yreg = df["target"]
ycla = yreg > 0

# Feature matrix: everything except the target column.
X = df.drop(columns=["target"])

print(X.shape)


# # Draw a smaller sample of the data WITHOUT stratification

# df_1 = df_interm[df_interm['target'] == 1]
# df_0 = df_interm[df_interm['target'] == 0]

# n_total = 1000
# n_1 = int(0.10 * n_total)
# n_0 = n_total - n_1

# df_sample = pd.concat([
#     df_1.sample(n=n_1, random_state=SEED),
#     df_0.sample(n=n_0, random_state=SEED)
# ])

# df = df_sample.sample(frac=1, random_state=SEED).reset_index(drop=True)

# X = df.drop(columns=['target'])
# yreg = df.target
# ycla = yreg > 0



## Classificação

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer

In [None]:
# Metric used by every grid search below.
scorer = make_scorer(accuracy_score)  # try other metrics

# Two stratified, shuffled splitters with the same seed: `cv` (15 folds) and
# `gscv` (3 folds, used inside each grid search).
cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=SEED)
gscv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)


def _tuned(steps, param_grid):
    """Return a grid search over a pipeline made of a mean imputer plus `steps`.

    Every call builds a fresh SimpleImputer, so no estimator instance is
    shared between the candidate algorithms.
    """
    pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')), *steps])
    return GridSearchCV(pipeline, param_grid=param_grid, scoring=scorer, cv=gscv)


# Candidate classifiers, keyed by a short name.
algorithms = {
    # k-NN on min-max-scaled features, after a variance-threshold filter.
    'knn': _tuned(
        [
            ('scaler', MinMaxScaler(feature_range=(0, 1))),
            ('selector', VarianceThreshold()),
            ('knn', KNeighborsClassifier()),
        ],
        {
            'selector__threshold': [0, 0.01, 0.02, 0.03],
            'knn__n_neighbors': [1, 3, 5],
            'knn__p': [1, 2],
        },
    ),

    # Decision tree with a tuned maximum depth.
    'tree': _tuned(
        [
            ('tree', DecisionTreeClassifier(random_state=SEED)),
        ],
        {
            'tree__max_depth': [5, 10, 20],
            'tree__criterion': ['entropy', 'gini'],
        },
    ),

    # Decision tree without a depth limit; only the split criterion is tuned.
    'bigtree': _tuned(
        [
            ('tree', DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=SEED)),
        ],
        {
            'tree__criterion': ['entropy', 'gini'],
        },
    ),

    # Gaussian naive Bayes on the k best features.
    'nb': _tuned(
        [
            ('selector', SelectKBest()),
            ('nb', GaussianNB()),
        ],
        {
            'selector__k': [3, 5, 10],
        },
    ),

    # Linear-kernel SVM on standardised, PCA-reduced features.
    'svmlinear': _tuned(
        [
            ('scaler', StandardScaler()),
            ('pca', PCA(random_state=SEED)),
            ('svm', SVC(kernel='linear', random_state=SEED)),
        ],
        {
            'pca__n_components': [2, 5, 10],
            'svm__C': [1.0, 2.0],
        },
    ),

    # RBF-kernel SVM on standardised, PCA-reduced features.
    'svmrbf': _tuned(
        [
            ('scaler', StandardScaler()),
            ('pca', PCA(random_state=SEED)),
            ('svm', SVC(kernel='rbf', random_state=SEED)),
        ],
        {
            'pca__n_components': [2, 5, 10],
            'svm__C': [1.0, 2.0],
            'svm__gamma': [0.1, 1.0, 2.0],
        },
    ),
}

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate every grid search on the `cv` folds; the resulting frame has
# one column of fold scores per algorithm.
result = pd.DataFrame({
    alg: cross_val_score(clf, X, ycla, cv=cv)
    for alg, clf in algorithms.items()
})

In [None]:
# Directory that holds the saved experiment results; the second line
# evaluates to True when it is reachable from the working directory.
results_path = Path("../results")
results_path.exists()

In [None]:
# Label: boxplot_cross_val_complete -- presumably the box plot of the
# cross-validation scores of the "complete" experiments.
# This cell used to hold only that bare name, which is not defined anywhere
# above and therefore raised NameError when the cell was executed.

In [None]:
# Load the saved cross-validation scores of every "complete" experiment.
# Files are visited in sorted order so the key order of `results` does not
# depend on the filesystem, and are decoded as UTF-8 (the JSON standard
# encoding) rather than the platform's locale encoding.
# NOTE(review): `results` is reassigned by the subset-loading cell further
# down, so these values are only available until that cell runs.
file_list_complete = sorted(results_path.glob("complete/*cross_val_scores.json"))
results = {}
for file in file_list_complete:
    with open(file, "r", encoding="utf-8") as f:
        results_tmp = json.load(f)
    # Experiment name: the part of the file stem after "_experiment_" and
    # before "_cross_val_scores".
    filename = file.stem.split("_cross_val_scores")[0].split("_experiment_")[-1]
    # Only the first value of the JSON object is kept -- presumably each file
    # holds a single entry with the list of scores.
    results[filename] = list(results_tmp.values())[0]

In [None]:
# Show which experiment names were loaded.
results.keys()

In [None]:
# Load the saved cross-validation scores of every "subset" experiment,
# replacing whatever `results` held before.
# Files are visited in sorted order so the key order of `results` does not
# depend on the filesystem, and are decoded as UTF-8 (the JSON standard
# encoding) rather than the platform's locale encoding.
file_list_subset = sorted(results_path.glob("subset/*cross_val_scores_subset.json"))
results = {}
for file in file_list_subset:
    with open(file, "r", encoding="utf-8") as f:
        results_tmp = json.load(f)
    # Experiment name: the part of the file stem after "_experiment_" and
    # before "_cross_val_scores".
    filename = file.stem.split("_cross_val_scores")[0].split("_experiment_")[-1]
    # Only the first value of the JSON object is kept -- presumably each file
    # holds a single entry with the list of scores.
    results[filename] = list(results_tmp.values())[0]

In [None]:
# One column per loaded experiment, one row per score.
results_df = pd.DataFrame(results)

In [None]:
# Order in which the algorithm columns are displayed.
columns = [
    'svm_linear',
    'svm_rbf',
    'knn',
    'gaussian_nb',
    'tree',
]

In [None]:
# Reorder (and restrict) the columns to the names listed in `columns`;
# raises KeyError if any of them is missing from the frame.
results_df = results_df[columns]

In [None]:
# Per-fold scores computed in this notebook, one column per algorithm.
print(result)

In [None]:
# Summarise each algorithm as "mean ± std" with two decimals
# (pandas `std`, i.e. the sample standard deviation, ddof=1).
result.apply(lambda x: "{:.2f} ± {:.2f}".format(x.mean(), x.std()))

In [None]:
# Same "mean ± std" summary for the scores loaded from disk
# (pandas `std`, i.e. the sample standard deviation, ddof=1).
results_df.apply(lambda x: "{:.2f} ± {:.2f}".format(x.mean(), x.std()))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One box per column of `results_df` (the scores loaded from disk).
labels = results_df.columns
data = [results_df.iloc[:, i] for i in range(results_df.shape[1])]
positions = np.arange(1, len(data) + 1)

plt.boxplot(data)
plt.xticks(positions, labels)

# Write the median and the standard deviation under each box.
# NOTE(review): np.std uses ddof=0, whereas the "mean ± std" table uses the
# pandas default (ddof=1), so the two figures can differ slightly.
for slot, scores in enumerate(data, start=1):
    median = np.median(scores)
    std = np.std(scores)
    plt.text(slot, 0.77, f'Med: {median:.2f}', ha='center', fontsize=8)
    plt.text(slot, 0.76, f'Std: {std:.3f}', ha='center', fontsize=8)

# Dotted reference line at the maximum score; the y axis is zoomed to 0.75-1.01.
plt.axhline(1, linestyle="dotted", color="black")
plt.ylim(0.75, 1.01)

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One box per column of `result` (the scores computed in this notebook).
labels = result.columns
data = [result.iloc[:, i] for i in range(result.shape[1])]
positions = np.arange(1, len(data) + 1)

plt.boxplot(data)
plt.xticks(positions, labels)

# Write the median and the standard deviation under each box.
# NOTE(review): np.std uses ddof=0, whereas the "mean ± std" table uses the
# pandas default (ddof=1), so the two figures can differ slightly.
for slot, scores in enumerate(data, start=1):
    median = np.median(scores)
    std = np.std(scores)
    plt.text(slot, 0.77, f'Med: {median:.2f}', ha='center', fontsize=8)
    plt.text(slot, 0.76, f'Std: {std:.3f}', ha='center', fontsize=8)

# Dotted reference line at the maximum score; the y axis is zoomed to 0.75-1.01.
plt.axhline(1, linestyle="dotted", color="black")
plt.ylim(0.75, 1.01)

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Same box plot of `result`, drawn on the full 0-1.05 scale.
per_algorithm = [result.iloc[:, i] for i in range(result.shape[1])]
tick_positions = np.arange(1, result.shape[1] + 1)

plt.boxplot(per_algorithm)
plt.xticks(tick_positions, result.columns)

plt.axhline(1, linestyle="dotted", color="green")
plt.ylim(0, 1.05)  # full scale: do not mislead colleagues with a truncated axis

plt.show()

## Comparar desempenho (validar hipótese)

In [None]:
from scipy.stats import wilcoxon

In [None]:
# Compare two algorithms with the Wilcoxon signed-rank test on their
# per-fold scores.
# The columns of `result` are named after the keys of `algorithms`, so the
# k-NN scores live under "knn" (lowercase); `result.kNN` raised
# AttributeError.
wilcoxon(result["knn"], result["tree"])

In [None]:
# Comparar todos contra todos: Friedman + Nemenyi (further reading), construir um diagrama de diferença crítica
# Orange3

## Melhor modelo (deploying)

In [None]:
# Run the grid search of the unrestricted decision tree on the whole dataset
# (no hold-out is kept at this stage) and show the winning pipeline.
classifier = algorithms['bigtree'].fit(X, ycla)
print(classifier.best_estimator_)

In [None]:
# Take the first row of X as a one-row DataFrame.
x = X.iloc[0:1, :]
print(x) # Pretend this is a new, unseen sample

In [None]:
# Predict the class of the sample with the fitted grid search.
classifier.predict(x)

### Um outro modelo

In [None]:
# Run the grid search of the RBF-kernel SVM on the whole dataset and show
# the winning pipeline.
classifier = algorithms['svmrbf'].fit(X, ycla)
print(classifier.best_estimator_)

In [None]:
print(x)
# Work on an explicit copy: `x` was obtained by slicing `X`, and assigning
# into such a slice can write through to `X` or trigger
# SettingWithCopyWarning, depending on the pandas version.
x = x.copy()
# Blank out one feature (column position 13); the pipeline starts with an
# imputer, so the model can still predict with the value missing.
x.iloc[0, 13] = np.nan
print(x)
classifier.predict(x)

## Regressão

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import make_scorer

In [None]:
# Regression scorer based on mean squared error; greater_is_better=False
# marks it as a loss, so scikit-learn negates the value when scoring.
scorer = make_scorer(mean_squared_error, greater_is_better=False)
# Start a new, empty registry of algorithms -- presumably filled in by the
# regression cells that follow.
algorithms = {}