# Avaliação e validação de experimentos


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Random seed used so that the experiments are reproducible.

SEED = 17

In [None]:
# Load the raw dataset from a CSV file.
df_raw = pd.read_csv('data/stars_complete.csv')

# Report its dimensions as (rows, columns).
print(df_raw.shape)

In [None]:
# Show the first 9 rows, transposed so that each data column becomes one output row.
df_raw.head(9).T

In [None]:
# Print the per-column dtype / non-null overview.
df_raw.info()
# Summary statistics from describe(), transposed to one row per described column.
df_raw.describe().T


In [None]:
# COLUMNS THAT NEED ATTENTION
# subints: remove outliers
# subbands: remove outliers
# profile: normalize

def remove_outliers(
    df_raw: pd.DataFrame,
    col_names: list[str],
    min: float = 0.25,
    max: float = 0.75,
    *,
    factor: float = 1.5,
) -> pd.DataFrame:
    """Drop rows holding an outlier in any of the given columns (IQR fence rule).

    For each column in ``col_names`` the quantiles at ``min`` and ``max`` are
    taken as q1 and q3, and a row is kept only when its value lies inside
    ``[q1 - factor * iqr, q3 + factor * iqr]`` (bounds inclusive), where
    ``iqr = q3 - q1``. A row must pass the check for every listed column.

    Args:
        df_raw: Input frame; it is not modified.
        col_names: Columns to test. An empty list keeps every row.
        min: Lower quantile used as q1. The name shadows the builtin but is
            kept because callers may pass it by keyword.
        max: Upper quantile used as q3. Same remark as for ``min``.
        factor: Multiple of the IQR added beyond q1/q3 to place the fences.

    Returns:
        A new DataFrame with the surviving rows, keeping their original index
        labels. Rows with a missing value in a tested column are dropped,
        because comparisons against NaN evaluate to False.

    Raises:
        KeyError: If a name in ``col_names`` is not a column of ``df_raw``.
    """
    keep = pd.Series(True, index=df_raw.index)

    for col_name in col_names:
        values = df_raw[col_name]
        q1 = values.quantile(min)
        q3 = values.quantile(max)
        iqr = q3 - q1

        lower_fence = q1 - factor * iqr
        upper_fence = q3 + factor * iqr

        keep &= (values >= lower_fence) & (values <= upper_fence)

    # Return an explicit copy: assigning columns on a bare boolean-indexed
    # slice can trigger pandas' SettingWithCopyWarning.
    return df_raw[keep].copy()


# Columns screened for outliers.
colunas = ["subints", "profile", "subbands"]
# Keep only the rows that remove_outliers accepts for every column listed above.
df_filtered = remove_outliers(df_raw, colunas)

# Summary statistics of the filtered data, transposed to one row per described column.
df_filtered.describe().T



In [None]:
# # ATTENTION
# # subbands became all zeros after removing outliers; check whether this attribute is important
# df_filtered = df_filtered.drop(columns="subbands")


# # The "profile" column varies too much (min 5.260601e+96, max 1.203514e+98)
# from sklearn.preprocessing import PowerTransformer

# pt = PowerTransformer(method='yeo-johnson')
# df_filtered['profile'] = pt.fit_transform(df_filtered[['profile']])

# df_filtered.describe().T

In [None]:
# # We spent too much time on these problematic columns, so I will remove them
# colunas = ["subints", "profile", "subbands"]
# df = df_filtered.drop(columns=colunas)


# Drop an arbitrary selection of columns to make the problem harder.
# Entries left commented out inside the lists are NOT dropped.
disperso = [
    'subints',
    'subbands',
    'profile',
    'period_index',
    'accn_values',
]
concentrado = [
    'dm_index',
    'dm_values',
    'dm_curve_snr_values',
    # 'accn_curve_snr_values',
]
numero = [
    'rajd',
    'decjd',
    'bary_period',
    'topo_period',
    'dm',
    'snr',
    'width',
    'accn',
    'hits',
    'rank',
    # 'fftsnr',
]

# Remove all three groups in a single call.
df = df_filtered.drop(columns=[*disperso, *concentrado, *numero])

df.shape


In [None]:
from sklearn.model_selection import train_test_split

# Subsample 1000 rows, stratified on the "pulsar" column and seeded with SEED.
# The second return value (the remaining rows) is discarded.
df, _ = train_test_split(df, train_size=1000, stratify=df["pulsar"], random_state=SEED)

# Shape after subsampling.
df.shape

In [None]:
# Every column except the "pulsar" target is used as a feature.
# NOTE(review): no dtype filtering happens here; this assumes the remaining
# columns are all numeric — confirm.
X = df.drop("pulsar", axis=1)

# Raw target values, and the boolean target (True where pulsar > 0).
yreg = df["pulsar"]
ycla = yreg.gt(0)

print(X.shape)

## Classificação

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer

In [None]:
# Metric optimized by every grid search below (try other metrics as well).
scorer = make_scorer(accuracy_score)

# `cv`: 15 stratified, shuffled folds for the outer evaluation.
# `gscv`: 3 stratified, shuffled folds used inside each GridSearchCV for
# hyper-parameter selection.
cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=SEED)
gscv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

# Candidate models, keyed by a short name. Each value is a GridSearchCV that
# wraps a preprocessing + classifier Pipeline, so every fit tunes the listed
# hyper-parameters on the `gscv` folds. All pipelines start by imputing
# missing values with the column mean.
algorithms = {
    # k-nearest neighbours on min-max scaled features, after dropping
    # features whose variance does not exceed the threshold.
    'kNN':  GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', MinMaxScaler(feature_range=(0, 1))),
            ('selector', VarianceThreshold()),
            ('knn', KNeighborsClassifier())]),
        param_grid={
            'selector__threshold': [0, 0.01, 0.02, 0.03],
            'knn__n_neighbors': [1, 3, 5],
            'knn__p': [1, 2],
        },
        scoring=scorer,
        cv=gscv),


    # Depth-limited decision tree.
    'tree':  GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('tree', DecisionTreeClassifier(random_state=SEED))]),
        param_grid={
            'tree__max_depth': [5, 10, 20],
            'tree__criterion': ['entropy', 'gini'],
        },
        scoring=scorer,
        cv=gscv),


    # Decision tree with no depth limit; only the split criterion is tuned.
    'bigtree':  GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('tree', DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=SEED))]),
        param_grid={
            'tree__criterion': ['entropy', 'gini'],
        },
        scoring=scorer,
        cv=gscv),


    # Gaussian naive Bayes on the k best features chosen by SelectKBest.
    'nb': GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('selector', SelectKBest()),
            ('nb', GaussianNB())]),
        param_grid={
            'selector__k': [3, 5, 10],
        },
        scoring=scorer,
        cv=gscv),


    # Linear-kernel SVM on standardized, PCA-projected features.
    'svmlinear': GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            # Seeded like the PCA in 'svmrbf' so the projection is
            # reproducible if a randomized SVD solver gets selected.
            ('pca', PCA(random_state=SEED)),
            ('svm', SVC(kernel='linear', random_state=SEED))]),
        param_grid={
            'pca__n_components': [2, 5, 10],
            'svm__C': [1.0, 2.0],
        },
        scoring=scorer,
        cv=gscv),


    # RBF-kernel SVM on standardized, PCA-projected features.
    'svmrbf': GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('pca', PCA(random_state=SEED)),
            ('svm', SVC(kernel='rbf', random_state=SEED))]),
        param_grid={
            'pca__n_components': [2, 5, 10],
            'svm__C': [1.0, 2.0],
            'svm__gamma': [0.1, 1.0, 2.0],
        },
        scoring=scorer,
        cv=gscv),
}

In [None]:
from sklearn.model_selection import cross_val_score

# Evaluate each grid-searched pipeline on the `cv` folds; every algorithm
# contributes one array of per-fold scores.
fold_scores = {}
for alg, clf in algorithms.items():
    fold_scores[alg] = cross_val_score(clf, X, ycla, cv=cv)

# One column per algorithm, one row per fold.
result = pd.DataFrame(fold_scores)


In [None]:
# Print the table of cross-validation scores.
print(result)

In [None]:
# Summarize each column of `result` as "mean ± standard deviation", formatted to 2 decimals.
result.apply(lambda x: "{:.2f} ± {:.2f}".format(x.mean(), x.std()))

In [None]:
import matplotlib.pyplot as plt

# One box per column of `result`, in column order.
plt.boxplot([result[name] for name in result.columns])
# Boxes are drawn at x = 1..n, so label those positions with the column names.
plt.xticks(np.arange(1, result.shape[1] + 1), result.columns)

# Dotted green reference line at y = 1.
plt.axhline(1, color="green", linestyle="dotted")
# plt.ylim(0,1.05)  # Do not lie to colleagues with the data

plt.show()