In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import NearMiss
from collections import Counter
import gc

In [3]:
df = pd.read_csv("df_train.csv")

In [13]:
df.dtypes.value_counts()

int64      40
float64    37
Name: count, dtype: int64

In [None]:
def reduzir_tipos(df):
    df_otimizado = df.copy()

    for col in df.columns:
        col_data = df[col]

        if pd.api.types.is_numeric_dtype(col_data):
            col_min = col_data.min()
            col_max = col_data.max()

            # transforma em booleano se tiver só 0 e 1
            if set(col_data.unique()).issubset({0, 1}):
                df_otimizado[col] = col_data.astype(bool)
                continue

            # inteirso
            if pd.api.types.is_integer_dtype(col_data):
                if np.iinfo(np.int8).min <= col_min and col_max <= np.iinfo(np.int8).max:
                    df_otimizado[col] = col_data.astype(np.int8)
                elif np.iinfo(np.int16).min <= col_min and col_max <= np.iinfo(np.int16).max:
                    df_otimizado[col] = col_data.astype(np.int16)
                elif np.iinfo(np.int32).min <= col_min and col_max <= np.iinfo(np.int32).max:
                    df_otimizado[col] = col_data.astype(np.int32)
                else:
                    df_otimizado[col] = col_data.astype(np.int64)

            #floats
            elif pd.api.types.is_float_dtype(col_data):
                if np.finfo(np.float16).min <= col_min and col_max <= np.finfo(np.float16).max:
                    df_otimizado[col] = col_data.astype(np.float16)
                elif np.finfo(np.float32).min <= col_min and col_max <= np.finfo(np.float32).max:
                    df_otimizado[col] = col_data.astype(np.float32)
                else:
                    df_otimizado[col] = col_data.astype(np.float64)

    return df_otimizado
df_otimizado = reduzir_tipos(df)
df_otimizado.dtypes.value_counts()

float32    26
bool       18
int32      15
float16     9
int16       4
float64     2
int8        2
int64       1
Name: count, dtype: int64

In [18]:
df_otimizado.to_csv("df_train_otimizado.csv", index=False)

In [12]:
df_otimizado = pd.read_csv("df_train_otimizado.csv")

In [7]:
x = df_otimizado.drop('label', axis=1)
y = df_otimizado['label']

x_clean = x.replace([np.inf, -np.inf], np.nan)
x_clean = x_clean.fillna(x_clean.mean())

x1 = x_clean.values
y1 = y.values

In [8]:
del df_otimizado, x, y, x_clean

In [9]:
rf = RandomForestClassifier(n_estimators=200, max_depth=30, n_jobs=-1, random_state=42)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

test_scores_RF_mdi, precision_scores_RF_mdi, recall_scores_RF_mdi, f1_scores_RF_mdi = [], [], [], []
feature_importances_list = []

rf_base = RandomForestClassifier(max_features='sqrt', random_state=42)

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(x1, y1, test_size=0.2, stratify=y1, random_state=42)

rf.fit(X_train, Y_train)

feature_importances_list.append(rf.feature_importances_)

y_pred = rf.predict(X_test)
test_scores_RF_mdi.append(accuracy_score(Y_test, y_pred))
precision_scores_RF_mdi.append(precision_score(Y_test, y_pred, average='weighted'))
recall_scores_RF_mdi.append(recall_score(Y_test, y_pred, average='weighted'))
f1_scores_RF_mdi.append(f1_score(Y_test, y_pred, average='weighted'))

del X_train, X_test, Y_train, Y_test, rf, y_pred
gc.collect()

72

In [13]:
# Redução de features
importances_mean = np.mean(feature_importances_list, axis=0)
importances_series = pd.Series(importances_mean, index=df_otimizado.drop('label', axis=1).columns)
selected_features = importances_series[importances_series >= importances_series.median()].index.tolist()
df_reduced = df_otimizado[selected_features + ['label']]

In [14]:
metrics_mdi = pd.DataFrame({
    "Modelo": ["RandomForest"],
    "Acurácia": [np.mean(test_scores_RF_mdi)],
    "Precisão": [np.mean(precision_scores_RF_mdi)],
    "Recall": [np.mean(recall_scores_RF_mdi)],
    "F1 Score": [np.mean(f1_scores_RF_mdi)]
})

metrics_mdi.to_csv("metrics_mdi.csv", index=False)
df_reduced.to_csv("df_reduced_quant.csv", index=False)

In [17]:
df_reduced.columns.__len__()

39

In [5]:
df_sampled = df.groupby('label').apply(lambda x: x.sample(n=1000, random_state=42)).reset_index(drop=True)

  df_sampled = df.groupby('label').apply(lambda x: x.sample(n=1000, random_state=42)).reset_index(drop=True)


In [6]:
df_sampled.to_csv("df_sampled.csv", index=False)

In [7]:
df_sampled['label'].value_counts()

label
0    1000
1    1000
2    1000
3    1000
4    1000
Name: count, dtype: int64