In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import NearMiss
from collections import Counter
import gc

In [3]:
# Load the training set from a CSV file in the working directory.
df = pd.read_csv("df_train.csv")

In [3]:
# Separate the target column from the feature columns.
y = df['label']
x = df.drop(columns='label')

# Treat +/-inf as missing, then impute every missing value with its column mean.
x_clean = (
    x.replace([np.inf, -np.inf], np.nan)
     .pipe(lambda frame: frame.fillna(frame.mean()))
)

# Plain arrays for the estimator.
x1, y1 = x_clean.values, y.values

In [4]:
# Free the intermediate frames; only the arrays x1 / y1 are needed for training.
# `df` is intentionally kept: the feature-reduction cell further down reads its
# column names and slices it, so deleting it here would raise NameError when the
# notebook is run top to bottom.
del x, y, x_clean

In [13]:
# Downcast the feature matrix to single precision.
x1 = x1.astype(np.float32)

# Estimators and the cross-validation splitter (all seeded for reproducibility).
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=30,
    n_jobs=-1,
    random_state=42,
)
rf_base = RandomForestClassifier(max_features='sqrt', random_state=42)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# One accumulator per evaluation metric, plus one for the feature importances.
test_scores_RF_mdi = []
precision_scores_RF_mdi = []
recall_scores_RF_mdi = []
f1_scores_RF_mdi = []
feature_importances_list = []

In [14]:
# Stratified 80/20 hold-out split.
X_train, X_test, Y_train, Y_test = train_test_split(
    x1,
    y1,
    test_size=0.2,
    stratify=y1,
    random_state=42,
)

# Fit the forest and record its impurity-based feature importances.
rf.fit(X_train, Y_train)
feature_importances_list.append(rf.feature_importances_)

# Score the held-out predictions; precision/recall/F1 use the weighted average.
y_pred = rf.predict(X_test)
test_scores_RF_mdi.append(accuracy_score(Y_test, y_pred))
for score_store, metric_fn in (
    (precision_scores_RF_mdi, precision_score),
    (recall_scores_RF_mdi, recall_score),
    (f1_scores_RF_mdi, f1_score),
):
    score_store.append(metric_fn(Y_test, y_pred, average='weighted'))

# Release the split arrays and the fitted model, then force a collection.
del X_train, X_test, Y_train, Y_test, rf, y_pred, score_store, metric_fn
gc.collect()


5398

In [19]:
# Feature reduction: keep only the features whose mean importance is at or
# above the median importance, together with the target column.
importances_mean = np.asarray(feature_importances_list).mean(axis=0)
importances_series = pd.Series(importances_mean, index=df.columns.drop('label'))
selected_features = (
    importances_series
    .loc[lambda s: s >= s.median()]
    .index
    .tolist()
)
df_reduced = df[[*selected_features, 'label']]

In [20]:
# One-row summary table: the mean of each recorded metric for the forest.
metrics_mdi = pd.DataFrame([
    {
        "Modelo": "RandomForest",
        "Acurácia": np.mean(test_scores_RF_mdi),
        "Precisão": np.mean(precision_scores_RF_mdi),
        "Recall": np.mean(recall_scores_RF_mdi),
        "F1 Score": np.mean(f1_scores_RF_mdi),
    }
])

# Persist the metrics and the reduced dataset (row index omitted from both files).
metrics_mdi.to_csv("metrics_mdi.csv", index=False)
df_reduced.to_csv("df_reduced.csv", index=False)

In [22]:
# Show how many rows each label has in the reduced dataset.
df_reduced['label'].value_counts()

label
0    6879056
1    1722730
2     381790
3     288157
4     158930
Name: count, dtype: int64

In [1]:
# Balanced subsample: draw 1000 rows from every class, each class sampled with
# the same seed so the selection is reproducible.
# The groups are iterated and concatenated explicitly instead of using
# groupby(...).apply(...): apply operating on the grouping column is deprecated
# since pandas 2.2, and once the grouping column is excluded the 'label' column
# would be missing from the result. Group order (sorted by label) and the fresh
# 0..N-1 index match what the apply/reset_index version produced.
df_sampled = pd.concat(
    [group.sample(n=1000, random_state=42) for _, group in df_reduced.groupby('label')],
    ignore_index=True,
)

NameError: name 'df_reduced' is not defined

In [26]:
# Write the sampled dataset to disk, omitting the row index.
df_sampled.to_csv("df_sampled.csv", index=False)

In [27]:
# Show how many rows each label has in the sampled dataset.
df_sampled['label'].value_counts()

label
0    100
1    100
2    100
3    100
4    100
Name: count, dtype: int64