<a href="https://colab.research.google.com/github/micaelCZ/Paper_Repositorio/blob/main/A%20RANDOM%20FOREST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Load the pre-processed "Escenario1" dataset (a semicolon-separated CSV
# hosted in the paper's GitHub repository).
datapath = (
    'https://raw.githubusercontent.com/micaelCZ/Paper_Repositorio/main/'
    'dataset/datasetPreprocesado/Escenario1.csv'
)
dataframe = pd.read_csv(datapath, sep=';', low_memory=False)

# Normalise the data
def dfNormalize(df):
    """Min-max scale every column of *df* in place and return *df*.

    Each column is first coerced to numeric; values that cannot be parsed
    (and missing values) become 0. A column whose values span a positive
    range is rescaled to [0, 1]; a constant column is shifted so that all
    of its values are 0.

    Args:
        df: DataFrame whose columns should all be scaled. It is modified
            in place.

    Returns:
        The same DataFrame object, with every column stored as float64.
    """
    for feature_name in df.columns:
        column = (
            pd.to_numeric(df[feature_name], errors='coerce')
            .fillna(0)
            .astype(float)
        )
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range > 0:
            column = (column - min_value) / value_range
        else:
            # Constant column: avoid dividing by zero, just shift to 0.
            column = column - min_value
        # Replace the whole column instead of writing through
        # ``df.loc[:, name] = ...``: the latter sets values into the
        # existing column, so an object column would stay object and an
        # integer column would need a deprecated implicit upcast.
        df[feature_name] = column
    return df

# Seed shared by every randomised step (shuffle, split, forests) so that
# repeated runs of this cell produce the same results.
RANDOM_STATE = 42

# Shuffle the rows.
shuffled_index = np.random.RandomState(RANDOM_STATE).permutation(dataframe.index)
dataframe = dataframe.reindex(shuffled_index).copy()

# Feature columns: everything from the fifth column up to, but excluding,
# the last one.
# NOTE(review): presumably the first four columns are flow identifiers and
# the last column is the label -- confirm against the CSV header.
keys = dataframe.keys()
data_to_process = dataframe[keys[4:-1]].copy()
# NOTE(review): scaling uses min/max of the whole dataset (train + test).
# Per-feature affine scaling does not change tree splits, so this should not
# affect the random forest, but fit the scaling on the training split only
# if a scale-sensitive model is ever used here.
x_normalised = dfNormalize(data_to_process)


# Map the labels to 0 or 1
def change_labels(x):
    """Return 1 for the 'nonTOR' label and 0 for any other label."""
    return 1 if x == 'nonTOR' else 0


y_normalised = dataframe['label'].apply(change_labels)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    x_normalised, y_normalised, test_size=0.3, random_state=RANDOM_STATE
)

# Hyper-parameter grid explored by GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base model used by the grid search
model = RandomForestClassifier(random_state=RANDOM_STATE)

# 108 parameter combinations x 5 folds = 540 fits, so use every core.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, n_jobs=-1
)

# Run the grid search on the training split only
grid_search.fit(X_train, y_train)

# Best hyper-parameters found by the grid search
best_params = grid_search.best_params_

# Cross-validate the *tuned* model (not the default one), and only on the
# training split so the test set stays untouched until the final report.
model = RandomForestClassifier(**best_params, random_state=RANDOM_STATE)
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)

# Train the tuned model on the training split only. Fitting on the full
# dataset would include the test rows and make the report below meaningless.
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Print precision, recall, f1-score and accuracy
print(classification_report(y_test, y_pred))


Cross-validation scores: [0.99071429 0.99285714 0.99071429 0.99       0.99071429]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1043
           1       1.00      1.00      1.00      1057

    accuracy                           1.00      2100
   macro avg       1.00      1.00      1.00      2100
weighted avg       1.00      1.00      1.00      2100

