<a href="https://colab.research.google.com/github/micaelCZ/Paper_Repositorio/blob/main/B_RANDOM_FOREST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow-addons

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_curve, auc, classification_report
import matplotlib.pyplot as plt
     
# Load the data (downloaded and parsed once).
url = 'https://raw.githubusercontent.com/micaelCZ/Paper_Repositorio/main/dataset/datasetPreprocesado/Escenario2.csv'
df = pd.read_csv(url, low_memory=False)
# `dataframe` is not used in the visible code; it is kept as an independent
# copy in case later cells rely on the name, without fetching the CSV again.
dataframe = df.copy()
     

# Normalise the data
def dfNormalize(df):
    """Return a min-max normalised copy of ``df``.

    Every column is coerced to numeric; values that cannot be parsed (and
    missing values) become 0. Columns with a positive range are rescaled to
    [0, 1]; constant columns are shifted by their minimum, i.e. become all 0.

    Args:
        df: DataFrame whose columns should be normalised.

    Returns:
        A new DataFrame with the same index and columns, holding float
        values. The input frame is left untouched.
    """
    # Work on a copy: callers often pass a slice of a larger frame, and
    # writing into it in place triggers chained-assignment warnings.
    normalized = df.copy()
    for feature_name in normalized.columns:
        column = (
            pd.to_numeric(normalized[feature_name], errors='coerce')
            .fillna(0)
            .astype(float)
        )
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range > 0:
            column = (column - min_value) / value_range
        else:
            # Constant (or empty) column: avoid dividing by zero.
            column = column - min_value
        # Replace the whole column instead of writing through .loc, so the
        # float result never has to fit into the old int/object dtype.
        normalized[feature_name] = column
    return normalized
     

# Encode the string labels as integer class ids
label_map = {'AUDIO': 0, 'BROWSING': 1, 'CHAT': 2, 'FILE-TRANSFER' : 3, 'MAIL' : 4, 'P2P' : 5, 'VIDEO' : 6, 'VOIP' : 7}


def change_labels(x):
    """Return the class id for label ``x``, or -1 if it is not in ``label_map``."""
    return label_map.get(x, -1)


y = df['label'].apply(change_labels)

# Features: every column from position 4 up to, but excluding, the last one.
# Take an explicit copy so normalisation never writes into a slice of `df`.
X = df.iloc[:, 4:-1].copy()
X = dfNormalize(X)

# Split the data into train and test sets (80/20, stratified on the class id)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyperparameter search space for the random forest
param_grid = dict(
    max_depth=[5, 10, 15],
    n_estimators=[50, 100, 200],
)

# Base classifier wrapped in a 5-fold grid search over `param_grid`
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

# Run the search on the training data, then score the grid-search
# estimator itself with 5-fold cross-validation on the same training data
grid_search.fit(X_train, y_train)
cv_scores = cross_val_score(estimator=grid_search, X=X_train, y=y_train, cv=5)

print(f"Cross validation scores: {cv_scores}")
print(f"Mean cross validation score: {cv_scores.mean()}")

# Take the best estimator found by the search and report its
# per-class metrics on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Cross validation scores: [0.82595183 0.83993784 0.83139083 0.83527584 0.83216783]
Mean cross validation score: 0.8329448329448331
              precision    recall  f1-score   support

           0       0.77      0.76      0.76       144
           1       0.66      0.83      0.74       321
           2       0.50      0.25      0.33        65
           3       0.91      0.92      0.92       173
           4       0.78      0.38      0.51        56
           5       0.98      0.97      0.97       217
           6       0.77      0.71      0.74       175
           7       0.99      0.98      0.99       458

    accuracy                           0.84      1609
   macro avg       0.80      0.72      0.74      1609
weighted avg       0.84      0.84      0.84      1609

