<a href="https://colab.research.google.com/github/micaelCZ/Paper_Repositorio/blob/main/rf_a_FACHERITO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import randint
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, recall_score, precision_score



In [3]:
# load the Dataset
datapath = 'https://raw.githubusercontent.com/jonathan-elian-toapanta/INTERNETWORKING/main/ESCENARIOS/ScenarioA.csv'
dataframe = pd.read_csv(datapath, low_memory=False, sep=',')

dataframe.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,10.0.2.15,53913,216.58.208.46,80,6,435,0.0,4597.701149,435.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
1,10.0.2.15,53913,216.58.208.46,80,6,259,0.0,7722.007722,259.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
2,10.0.2.15,53913,216.58.208.46,80,6,891,0.0,2244.668911,891.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
3,10.0.2.15,53913,216.58.208.46,80,6,1074,0.0,1862.197393,1074.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR
4,10.0.2.15,53913,216.58.208.46,80,6,315,0.0,6349.206349,315.0,0.0,...,0,0,0,0,0,0,0,0,0,nonTOR


In [4]:
# Reduce the number of examples of the minority class
nonTOR_df = dataframe[dataframe['label'] == 'nonTOR']
TOR_df = dataframe[dataframe['label'] == 'TOR'].sample(n=len(nonTOR_df) // 2, replace=True, random_state=42)
dataframe = pd.concat([nonTOR_df, TOR_df], axis=0).reset_index(drop=True)


In [5]:
# Normalize the data

def dfNormalize(df):
    for feature_name in df.columns:
        df.loc[:, feature_name] = pd.to_numeric(df.loc[:, feature_name], errors='coerce').fillna(0)
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        if (max_value - min_value) > 0:
            df.loc[:, feature_name] = (df.loc[:, feature_name] - min_value) / (max_value - min_value)
        else:
            df.loc[:, feature_name] = (df.loc[:, feature_name] - min_value)
    return df

dataframe = dataframe.reindex(np.random.permutation(dataframe.index)).copy()
keys = dataframe.keys()
data_to_process = dataframe[keys[4:len(keys) - 1]].copy()
x_normalised = dfNormalize(data_to_process)

  df.loc[:, feature_name] = (df.loc[:, feature_name] - min_value) / (max_value - min_value)


In [6]:
change_labels = lambda x: 1 if x == 'nonTOR' else 0
y_normalised = dataframe['label'].apply(change_labels)
X_train, X_test, y_train, y_test = train_test_split(x_normalised, y_normalised, test_size=0.3, random_state=42)



In [7]:
# IMPUTE
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


In [8]:
# RANDOM FOREST
param_dist = {
    'n_estimators': randint(10, 50),  # reduce the number of estimators
    'max_depth': [2, 5],  # reduce the maximum depth of the trees
    'min_samples_split': [2, 3, 4],  # increase the minimum number of samples required to split a node
    'min_samples_leaf': [1, 2],  # increase the minimum number of samples required to form a leaf
    'max_features': ['sqrt', 'log2']  # reduce the maximum number of features used in each tree
}

random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_dist, n_iter=50,
                                   cv=5, scoring=make_scorer(f1_score), random_state=42)
random_search.fit(X_train_imputed, y_train)
print(f'Best parameters: {random_search.best_params_}')


Best parameters: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 49}


In [9]:
y_pred = random_search.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.4f}')

Test accuracy: 0.9218


In [10]:
# Train a Random Forest classifier
rfc = RandomForestClassifier(n_estimators=random_search.best_params_['n_estimators'], 
                              max_depth=random_search.best_params_['max_depth'],
                              min_samples_split=random_search.best_params_['min_samples_split'],
                              min_samples_leaf=random_search.best_params_['min_samples_leaf'],
                              max_features=random_search.best_params_['max_features'],
                              random_state=42)
rfc.fit(X_train_imputed, y_train)



In [11]:
# Predict on test data and evaluate accuracy, F1, Recall, and Precision
y_pred = rfc.predict(X_test_imputed)
accuracy = (y_pred == y_test).mean()
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)


In [12]:

print(f'Test accuracy: {accuracy:.4f}')
print(f'F1 score: {f1:.4f}')
print(f'Recall score: {recall:.4f}')
print(f'Precision score: {precision:.4f}')

Test accuracy: 0.9218
F1 score: 0.9394
Recall score: 0.9107
Precision score: 0.9700
