In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Leer datos desde URL
datapath = 'https://raw.githubusercontent.com/beespinosa1/Inter/main/Escenario3.csv'
names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'label']
dataframe = pd.read_csv(datapath, names=names, low_memory=False, sep=',')

# Codificar variables categóricas
columnas_codificar = ['srcip', 'dstip', 'dsport', 'proto', 'state', 'dur', 'service', 'Sload', 'Dload', 'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'ct_ftp_cmd', 'attack_cat']
for col in columnas_codificar:
    le = LabelEncoder()
    columna_codificada = le.fit_transform(dataframe[col])
    dataframe[col] = columna_codificada

X = dataframe.iloc[:, :-1].values
scaler = StandardScaler()
x_normalised = scaler.fit_transform(X)
y = dataframe['label']

imputer = KNNImputer(n_neighbors=5)
X_train_imputed = imputer.fit_transform(x_normalised)

# Imputar los valores faltantes con la media de cada columna
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Convertir el resultado a un DataFrame de pandas
X = pd.DataFrame(X_imputed, columns=dataframe.columns[:-1])

y = dataframe['label']

# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_normalised = scaler.fit_transform(X_train)
X_test_normalised = scaler.transform(X_test)

imputer = KNNImputer(n_neighbors=5)
X_train_imputed = imputer.fit_transform(X_train_normalised)
X_test_imputed = imputer.transform(X_test_normalised)

# Definir hiperparámetros para grid search
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 5, 10],
    'random_state': [42],
}

# Realizar grid search y cross validation
rf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(rf, param_grid, cv=5)
rf_cv.fit(X_train_imputed, y_train)

# Obtener resultados
y_pred = rf_cv.predict(X_test_imputed)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
scores = cross_val_score(rf_cv.best_estimator_, X_train_imputed, y_train, cv=5)

# Imprimir resultados
print('Precision:', np.mean(precision))
print('Recall:', np.mean(recall))
print('F1 Score:', np.mean(f1))



Precision: 1.0
Recall: 1.0
F1 Score: 1.0
