# IDS con Machine Learning (NSL-KDD) — EDA + 10 Modelos + 3 Ensambles

Este notebook realiza un análisis exploratorio de datos (EDA) con Plotly y entrena múltiples modelos de Machine Learning para **detección de intrusiones** usando el dataset **NSL-KDD**. Incluye además tres métodos de ensamble (Voting, Stacking, Bagging).

In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support, roc_auc_score,
                            classification_report, confusion_matrix, roc_curve, precision_recall_curve)
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, VotingClassifier, StackingClassifier, BaggingClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

OUTPUTS = Path('CiberTelepatia') / 'outputs'
OUTPUTS.mkdir(parents=True, exist_ok=True)

TRAIN_URL = 'https://raw.githubusercontent.com/jmnwong/NSL-KDD-Dataset/master/KDDTrain%2B.txt'
TEST_URL = 'https://raw.githubusercontent.com/jmnwong/NSL-KDD-Dataset/master/KDDTest%2B.txt'

COLUMN_NAMES = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack', 'difficulty'
]


## 1. Cargar datos

In [2]:
train_df = pd.read_csv(TRAIN_URL, header=None, names=COLUMN_NAMES)
test_df = pd.read_csv(TEST_URL, header=None, names=COLUMN_NAMES)
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


## 2. Preprocesamiento

In [3]:
def preprocess(train_df: pd.DataFrame, test_df: pd.DataFrame):
    train_df = train_df.copy()
    test_df = test_df.copy()
    # Etiqueta binaria
    train_df['is_attack'] = (train_df['attack'] != 'normal').astype(int)
    test_df['is_attack'] = (test_df['attack'] != 'normal').astype(int)
    # Quitar columnas no usadas
    train_df.drop(columns=['attack', 'difficulty'], inplace=True)
    test_df.drop(columns=['attack', 'difficulty'], inplace=True)
    # One-hot categóricas
    cat_cols = ['protocol_type', 'service', 'flag']
    train_proc = pd.get_dummies(train_df, columns=cat_cols)
    test_proc = pd.get_dummies(test_df, columns=cat_cols)
    # Alinear columnas
    missing = set(train_proc.columns) - set(test_proc.columns)
    for c in missing: test_proc[c] = 0
    extra = set(test_proc.columns) - set(train_proc.columns)
    for c in extra: test_proc.drop(columns=[c], inplace=True)
    test_proc = test_proc[train_proc.columns]
    # Separar X,y
    y_train = train_proc['is_attack']
    X_train = train_proc.drop(columns=['is_attack'])
    y_test = test_proc['is_attack']
    X_test = test_proc.drop(columns=['is_attack'])
    # Escalar numéricas (evitar dummies)
    num_cols = [c for c in X_train.columns if not (c.startswith('protocol_type_') or c.startswith('service_') or c.startswith('flag_'))]
    scaler = StandardScaler(with_mean=False)
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])
    return X_train, y_train, X_test, y_test, num_cols

X_train, y_train, X_test, y_test, num_cols = preprocess(train_df, test_df)
X_train.shape, X_test.shape

((125973, 122), (22544, 122))

## 3. EDA (Análisis Exploratorio) con Plotly

In [4]:
# 3.1 Distribución de tipos de ataque (en train)
attack_counts = train_df['attack'].value_counts().reset_index()
attack_counts.columns = ['Attack Type', 'Count']
fig = px.bar(attack_counts, x='Attack Type', y='Count', title='Distribución de Tipos de Ataque (Train)')
fig.show()
fig.write_html(OUTPUTS / 'attack_type_distribution.html')

In [5]:
# 3.2 Balance de clases (normal vs ataque)
class_counts = train_df.assign(is_attack = (train_df['attack'] != 'normal').astype(int))['is_attack'].value_counts()
fig = px.pie(values=class_counts.values, names=['Normal', 'Ataque'], title='Balance de clases (Train)')
fig.show()
fig.write_html(OUTPUTS / 'class_balance_pie.html')

In [6]:
# 3.3 Distribuciones numéricas (muestra)
num_df = train_df.select_dtypes(include=np.number).copy()
num_cols_simple = num_df.columns[:8]  # primeras 8 para ejemplo rápido
fig = make_subplots(rows=2, cols=4, subplot_titles=list(num_cols_simple))
r, c = 1, 1
for col in num_cols_simple:
    hist = go.Histogram(x=num_df[col], nbinsx=50, name=col, showlegend=False)
    fig.add_trace(hist, row=r, col=c)
    c += 1
    if c==5:
        r += 1; c = 1
fig.update_layout(title='Distribuciones de variables numéricas (subset)')
fig.show()
fig.write_html(OUTPUTS / 'numeric_distributions.html')

In [7]:
# 3.4 Boxplot de bytes totales (log) vs tipo de conexión
tmp = train_df.copy()
tmp['is_attack'] = (tmp['attack']!='normal').astype(int)
tmp['importance_score'] = np.log(tmp['src_bytes'] + tmp['dst_bytes'] + 1)
fig = px.box(tmp, x='is_attack', y='importance_score',
             title='Puntuación de Importancia (log(src+dst+1)) por tipo', labels={'is_attack':'0=Normal,1=Ataque'})
fig.show()
fig.write_html(OUTPUTS / 'importance_vs_attack.html')

In [8]:
# 3.5 Dispersión duración vs src_bytes (muestra), coloreado por ataque
sample_df = train_df.sample(n=min(12000, len(train_df)), random_state=42)
fig = px.scatter(sample_df, x='duration', y='src_bytes', color=(sample_df['attack']!='normal'),
                 title='Duración vs src_bytes (muestra) por tipo', labels={'color':'Ataque?'},
                 log_x=True, log_y=True)
fig.show()
fig.write_html(OUTPUTS / 'duration_vs_src_bytes.html')

## 4. Importancia de variables (ExtraTrees)

In [9]:
et = ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1)
et.fit(X_train, y_train)
imp = pd.Series(et.feature_importances_, index=X_train.columns).sort_values(ascending=False).head(20)
fig = px.bar(imp[::-1], orientation='h', title='Top 20 características por importancia (ExtraTrees)')
fig.show()
fig.write_html(OUTPUTS / 'feature_importance_extratrees.html')

## 5. Modelado: 10+ clasificadores

In [10]:
models = {
    'LogReg': LogisticRegression(max_iter=1000, solver='saga', n_jobs=-1),
    'LinearSVC': LinearSVC(),
    'SVC_RBF': SVC(kernel='rbf', C=3.0, gamma='scale', probability=False),
    'KNN': KNeighborsClassifier(n_neighbors=15),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    'GradBoost': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'GaussianNB': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis()
}

def evaluate_all(models, X_train, y_train, X_test, y_test):
    rows = []
    for name, m in models.items():
        print(f'Entrenando {name}…')
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', zero_division=0)
        # ROC-AUC con decision_function o predict_proba si existe
        try:
            if hasattr(m, 'predict_proba'):
                y_score = m.predict_proba(X_test)[:,1]
            elif hasattr(m, 'decision_function'):
                y_score = m.decision_function(X_test)
            else:
                y_score = None
            roc = roc_auc_score(y_test, y_score) if y_score is not None else np.nan
        except Exception:
            roc = np.nan
        rows.append({'model':name, 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'roc_auc':roc})
    return pd.DataFrame(rows).sort_values(by=['f1','accuracy'], ascending=False)

results = evaluate_all(models, X_train, y_train, X_test, y_test)
results

Entrenando LogReg…



The max_iter was reached which means the coef_ did not converge



Entrenando LinearSVC…
Entrenando SVC_RBF…
Entrenando KNN…
Entrenando DecisionTree…
Entrenando RandomForest…
Entrenando ExtraTrees…
Entrenando GradBoost…
Entrenando AdaBoost…
Entrenando GaussianNB…
Entrenando LDA…


Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
7,GradBoost,0.807266,0.971765,0.681212,0.800953,0.944143
6,ExtraTrees,0.790055,0.967884,0.652848,0.779748,0.95736
4,DecisionTree,0.785353,0.964767,0.646536,0.774227,0.807856
2,SVC_RBF,0.784422,0.960707,0.647783,0.773806,0.8898
10,LDA,0.761666,0.924926,0.632666,0.751377,0.848395
5,RandomForest,0.768187,0.967663,0.613263,0.750739,0.961077
8,AdaBoost,0.764505,0.963757,0.609211,0.746527,0.932736
3,KNN,0.763973,0.967164,0.605938,0.745077,0.841318
1,LinearSVC,0.751553,0.916398,0.62012,0.739694,0.786178
0,LogReg,0.734785,0.913989,0.589574,0.716783,0.860326


In [11]:
# 5.1 Barras de Accuracy y F1
fig = go.Figure()
fig.add_bar(x=results['model'], y=results['accuracy'], name='Accuracy')
fig.add_bar(x=results['model'], y=results['f1'], name='F1')
fig.update_layout(barmode='group', title='Rendimiento por modelo (Accuracy y F1)', xaxis_title='Modelo', yaxis_title='Score')
fig.show()
fig.write_html(OUTPUTS / 'accuracy_f1_by_model.html')

In [12]:
# 5.2 Matrices de confusión de los 4 mejores por F1
top4 = results['model'].head(4).tolist()
fig = make_subplots(rows=2, cols=2, subplot_titles=top4)
r=c=1
for name in top4:
    m = models[name]
    y_pred = m.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    heat = go.Heatmap(z=cm, x=['Pred Normal','Pred Ataque'], y=['Real Normal','Real Ataque'], showscale=False)
    fig.add_trace(heat, row=r, col=c)
    c += 1
    if c==3: r += 1; c = 1
fig.update_layout(title='Matrices de confusión (Top-4 F1)')
fig.show()
fig.write_html(OUTPUTS / 'confusion_matrices_top4.html')

In [13]:
# 5.3 Curvas ROC de los 4 mejores (si es posible)
fig = go.Figure()
for name in top4:
    m = models[name]
    try:
        if hasattr(m, 'predict_proba'):
            y_score = m.predict_proba(X_test)[:,1]
        elif hasattr(m, 'decision_function'):
            y_score = m.decision_function(X_test)
        else:
            y_score = None
        if y_score is not None:
            fpr, tpr, _ = roc_curve(y_test, y_score)
            fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=name))
    except Exception:
        pass
fig.add_trace(go.Scatter(x=[0,1], y=[0,1], mode='lines', name='Azar', line=dict(dash='dash')))
fig.update_layout(title='Curvas ROC (Top-4)')
fig.show()
fig.write_html(OUTPUTS / 'roc_curves_top4.html')

## 6. Ensambles: Voting, Stacking, Bagging

In [14]:
# Voting (hard)
voting = VotingClassifier(estimators=[('lr', models['LogReg']), ('rf', models['RandomForest']), ('et', models['ExtraTrees'])], voting='hard', n_jobs=-1)
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', zero_division=0)
ens_results = pd.DataFrame([{'model':'Voting(hard)', 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'roc_auc':np.nan}])
ens_results

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
0,Voting(hard),0.772889,0.967852,0.621678,0.75707,


In [15]:
# Stacking
stacking = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)),
               ('gb', GradientBoostingClassifier(random_state=42)),
               ('lda', LinearDiscriminantAnalysis())],
    final_estimator=LogisticRegression(max_iter=1000, solver='lbfgs'), n_jobs=-1)
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', zero_division=0)
ens_results = pd.concat([ens_results, pd.DataFrame([{'model':'Stacking', 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'roc_auc':np.nan}])], ignore_index=True)
ens_results

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
0,Voting(hard),0.772889,0.967852,0.621678,0.75707,
1,Stacking,0.791075,0.968725,0.654095,0.78091,


In [17]:
# Bagging
bagging = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=100, random_state=42, n_jobs=-1)
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', zero_division=0)
ens_results = pd.concat([ens_results, pd.DataFrame([{'model':'Bagging(Tree)', 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'roc_auc':np.nan}])], ignore_index=True)
ens_results

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
0,Voting(hard),0.772889,0.967852,0.621678,0.75707,
1,Stacking,0.791075,0.968725,0.654095,0.78091,
2,Bagging(Tree),0.79937,0.968222,0.669524,0.791634,


In [18]:
# 6.1 Comparativa final modelos + ensambles
final_results = pd.concat([results, ens_results], ignore_index=True).sort_values(by=['f1','accuracy'], ascending=False)
final_results.to_csv(OUTPUTS / 'results_summary.csv', index=False)
fig = go.Figure()
fig.add_bar(x=final_results['model'], y=final_results['f1'], name='F1')
fig.update_layout(title='Comparativa final (Modelos + Ensambles) — F1', xaxis_title='Modelo', yaxis_title='F1')
fig.show()
fig.write_html(OUTPUTS / 'final_f1_comparison.html')
final_results

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
0,GradBoost,0.807266,0.971765,0.681212,0.800953,0.944143
13,Bagging(Tree),0.79937,0.968222,0.669524,0.791634,
12,Stacking,0.791075,0.968725,0.654095,0.78091,
1,ExtraTrees,0.790055,0.967884,0.652848,0.779748,0.95736
2,DecisionTree,0.785353,0.964767,0.646536,0.774227,0.807856
3,SVC_RBF,0.784422,0.960707,0.647783,0.773806,0.8898
11,Voting(hard),0.772889,0.967852,0.621678,0.75707,
4,LDA,0.761666,0.924926,0.632666,0.751377,0.848395
5,RandomForest,0.768187,0.967663,0.613263,0.750739,0.961077
6,AdaBoost,0.764505,0.963757,0.609211,0.746527,0.932736
