# In [None]:





import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import resample
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, BatchNormalization, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import pickle
import joblib
import os


# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings, which can mask real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')


# Seed the NumPy and TensorFlow global random generators with a fixed value.
np.random.seed(42)
tf.random.set_seed(42)


# Plot styling: the seaborn theme is set first, then matplotlib's
# 'fivethirtyeight' style sheet is applied after it.
sns.set(style="whitegrid")
plt.style.use('fivethirtyeight')

print("Libraries imported successfully!")


def load_data(file_path):
    """
    Read the CICIDS 2018 CSV at ``file_path`` into a DataFrame.

    Prints a progress line, the elapsed load time and the resulting shape.

    Returns:
        The loaded DataFrame, or None when reading fails for any reason
        (the error is printed instead of being raised).
    """
    print(f"Loading data from {file_path}...")
    began = time.time()

    try:
        frame = pd.read_csv(file_path)
        elapsed = time.time() - began
        print(f"Data loaded successfully in {elapsed:.2f} seconds!")
        print(f"Dataset shape: {frame.shape}")
    except Exception as err:
        # Best-effort loader: callers test the result against None.
        print(f"Error loading data: {err}")
        frame = None

    return frame


def perform_eda(df):
    """
    Print summary statistics and draw exploratory plots for the dataset.

    Reports the row/column counts, per-column missing values, the 'Label'
    value counts and the dtype breakdown, then shows a label count plot, a
    correlation heatmap of the first ten numeric columns and an interactive
    scatter of 'Flow Duration' against 'Tot Fwd Pkts' on a sample of at most
    10000 rows. Returns None.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("EXPLORATORY DATA ANALYSIS")
    print(banner)

    # --- size ---
    n_rows, n_cols = df.shape
    print("\nBasic Information:")
    print(f"Number of samples: {n_rows}")
    print(f"Number of features: {n_cols}")

    # --- missing values ---
    print("\nMissing values summary:")
    null_counts = df.isnull().sum()
    has_nulls = null_counts > 0
    n_cols_with_nulls = sum(has_nulls)
    print(f"Total features with missing values: {n_cols_with_nulls}")
    if n_cols_with_nulls > 0:
        print(null_counts[has_nulls])

    # --- class distribution ---
    print("\nAttack distribution:")
    print(df['Label'].value_counts())

    plt.figure(figsize=(12, 6))
    count_ax = sns.countplot(data=df, x='Label')
    count_ax.set_xticklabels(count_ax.get_xticklabels(), rotation=45, ha='right')
    count_ax.set_xlabel('Attack Type')
    count_ax.set_ylabel('Number of Attacks')
    plt.title('Distribution of Network Intrusion Types')
    plt.tight_layout()
    plt.show()

    # --- dtypes ---
    print("\nData types:")
    print(df.dtypes.value_counts())

    # --- correlations (first ten numeric columns only, to keep the plot legible) ---
    print("\nGenerating correlation heatmap for key features...")
    leading_numeric = df.select_dtypes(include=[np.number]).iloc[:, :10]
    correlations = leading_numeric.corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Heatmap of Key Features')
    plt.tight_layout()
    plt.show()

    # --- interactive scatter on a bounded random sample ---
    print("\nGenerating interactive scatter plot...")
    n_sampled = min(10000, len(df))
    scatter_rows = df.sample(n_sampled)
    scatter_fig = px.scatter(
        scatter_rows,
        x='Flow Duration',
        y='Tot Fwd Pkts',
        color='Label',
        title='Flow Duration vs Total Forward Packets',
        opacity=0.7,
    )
    scatter_fig.show()



def _balance_by_label(frame, n_per_class, label_col='Label', random_state=42):
    """
    Resample ``frame`` so every ``label_col`` value has exactly ``n_per_class`` rows.
    """
    balanced_parts = []
    for label in frame[label_col].unique():
        class_rows = frame[frame[label_col] == label]
        # Sample with replacement only when the class cannot supply
        # n_per_class distinct rows; a class of exactly that size is kept
        # whole instead of being bootstrapped.
        needs_oversampling = len(class_rows) < n_per_class
        balanced_parts.append(
            resample(
                class_rows,
                n_samples=n_per_class,
                random_state=random_state,
                replace=needs_oversampling,
            )
        )
    return pd.concat(balanced_parts)


def _fill_non_finite(features):
    """
    Turn +/-inf into NaN, then fill each column's NaNs with that column's median.
    """
    features = features.replace([np.inf, -np.inf], np.nan)

    nan_count = features.isna().sum().sum()
    if nan_count > 0:
        print(f"Found {nan_count} NaN values after replacing infinities.")
        for col in features.columns:
            if features[col].isna().any():
                # Assign the filled column back: `X[col].fillna(..., inplace=True)`
                # is chained assignment and does not update the frame under
                # pandas Copy-on-Write.
                features[col] = features[col].fillna(features[col].median())
    return features


def _clip_outliers(features, lower_q=0.01, upper_q=0.99, spread_factor=3):
    """
    Clip each column to [q_low - k*spread, q_high + k*spread], spread = q_high - q_low.
    """
    clipped = features.copy()
    for col in clipped.columns:
        low = clipped[col].quantile(lower_q)
        high = clipped[col].quantile(upper_q)
        spread = high - low
        clipped[col] = clipped[col].clip(low - spread_factor * spread,
                                         high + spread_factor * spread)
    return clipped


def preprocess_data(df, balanced_sample_size=20000):
    """
    Preprocess the data for model training.

    Steps: drop rows with missing values, label-encode 'Label', resample every
    class to ``balanced_sample_size`` rows, drop 'Timestamp', 'Label' and all
    non-numeric columns from the features, replace infinities/NaNs with column
    medians, clip extreme values, min-max scale, and make a stratified 80/20
    train/test split.

    Args:
        df: Raw dataset containing a 'Label' column.
        balanced_sample_size: Number of rows kept (or oversampled to) per class.

    Returns:
        dict with keys 'X_train', 'X_test', 'y_train', 'y_test',
        'X_train_cnn', 'X_test_cnn' (features reshaped to (n, n_features, 1)),
        'y_train_onehot', 'y_test_onehot', 'unique_labels', 'feature_names'
        and 'label_encoder'.

    Raises:
        ValueError: If no rows remain after dropping missing values.

    Side effects:
        Writes 'label_encoder.pkl', 'feature_scaler.pkl' and
        'feature_names.pkl' to the working directory and shows a count plot.
    """
    print("\n" + "="*50)
    print("DATA PREPROCESSING")
    print("="*50)

    print("\nHandling missing values...")
    df_cleaned = df.dropna()
    print(f"Shape after dropping missing values: {df_cleaned.shape}")
    if df_cleaned.empty:
        raise ValueError("No rows left after dropping missing values; nothing to preprocess.")

    print("\nEncoding labels...")
    label_encoder = LabelEncoder()
    # assign() returns a new frame, so the encoded column is never written
    # into a possible view of the caller's DataFrame.
    df_cleaned = df_cleaned.assign(Label=label_encoder.fit_transform(df_cleaned['Label']))
    unique_labels = df_cleaned['Label'].unique()
    print(f"Unique encoded labels: {unique_labels}")
    # LabelEncoder assigns codes by position in classes_.
    label_mapping = {name: code for code, name in enumerate(label_encoder.classes_.tolist())}
    print(f"Label mapping: {label_mapping}")

    joblib.dump(label_encoder, 'label_encoder.pkl')
    print("Label encoder saved to 'label_encoder.pkl'")

    print("\nBalancing the dataset...")
    # NOTE(review): classes are oversampled (and the scaler below is fitted)
    # before the train/test split, so duplicated minority rows can land in
    # both sets and test metrics will be optimistic. Splitting first would fix
    # this but changes the composition of the returned test set.
    balanced_df = _balance_by_label(df_cleaned, balanced_sample_size)
    print(f"Shape of balanced dataset: {balanced_df.shape}")

    plt.figure(figsize=(10, 6))
    sns.countplot(x='Label', data=balanced_df)
    plt.title('Distribution After Balancing')
    plt.xlabel('Attack Type (Encoded)')
    plt.ylabel('Count')
    plt.show()

    print("\nRemoving non-feature columns...")
    # The scaler needs numeric input, so drop every non-numeric column
    # (object, string, datetime, ...), not only dtype 'object'.
    non_numeric_cols = [
        col for col in balanced_df.columns
        if col != 'Label' and not pd.api.types.is_numeric_dtype(balanced_df[col])
    ]
    columns_to_drop = ['Timestamp', 'Label', *non_numeric_cols]

    y = balanced_df['Label']
    X = balanced_df.drop(columns=columns_to_drop, errors='ignore')
    print(f"Features shape after dropping non-feature columns: {X.shape}")

    print("\nChecking for infinity or extremely large values...")
    X = _fill_non_finite(X)

    print("\nClipping extremely large values...")
    X = _clip_outliers(X)

    print("\nNormalizing features...")
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    joblib.dump(scaler, 'feature_scaler.pkl')
    print("Feature scaler saved to 'feature_scaler.pkl'")

    y_onehot = to_categorical(y, num_classes=len(unique_labels))

    print("\nSplitting data into training and testing sets...")
    # One call splits all three arrays with the same indices, so the integer
    # and one-hot targets stay aligned with the features.
    (X_train, X_test,
     y_train, y_test,
     y_train_onehot, y_test_onehot) = train_test_split(
        X_scaled, y, y_onehot, test_size=0.2, random_state=42, stratify=y
    )

    # Conv1D expects a trailing channel axis: (samples, features, 1).
    X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    print(f"Training set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")
    print(f"CNN training set shape: {X_train_cnn.shape}")

    feature_names = X.columns.tolist()
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)
    print("Feature names saved to 'feature_names.pkl'")

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'X_train_cnn': X_train_cnn,
        'X_test_cnn': X_test_cnn,
        'y_train_onehot': y_train_onehot,
        'y_test_onehot': y_test_onehot,
        'unique_labels': unique_labels,
        'feature_names': feature_names,
        'label_encoder': label_encoder
    }



def build_rf_model(data_dict):
    """
    Fit a Random Forest on the prepared split and report its test metrics.

    Reads 'X_train', 'y_train', 'X_test', 'y_test' and 'feature_names' from
    ``data_dict``. Prints accuracy, the confusion matrix and the
    classification report, plots the confusion matrix and the top-20 feature
    importances, and saves the model to 'random_forest_model.pkl'.

    Returns:
        The fitted RandomForestClassifier.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("RANDOM FOREST MODEL TRAINING")
    print(divider)

    X_train, y_train = data_dict['X_train'], data_dict['y_train']
    X_test, y_test = data_dict['X_test'], data_dict['y_test']
    feature_names = data_dict['feature_names']

    print("\nTraining Random Forest model...")
    fit_started = time.time()

    forest_params = {
        'n_estimators': 100,
        'max_depth': 20,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'random_state': 42,
        'n_jobs': -1,
    }
    rf_model = RandomForestClassifier(**forest_params)
    rf_model.fit(X_train, y_train)

    fit_seconds = time.time() - fit_started
    print(f"Model trained in {fit_seconds:.2f} seconds")

    print("\nEvaluating model on test data...")
    predicted = rf_model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    conf_matrix = confusion_matrix(y_test, predicted)
    report = classification_report(y_test, predicted)

    print(f"Accuracy: {accuracy:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(report)

    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix - Random Forest')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

    if len(feature_names) > 0:
        importances = rf_model.feature_importances_
        # Feature indices ordered from most to least important.
        ranked = np.argsort(importances)[::-1]
        top_ranked = ranked[:20]
        bar_positions = range(min(20, len(feature_names)))
        tick_labels = [feature_names[idx] for idx in top_ranked]

        plt.figure(figsize=(12, 8))
        plt.title('Feature Importances - Random Forest')
        plt.bar(bar_positions, importances[ranked][:20], align='center')
        plt.xticks(bar_positions, tick_labels, rotation=90)
        plt.tight_layout()
        plt.show()

    joblib.dump(rf_model, 'random_forest_model.pkl')
    print("Random Forest model saved to 'random_forest_model.pkl'")

    return rf_model


def build_cnn_model(data_dict, validation_split=0.1):
    """
    Build and train a 1D CNN classifier.

    Reads 'X_train_cnn', 'y_train_onehot', 'X_test_cnn' and 'y_test_onehot'
    from ``data_dict``. The network is three Conv1D/BatchNorm/MaxPool stages
    (64, 128, 256 filters) followed by Dense(128) -> Dropout -> Dense(64) ->
    softmax.

    Validation data for early stopping and checkpointing is held out from the
    training set via ``validation_split``; the test set is only used for the
    final evaluation, so the printed test accuracy is not influenced by model
    selection.

    Args:
        data_dict: Dictionary with the keys listed above.
        validation_split: Fraction of the training data (taken from its end)
            that Keras holds out for validation.

    Returns:
        (model, history): the trained Keras model and the History object
        returned by ``fit``.

    Side effects:
        Appends to 'cnn_training_log.csv', writes 'best_cnn_model.h5' and
        'cnn_model.h5', and shows the training curves.
    """
    print("\n" + "="*50)
    print("CNN MODEL TRAINING")
    print("="*50)

    X_train_cnn = data_dict['X_train_cnn']
    y_train_onehot = data_dict['y_train_onehot']
    X_test_cnn = data_dict['X_test_cnn']
    y_test_onehot = data_dict['y_test_onehot']

    # Derive the input shape from the data instead of hard-coding one channel.
    n_timesteps = X_train_cnn.shape[1]
    n_features = X_train_cnn.shape[2]
    n_outputs = y_train_onehot.shape[1]

    print(f"Input shape: ({n_timesteps}, {n_features})")
    print(f"Output shape: {n_outputs}")

    print("\nBuilding CNN model...")
    model = Sequential()
    # Explicit Input object instead of the deprecated `input_shape=` layer argument.
    model.add(tf.keras.Input(shape=(n_timesteps, n_features)))

    for n_filters in (64, 128, 256):
        model.add(Conv1D(filters=n_filters, kernel_size=6, activation='relu', padding='same'))
        model.add(BatchNormalization())
        model.add(MaxPooling1D(pool_size=3, strides=2, padding='same'))

    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    model.summary()

    csv_logger = CSVLogger('cnn_training_log.csv', append=True)
    checkpoint = ModelCheckpoint(
        'best_cnn_model.h5',
        monitor='val_accuracy',
        save_best_only=True,
        mode='max',
        verbose=1
    )
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1
    )

    print("\nTraining CNN model...")
    start_time = time.time()

    # Validate on a slice of the training data. Passing the test set as
    # validation_data would let early stopping and checkpoint selection tune
    # on the same samples that are later reported as test accuracy.
    history = model.fit(
        X_train_cnn, y_train_onehot,
        epochs=30,
        batch_size=32,
        validation_split=validation_split,
        callbacks=[csv_logger, checkpoint, early_stopping],
        verbose=2
    )

    training_time = time.time() - start_time
    print(f"Model trained in {training_time:.2f} seconds")

    print("\nEvaluating CNN model on test data...")
    scores = model.evaluate(X_test_cnn, y_test_onehot, verbose=0)
    print(f"Test Accuracy: {scores[1]:.4f}")

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy curves.
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    # Right panel: loss curves.
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()

    model.save('cnn_model.h5')
    print("CNN model saved to 'cnn_model.h5'")

    return model, history



def _build_probability_figure(labels, rf_probabilities, cnn_probabilities):
    """
    Build the side-by-side bar chart comparing both models' class probabilities.
    """
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Random Forest Probabilities", "CNN Probabilities"))

    model_traces = (
        ('Random Forest', rf_probabilities, 'blue'),
        ('CNN', cnn_probabilities, 'red'),
    )
    for col, (name, probabilities, color) in enumerate(model_traces, start=1):
        fig.add_trace(
            go.Bar(x=labels, y=probabilities, marker_color=color, name=name),
            row=1, col=col
        )

    fig.update_layout(
        title_text="Prediction Probabilities Comparison",
        height=500,
        width=1000,
        showlegend=True
    )
    return fig


def create_interactive_predictor(rf_model, cnn_model, data_dict):
    """
    Create an interactive interface for predicting individual samples.

    Loads the fitted scaler from 'feature_scaler.pkl', runs a demo prediction
    on one random test sample, and displays an ipywidgets form (one FloatText
    per feature, five per tab) whose values are classified by both models when
    'Predict' is clicked.

    Args:
        rf_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        cnn_model: Trained Keras model taking input shaped (1, n_features, 1).
        data_dict: Dictionary providing 'X_test' (scaled features),
            'y_test' (encoded labels), 'feature_names' and 'label_encoder'.

    Returns:
        ``predict_sample(sample_data)``: takes one sample in raw (unscaled)
        feature units and returns a dict with both models' predicted label,
        confidence (percent) and full probability vectors.

    Raises:
        ValueError: If the test set is empty.
    """
    print("\n" + "="*50)
    print("INTERACTIVE PREDICTION INTERFACE")
    print("="*50)

    scaler = joblib.load('feature_scaler.pkl')
    label_encoder = data_dict['label_encoder']
    feature_names = data_dict['feature_names']
    labels = label_encoder.classes_

    # Work on plain arrays so rows are always addressed by position; this
    # accepts ndarray, DataFrame or Series inputs alike.
    X_test_scaled = np.asarray(data_dict['X_test'])
    y_test_codes = np.asarray(data_dict['y_test'])

    def predict_sample(sample_data):
        """Classify one sample given in raw (unscaled) feature units."""
        sample_array = np.asarray(sample_data).reshape(1, -1)
        sample_scaled = scaler.transform(sample_array)

        rf_pred = rf_model.predict(sample_scaled)[0]
        rf_prob = rf_model.predict_proba(sample_scaled)[0]

        # The CNN expects a trailing channel axis.
        sample_cnn = sample_scaled.reshape(1, sample_scaled.shape[1], 1)
        cnn_prob = cnn_model.predict(sample_cnn, verbose=0)[0]
        cnn_pred = np.argmax(cnn_prob)

        rf_label = label_encoder.inverse_transform([rf_pred])[0]
        cnn_label = label_encoder.inverse_transform([cnn_pred])[0]

        return {
            'rf_prediction': rf_label,
            'rf_confidence': np.max(rf_prob) * 100,
            'cnn_prediction': cnn_label,
            'cnn_confidence': np.max(cnn_prob) * 100,
            'rf_probabilities': rf_prob,
            'cnn_probabilities': cnn_prob
        }

    n_samples = min(len(X_test_scaled), len(y_test_codes))
    if n_samples == 0:
        raise ValueError("Test set is empty; cannot build the prediction demo.")
    sample_idx = np.random.randint(0, n_samples)

    # X_test is already min-max scaled, while predict_sample() scales its
    # input. Map the row back to raw feature units first; otherwise the demo
    # sample is scaled twice and the widget defaults are not in the units the
    # user is expected to type.
    sample = scaler.inverse_transform(X_test_scaled[sample_idx].reshape(1, -1))[0]
    true_label = label_encoder.inverse_transform([y_test_codes[sample_idx]])[0]

    prediction_results = predict_sample(sample)

    print("\nSample Prediction Demo:")
    print(f"True label: {true_label}")
    print(f"Random Forest prediction: {prediction_results['rf_prediction']} with {prediction_results['rf_confidence']:.2f}% confidence")
    print(f"CNN prediction: {prediction_results['cnn_prediction']} with {prediction_results['cnn_confidence']:.2f}% confidence")

    _build_probability_figure(
        labels,
        prediction_results['rf_probabilities'],
        prediction_results['cnn_probabilities'],
    ).show()

    print("\nEnter custom sample features for prediction:")

    # One numeric input per feature, pre-filled with the demo sample's raw values.
    feature_widgets = {}
    for i, feature in enumerate(feature_names):
        feature_val = float(sample[i]) if i < len(sample) else 0.0
        feature_widgets[feature] = widgets.FloatText(
            value=feature_val,
            description=f"{feature}:",
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='80%')
        )

    output_text = widgets.Output()
    output_graph = widgets.Output()

    def on_predict_button_clicked(b):
        """Classify the current widget values and refresh text and chart outputs."""
        with output_text:
            clear_output()

            custom_sample = [feature_widgets[feature].value for feature in feature_names]
            results = predict_sample(custom_sample)

            print(f"Random Forest prediction: {results['rf_prediction']} with {results['rf_confidence']:.2f}% confidence")
            print(f"CNN prediction: {results['cnn_prediction']} with {results['cnn_confidence']:.2f}% confidence")

        with output_graph:
            clear_output()
            _build_probability_figure(
                labels,
                results['rf_probabilities'],
                results['cnn_probabilities'],
            ).show()

    predict_button = widgets.Button(
        description='Predict',
        button_style='success',
        tooltip='Click to predict'
    )
    predict_button.on_click(on_predict_button_clicked)

    # Group the inputs five per tab so the form stays navigable.
    chunk_size = 5
    feature_chunks = [
        feature_names[start:start + chunk_size]
        for start in range(0, len(feature_names), chunk_size)
    ]

    tabs = widgets.Tab()
    tabs.children = [
        widgets.VBox([feature_widgets[f] for f in chunk])
        for chunk in feature_chunks
    ]
    for tab_idx, chunk in enumerate(feature_chunks):
        first = tab_idx * chunk_size + 1
        tabs.set_title(tab_idx, f'Features {first}-{first + len(chunk) - 1}')

    layout = widgets.VBox([
        widgets.HTML("<h3>Insider Threat Detection - Interactive Predictor</h3>"),
        widgets.HTML("<p>Adjust feature values and click Predict to see the model results</p>"),
        tabs,
        predict_button,
        widgets.HTML("<h4>Prediction Results:</h4>"),
        output_text,
        output_graph
    ])

    display(layout)

    # Populate the outputs once so the form is not empty before the first click.
    on_predict_button_clicked(None)

    return predict_sample


def run_insider_threat_detection(file_path=None):
    """
    Main function to run the entire pipeline.

    Loads the CSV, runs the exploratory analysis, preprocesses the data,
    trains the Random Forest and CNN models, and opens the interactive
    predictor.

    Args:
        file_path: Path of the CICIDS 2018 CSV to load. When None, the
            original hard-coded location is used.

    Returns:
        dict with 'rf_model', 'cnn_model' and 'data_dict', or None when the
        dataset could not be loaded.
    """
    print("="*70)
    print("AI-POWERED INSIDER THREAT DETECTION SYSTEM")
    print("Based on CICIDS 2018 Dataset")
    print("="*70)

    if file_path is None:
        # Historical default; machine-specific, so callers should normally
        # pass their own path.
        file_path = '/home/heheboi/Desktop/IS Project/IDS Intrusion Dataset/02-14-2018.csv'
    df = load_data(file_path)

    if df is None:
        print("Error loading data. Please check the file path.")
        return

    perform_eda(df)

    data_dict = preprocess_data(df)

    rf_model = build_rf_model(data_dict)

    # The training history is not needed here; only the model is kept.
    cnn_model, _history = build_cnn_model(data_dict)

    create_interactive_predictor(rf_model, cnn_model, data_dict)

    print("\n" + "="*50)
    print("PROJECT COMPLETE")
    print("="*50)

    return {
        'rf_model': rf_model,
        'cnn_model': cnn_model,
        'data_dict': data_dict
    }


# Run the full pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    run_insider_threat_detection()