# CICIDS2017 Dataset Exploration

This notebook provides comprehensive exploration and analysis of the CICIDS2017 dataset for the BERT-IDS research project.

## Objectives:
1. Load and examine the dataset structure
2. Analyze feature distributions and statistics
3. Explore attack types and class imbalance
4. Identify data quality issues
5. Visualize key patterns and relationships
6. Prepare insights for tokenization strategy

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
import os

# Plot styling: matplotlib theme first, then the seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so randomised steps repeat across runs
np.random.seed(42)

print("📚 Libraries imported successfully!")
for banner, module in (("📊 Pandas", pd), ("🔢 NumPy", np)):
    print(f"{banner} version: {module.__version__}")

## 1. Dataset Loading and Initial Inspection

In [None]:
# Define data paths (relative to the notebook's working directory)
DATA_DIR = Path('../data/raw/cicids2017')
PROCESSED_DIR = Path('../data/processed')

# Create processed directory if it doesn't exist
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# List available CSV files. Directory iteration order depends on the
# filesystem, so sort the paths to get the same listing on every machine.
csv_files = sorted(DATA_DIR.glob('*.csv'))
print(f"📁 Found {len(csv_files)} CSV files:")
for csv_path in csv_files:
    print(f"   - {csv_path.name}")

if not csv_files:
    # Nothing to analyse yet: tell the user where the raw files belong
    print("⚠️  No CSV files found!")
    print("📋 Please download CICIDS2017 dataset files to: data/raw/cicids2017/")
    print("🔗 Download from: https://www.unb.ca/cic/datasets/ids-2017.html")

In [None]:
# Function to load and combine all CSV files
def _read_csv_with_fallback(path, encodings=('utf-8', 'cp1252', 'latin-1')):
    """
    Read one CSV file, trying each encoding in turn.

    Args:
        path: Path to the CSV file
        encodings: Encodings to try, in order

    Returns:
        DataFrame parsed with the first encoding that decodes the file

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the file
        ValueError: If `encodings` is empty
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding, low_memory=False)
        except UnicodeDecodeError as err:
            # Wrong guess for this file: remember the error, try the next one
            last_error = err
    if last_error is None:
        raise ValueError("at least one encoding is required")
    raise last_error


def load_cicids2017_data(data_dir, sample_size=None):
    """
    Load CICIDS2017 dataset from multiple CSV files.

    Files are read in sorted name order so the combined frame, and therefore
    the seeded sample drawn from it, is the same on every machine. Files that
    cannot be read are reported and skipped.

    Args:
        data_dir: Directory containing CSV files (Path or str)
        sample_size: Number of samples to load (None for all)

    Returns:
        Combined DataFrame, or None if no file could be loaded
    """
    # Path() makes plain string paths work; sorted() removes the dependence on
    # filesystem iteration order.
    csv_files = sorted(Path(data_dir).glob('*.csv'))

    if not csv_files:
        print("❌ No CSV files found!")
        return None

    dataframes = []

    for file in csv_files:
        print(f"📖 Loading {file.name}...")
        try:
            frame = _read_csv_with_fallback(file)
        except (OSError, ValueError) as e:
            # pandas parser/empty-file errors and decode errors are all
            # ValueError subclasses; skip the file but keep loading the rest.
            print(f"   ❌ Error loading {file.name}: {e}")
            continue

        # Clean column names (trim padding, replace inner spaces)
        frame.columns = frame.columns.str.strip().str.replace(' ', '_')

        print(f"   ✅ Loaded {len(frame):,} rows, {len(frame.columns)} columns")
        dataframes.append(frame)

    if not dataframes:
        return None

    # Combine all dataframes
    print(f"🔄 Combining {len(dataframes)} files...")
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Sample if requested (a falsy sample_size means "keep everything")
    if sample_size and len(combined_df) > sample_size:
        print(f"🎲 Sampling {sample_size:,} rows from {len(combined_df):,} total rows")
        combined_df = combined_df.sample(n=sample_size, random_state=42)

    print(f"✅ Final dataset: {len(combined_df):,} rows, {len(combined_df.columns)} columns")
    return combined_df

# Load dataset (sample for initial exploration)
# For full analysis, set sample_size=None
df = load_cicids2017_data(DATA_DIR, sample_size=100000)  # Sample 100k rows for faster exploration

In [None]:
# Print a quick overview of the loaded frame: size, memory, dtypes, columns
if df is not None:
    column_total = len(df.columns)
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    dtype_breakdown = df.dtypes.value_counts().to_dict()

    print("📊 Dataset Overview:")
    print(f"   Shape: {df.shape}")
    print(f"   Memory usage: {memory_mb:.2f} MB")
    print(f"   Data types: {dtype_breakdown}")

    # Preview of the first rows
    print("\n🔍 First 5 rows:")
    display(df.head())

    # Numbered list of every column name
    print(f"\n📋 Column names ({column_total} total):")
    for position, column_name in enumerate(df.columns, start=1):
        print(f"   {position:2d}. {column_name}")

## 2. Target Variable Analysis

In [None]:
# Identify label column (usually 'Label' or similar).
# `label_col` stays None when no data was loaded or no column matches, so the
# cells below can test it instead of failing.
label_col = None
label_columns = []
if df is not None:
    label_columns = [col for col in df.columns if 'label' in str(col).lower()]
print(f"🏷️  Potential label columns: {label_columns}")

if label_columns:
    label_col = label_columns[0]  # Use first label column
    print(f"📊 Using '{label_col}' as target variable")

    # Analyze class distribution. Percentages are derived from the counts so
    # the column is scanned only once.
    class_counts = df[label_col].value_counts()
    class_percentages = class_counts / class_counts.sum() * 100

    print(f"\n🎯 Class Distribution:")
    for class_name, count in class_counts.items():
        percentage = class_percentages[class_name]
        print(f"   {class_name:<25}: {count:>8,} ({percentage:>5.2f}%)")

    # Visualize class distribution
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Bar plot
    class_counts.plot(kind='bar', ax=ax1, color='skyblue')
    ax1.set_title('Class Distribution (Counts)')
    ax1.set_xlabel('Attack Type')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=45)

    # Pie chart
    ax2.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%', startangle=90)
    ax2.set_title('Class Distribution (Percentages)')

    plt.tight_layout()
    plt.show()

else:
    print("❌ No label column found!")

## 3. Feature Analysis

In [None]:
# Separate features from target.
# These defaults keep every name that later cells read defined, even when no
# data was loaded.
feature_cols = []
X = y = None
numeric_cols = pd.Index([])
inf_counts = {}
missing_features = pd.DataFrame()

if df is not None:
    # Without a label column every column is treated as a feature, so the
    # quality checks below still run.
    feature_cols = [col for col in df.columns if col != label_col]
    X = df[feature_cols]
    y = df[label_col] if label_col else None

    print(f"📊 Feature Analysis:")
    print(f"   Number of features: {len(feature_cols)}")
    print(f"   Numeric features: {X.select_dtypes(include=[np.number]).shape[1]}")
    print(f"   Categorical features: {X.select_dtypes(include=['object']).shape[1]}")

    # Data quality check
    print(f"\n🔍 Data Quality:")
    missing_data = X.isnull().sum()
    missing_percentage = (missing_data / len(X)) * 100

    quality_df = pd.DataFrame({
        'Missing_Count': missing_data,
        'Missing_Percentage': missing_percentage,
        'Data_Type': X.dtypes
    })

    # Show features with missing values
    missing_features = quality_df[quality_df['Missing_Count'] > 0]
    if len(missing_features) > 0:
        print(f"   Features with missing values: {len(missing_features)}")
        display(missing_features.head(10))
    else:
        print("   ✅ No missing values found!")

    # Check for infinite values (counted column by column to avoid building a
    # full-size boolean copy of the frame)
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        inf_count = int(np.isinf(X[col]).sum())
        if inf_count > 0:
            inf_counts[col] = inf_count

    if inf_counts:
        print(f"\n⚠️  Features with infinite values:")
        for col, count in inf_counts.items():
            print(f"   {col}: {count} infinite values")
    else:
        print("\n✅ No infinite values found!")

In [None]:
# Statistical summary of numeric features.
# Defined up front because the recommendations cell reads it unconditionally.
zero_var_features = []

if len(numeric_cols) > 0:
    # The slice below takes the first 10 columns in file order (no ranking),
    # so the heading says "First" rather than "Top".
    print("📈 Statistical Summary (First 10 Numeric Features):")
    stats_summary = X[numeric_cols].describe()
    display(stats_summary.iloc[:, :10])  # Show first 10 features

    # Identify features with zero variance using one vectorised pass.
    # Columns whose variance is NaN compare unequal to 0 and are not flagged.
    variances = X[numeric_cols].var()
    zero_var_features = variances[variances == 0].index.tolist()
    if zero_var_features:
        print(f"\n⚠️  Features with zero variance: {len(zero_var_features)}")
        print(f"   {zero_var_features[:10]}...")  # Show first 10
    else:
        print("\n✅ All numeric features have non-zero variance!")

## 4. Feature Distributions and Correlations

In [None]:
# Select top features for visualization (by variance).
# Defined up front so later cells can read it even without numeric columns.
top_features = []

if len(numeric_cols) > 0:
    # Rank numeric features by variance; sort_values places NaN variances last
    feature_variance = X[numeric_cols].var().sort_values(ascending=False)
    top_features = feature_variance.head(12).index.tolist()

    print(f"📊 Top 12 features by variance:")
    for i, feature in enumerate(top_features, 1):
        print(f"   {i:2d}. {feature:<30} (var: {feature_variance[feature]:.2e})")

    # Plot distributions of top features on a 3x4 grid
    fig, axes = plt.subplots(3, 4, figsize=(20, 15))
    axes = axes.ravel()

    for ax, feature in zip(axes, top_features):
        # Handle infinite values for plotting
        data = X[feature].replace([np.inf, -np.inf], np.nan).dropna()

        if data.empty:
            # Nothing finite to draw: hide the panel instead of leaving a frame
            ax.set_visible(False)
            continue

        ax.hist(data, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
        ax.set_title(f'{feature}\n(n={len(data):,})')
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

        # Mark the mean on the histogram
        mean_val = data.mean()
        ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.2f}')
        ax.legend()

    # Hide grid cells that have no feature assigned (fewer than 12 features)
    for ax in axes[len(top_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation among the highest-variance features
if len(numeric_cols) > 0:
    print("🔗 Correlation Analysis:")

    # Restrict to ten features so the matrix stays cheap and readable
    correlation_features = top_features[:10]  # Top 10 features
    corr_matrix = X[correlation_features].corr()

    # Heatmap showing only the lower triangle (the matrix is symmetric)
    plt.figure(figsize=(12, 10))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Mask upper triangle
    sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .8},
    )
    plt.title('Feature Correlation Matrix (Top 10 Features)')
    plt.tight_layout()
    plt.show()

    # Collect every pair above the diagonal whose |r| exceeds 0.8,
    # visiting pairs row by row
    corr_names = corr_matrix.columns
    upper_rows, upper_cols = np.triu_indices(len(corr_names), k=1)
    high_corr_pairs = [
        (corr_names[r], corr_names[c], corr_matrix.iloc[r, c])
        for r, c in zip(upper_rows, upper_cols)
        if abs(corr_matrix.iloc[r, c]) > 0.8  # High correlation threshold
    ]

    if not high_corr_pairs:
        print("\n✅ No highly correlated feature pairs found!")
    else:
        print(f"\n⚠️  Highly correlated feature pairs (|r| > 0.8):")
        for left, right, strength in high_corr_pairs:
            print(f"   {left} <-> {right}: {strength:.3f}")

## 5. Attack Type Analysis

In [None]:
# Analyze feature distributions by attack type
if label_col and len(numeric_cols) > 0:
    print("🎯 Feature Analysis by Attack Type:")

    # Select a few key features for analysis
    key_features = top_features[:6]  # Top 6 features

    # Create box plots for each feature by attack type on a 2x3 grid
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()

    for ax, feature in zip(axes, key_features):
        # Prepare data (handle infinite values)
        plot_data = df[[feature, label_col]].copy()
        plot_data[feature] = plot_data[feature].replace([np.inf, -np.inf], np.nan)
        plot_data = plot_data.dropna()

        if plot_data.empty:
            # Nothing to draw for this feature: hide the panel
            ax.set_visible(False)
            continue

        sns.boxplot(data=plot_data, x=label_col, y=feature, ax=ax)
        ax.set_title(f'{feature} by Attack Type')
        ax.tick_params(axis='x', rotation=45)

        # Limit y-axis for better visualization (remove extreme outliers).
        # When the IQR is 0 both bounds would be identical and the axis would
        # collapse, so the default limits are kept in that case.
        q1 = plot_data[feature].quantile(0.25)
        q3 = plot_data[feature].quantile(0.75)
        iqr = q3 - q1
        if iqr > 0:
            ax.set_ylim(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    # Hide grid cells that have no feature assigned (fewer than 6 features)
    for ax in axes[len(key_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Statistical comparison between normal and attack traffic.
# Guarded like the box-plot cell because it relies on `top_features`.
if label_col and len(numeric_cols) > 0:
    print("📊 Normal vs Attack Traffic Comparison:")

    # Build the "normal" mask once: exact 'BENIGN' match when present,
    # otherwise a case-insensitive search for BENIGN/Normal.
    labels = df[label_col]
    is_normal = labels == 'BENIGN'
    if not is_normal.any():
        is_normal = labels.str.contains('BENIGN|Normal', case=False, na=False)

    normal_traffic = df[is_normal]
    attack_traffic = df[~is_normal]

    print(f"   Normal traffic samples: {len(normal_traffic):,}")
    print(f"   Attack traffic samples: {len(attack_traffic):,}")

    if len(normal_traffic) > 0 and len(attack_traffic) > 0:
        # Compare key features
        comparison_features = top_features[:5]  # Top 5 features
        infinities = [np.inf, -np.inf]

        comparison_stats = []
        for feature in comparison_features:
            # Treat infinities as missing, as the plotting cells do, so one
            # infinite value does not turn the mean/std into inf or NaN
            normal_values = normal_traffic[feature].replace(infinities, np.nan)
            attack_values = attack_traffic[feature].replace(infinities, np.nan)

            normal_mean = normal_values.mean()
            attack_mean = attack_values.mean()

            # IEEE division: x/0 gives a signed inf, 0/0 gives NaN (undefined)
            with np.errstate(divide='ignore', invalid='ignore'):
                mean_ratio = np.float64(attack_mean) / np.float64(normal_mean)

            comparison_stats.append({
                'Feature': feature,
                'Normal_Mean': normal_mean,
                'Attack_Mean': attack_mean,
                'Normal_Std': normal_values.std(),
                'Attack_Std': attack_values.std(),
                'Mean_Ratio': mean_ratio
            })

        comparison_df = pd.DataFrame(comparison_stats)
        display(comparison_df)

## 6. Data Preprocessing Insights

In [None]:
# Summarise the findings above as preprocessing recommendations
print("🔧 Data Preprocessing Recommendations:")
print("\n1. 📊 Feature Engineering:")

# Object-typed columns are the ones that would need encoding
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
if not categorical_features:
    print("   ✅ No categorical features found")
else:
    print(f"   - Encode categorical features: {categorical_features}")

# Flag categorical columns with more than 50 distinct values
cardinalities = [(name, X[name].nunique()) for name in categorical_features]
high_cardinality_features = [
    (name, distinct) for name, distinct in cardinalities if distinct > 50
]

if high_cardinality_features:
    print(f"   - High cardinality features (consider target encoding):")
    for name, distinct in high_cardinality_features:
        print(f"     * {name}: {distinct} unique values")

# Cleaning actions depend on what the quality checks found earlier
print("\n2. 🧹 Data Cleaning:")
if inf_counts:
    print(f"   - Handle infinite values in: {list(inf_counts.keys())}")
if len(missing_features) > 0:
    print(f"   - Handle missing values in: {missing_features.index.tolist()}")
if zero_var_features:
    print(f"   - Remove zero variance features: {len(zero_var_features)} features")

# Ratio of the most frequent class to the least frequent one
print("\n3. ⚖️ Class Imbalance:")
if label_col:
    class_imbalance_ratio = class_counts.max() / class_counts.min()
    print(f"   - Imbalance ratio: {class_imbalance_ratio:.2f}:1")
    if class_imbalance_ratio > 10:
        imbalance_advice = "   - Consider: SMOTE, class weights, or stratified sampling"
    else:
        imbalance_advice = "   ✅ Relatively balanced dataset"
    print(imbalance_advice)

# Fixed guidance that does not depend on the data
static_sections = (
    ("\n4. 🔤 Tokenization Strategy for BERT:", (
        "   - Convert numeric features to text representation",
        "   - Consider binning continuous features",
        "   - Create flow-based sequences",
        "   - Normalize feature values before tokenization",
    )),
    ("\n5. 📏 Scaling:", (
        "   - StandardScaler for features with normal distribution",
        "   - RobustScaler for features with outliers",
        "   - MinMaxScaler for bounded features",
    )),
)
for heading, bullet_lines in static_sections:
    print(heading)
    for bullet in bullet_lines:
        print(bullet)

## 7. Export Processed Data Sample

In [None]:
# Write the cleaned sample and a JSON summary of the analysis to disk
if df is not None:
    import json

    print("💾 Saving processed data sample...")

    # Work on a copy so the in-memory frame is left untouched
    sample_df = df.copy()

    # Basic cleaning: infinities in numeric columns become NaN
    infinite_markers = [np.inf, -np.inf]
    for numeric_name in numeric_cols:
        sample_df[numeric_name] = sample_df[numeric_name].replace(infinite_markers, np.nan)

    # Save sample
    sample_path = PROCESSED_DIR / 'cicids2017_sample.csv'
    sample_df.to_csv(sample_path, index=False)
    print(f"✅ Saved sample to: {sample_path}")

    # Collect the key findings from the earlier cells
    class_distribution = class_counts.to_dict() if label_col else None
    feature_info = dict(
        total_features=len(feature_cols),
        numeric_features=len(numeric_cols),
        categorical_features=len(categorical_features),
        top_features=top_features,
        zero_variance_features=zero_var_features,
        high_correlation_pairs=high_corr_pairs,
        class_distribution=class_distribution,
    )

    # default=str turns any value json cannot encode natively into its string form
    info_path = PROCESSED_DIR / 'feature_analysis.json'
    with info_path.open('w') as handle:
        json.dump(feature_info, handle, indent=2, default=str)
    print(f"✅ Saved feature analysis to: {info_path}")

# Closing message and follow-up work
print("\n🎉 Data exploration completed!")
print("\n📋 Next Steps:")
next_steps = (
    "Download full CICIDS2017 dataset",
    "Implement data preprocessing pipeline",
    "Develop tokenization strategy for BERT",
    "Train baseline models",
    "Implement BERT-IDS architecture",
)
for step_number, step_text in enumerate(next_steps, start=1):
    print(f"   {step_number}. {step_text}")