# Import Libraries

In [None]:
import pandas as pd
import ipaddress
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier

# Model Building (Pickle)

In [None]:
# Custom transformer to convert IP addresses to numeric form
class IpToNumericTransformer(BaseEstimator, TransformerMixin):
    """Replace dotted-quad IPv4 strings with their integer value.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns in ``X`` whose values are parsed with
        ``ipaddress.IPv4Address`` and converted with ``int``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    @staticmethod
    def _address_to_int(address):
        """Parse one IPv4 address and return it as an integer."""
        return int(ipaddress.IPv4Address(address))

    def transform(self, X):
        """Return a copy of ``X`` with every configured column converted.

        The input is copied first, so the caller's object is not modified.
        A value that ``ipaddress.IPv4Address`` rejects raises its exception.
        """
        converted = X.copy()
        for name in self.columns:
            converted[name] = converted[name].apply(self._address_to_int)
        return converted

# Custom transformer to calculate 'duration' from the timestamp columns
class DurationTransformer(BaseEstimator, TransformerMixin):
    """Replace two timestamp columns with the elapsed time between them.

    Parameters
    ----------
    start_col : str
        Column holding the start timestamp.
    end_col : str
        Column holding the end timestamp.
    new_col : str
        Name of the column that receives ``end - start`` in seconds.
    verbose : bool, default=False
        When True, print the converted timestamps and the head of the
        transformed frame. These diagnostics used to be printed on every
        call, including every prediction made through a fitted pipeline.
    """

    def __init__(self, start_col, end_col, new_col, verbose=False):
        self.start_col = start_col
        self.end_col = end_col
        self.new_col = new_col
        self.verbose = verbose

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the duration column added.

        Both timestamp columns are parsed with ``pd.to_datetime`` using
        ``errors='coerce'``, so values that cannot be parsed become ``NaT``
        and the corresponding duration is ``NaN``. The two timestamp columns
        are removed from the result and ``new_col`` is written afterwards.
        """
        X = X.copy()

        # Instances unpickled from before ``verbose`` existed lack the
        # attribute; treat them as quiet instead of raising AttributeError.
        verbose = getattr(self, 'verbose', False)

        # Convert to datetime format
        X[self.start_col] = pd.to_datetime(X[self.start_col], errors='coerce')
        X[self.end_col] = pd.to_datetime(X[self.end_col], errors='coerce')

        if verbose:
            print("Timestamps after conversion:")
            print(X[[self.start_col, self.end_col]].head())

        # Calculate the duration in seconds
        duration = (X[self.end_col] - X[self.start_col]).dt.total_seconds()

        # Drop the timestamp columns BEFORE storing the result, so the
        # duration survives even when new_col reuses one of their names.
        X = X.drop(columns=[self.start_col, self.end_col])
        X[self.new_col] = duration

        if verbose:
            print("Data after duration calculation:")
            print(X.head())

        return X
        
# Read the raw data and discard the columns that are not used as features
df = pd.read_csv("dataset-final.csv").drop(columns=["Unnamed: 0", "@timestamp"])

# The "label" column is the target; every remaining column is a feature
y = df["label"]
X = df.drop(columns=["label"])

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
# X_test / y_test are not used further in this section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column names handled by the custom transformers
src_ip, dst_ip = "netflow.ipv4_src_addr", "netflow.ipv4_dst_addr"
first_switched, last_switched = "netflow.first_switched", "netflow.last_switched"

# IP address columns -> integers
ip_step = (
    "ip_to_numeric",
    IpToNumericTransformer(columns=[src_ip, dst_ip]),
    [src_ip, dst_ip],
)

# Timestamp pair -> single "duration" column
duration_step = (
    "calculate_duration",
    DurationTransformer(
        start_col=first_switched,
        end_col=last_switched,
        new_col="duration",
    ),
    [first_switched, last_switched],
)

# Columns not claimed by a transformer are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[ip_step, duration_step],
    remainder="passthrough",
)

# Chain the preprocessing with the classifier so both are fitted and
# persisted together
classifier = XGBClassifier(n_estimators=10)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

# Train on the training split only
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline (preprocessing + model).
# NOTE: pickle stores the custom transformer classes by reference, so
# IpToNumericTransformer and DurationTransformer must be importable
# wherever 'pipeline.pkl' is loaded.
with open("pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("Pipeline saved successfully!")