In [1]:

import pandas as pd
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def load_module(module_name, path):
    """Dynamically load a Python module from an explicit file path.

    Args:
        module_name: Name assigned to the loaded module.
        path: Filesystem path of the source file to execute.

    Returns:
        The module object, after its top-level code has been executed.

    Raises:
        ImportError: If the import system cannot build a spec or loader for
            ``path`` (for example, the file suffix is not a recognised
            Python module suffix).
        Exception: Whatever executing the module raises is propagated
            unchanged (including the loader's error when the file is missing).
    """
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, path)
    # spec_from_file_location returns None when no loader matches the path;
    # fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module {module_name!r} from '{path}'",
            name=module_name,
        )

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# Locate scripts/model.py one level above the current working directory
model_path = os.path.join(os.getcwd(), '..', 'scripts', 'model.py')

# Execute that file as a module named "model" and pull out both model classes
model_module = load_module("model", model_path)
RandomForestModel, NeuralNetworkModel = (
    getattr(model_module, class_name)
    for class_name in ("RandomForestModel", "NeuralNetworkModel")
)

# Absolute location of the raw log file (../data/logs.csv from the working directory)
csv_file_path = os.path.abspath(os.path.join('..', 'data', 'logs.csv'))

# Fail fast before parsing: the path must be an existing, non-empty regular file
if not os.path.isfile(csv_file_path):
    raise FileNotFoundError(f"No such file: '{csv_file_path}'")
if not os.path.getsize(csv_file_path):
    raise ValueError(f"The CSV file is empty: '{csv_file_path}'")

# Parse the CSV; any failure is reported on stdout and then re-raised unchanged
try:
    data = pd.read_csv(csv_file_path)
    if data.empty:
        raise ValueError("The CSV file is empty.")
except Exception as load_error:
    print(f"Error loading data: {load_error}")
    raise

# The target column must exist before features and labels can be separated
if 'anomaly' not in data.columns:
    raise ValueError("The 'anomaly' column is missing from the data.")

# Labels are the 'anomaly' column; features are every other column
y = data['anomaly']
X = data.drop(columns='anomaly')

# Replace the raw 'Timestamp' column (when present) with its calendar components
if 'Timestamp' in X.columns:
    timestamps = pd.to_datetime(X['Timestamp'])
    X = X.drop(columns='Timestamp').assign(
        Year=timestamps.dt.year,
        Month=timestamps.dt.month,
        Day=timestamps.dt.day,
        Hour=timestamps.dt.hour,
        Minute=timestamps.dt.minute,
    )

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Identify numeric columns of any width. Listing only int64/float64 misses the
# derived Year/Month/Day/Hour/Minute columns on pandas >= 2.0, where the .dt
# accessors return int32; unmatched columns are silently dropped by
# ColumnTransformer's default remainder='drop'.
numeric_cols = X.select_dtypes(include='number').columns.tolist()

# Create a pipeline for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        # handle_unknown='ignore': a category that appears only in the test
        # split is encoded as all zeros instead of raising during transform().
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    # Always return a dense array: the result is later wrapped in
    # pd.DataFrame(...) for CSV export, which does not expand a SciPy sparse
    # matrix into columns. NOTE: this can be memory-hungry if the categorical
    # columns have very high cardinality.
    sparse_threshold=0,
)

# Split the data (fixed seed so the 80/20 split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training split only, then apply the same fitted transform to the
# test split so no test-set statistics leak into scaling/encoding
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Persist the four train/test pieces as CSV files under ../data/processed
os.makedirs('../data/processed', exist_ok=True)  # no error if the folder already exists

# Each entry: (pandas wrapper, values to wrap, destination file); written in order
for build, values, target_csv in (
    (pd.DataFrame, X_train, '../data/processed/X_train.csv'),
    (pd.DataFrame, X_test, '../data/processed/X_test.csv'),
    (pd.Series, y_train, '../data/processed/y_train.csv'),
    (pd.Series, y_test, '../data/processed/y_test.csv'),
):
    build(values).to_csv(target_csv, index=False)

print("Data preprocessing complete and files saved successfully.")

Data preprocessing complete and files saved successfully.


In [2]:
import os
import sys  # imported here so this cell does not rely on an earlier cell's import

print(os.getcwd())

# Make the sibling ../scripts directory importable. The previous join order
# (cwd, 'scripts', '..') normalises back to cwd itself, so the scripts folder
# was never actually added to sys.path.
scripts_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'scripts'))
if scripts_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(scripts_dir)


c:\Users\Nicholas Bing\Documents\App_logs_anomaly_detection\notebooks
