In [None]:
import pandas as pd

# Load the synthetic key-lock access events.
df = pd.read_csv('data/synthetic_keylock_data.csv')

# Cast the label to bool before negating: on a 0/1 integer column `~` is a
# bitwise NOT (giving -1/-2), which would make the "normal" count wrong.
anomaly_mask = df['is_anomaly'].astype(bool)

# Display basic information about the dataset
print("Dataset shape:", df.shape)
print("\nNumber of anomalies:", anomaly_mask.sum())
print("Number of normal events:", (~anomaly_mask).sum())
print("\nFeature columns:", df.columns.tolist())

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# MLflow tracking server (given as a SageMaker ARN) and experiment name; both are
# read back later through os.environ inside preprocess_data.
# NOTE(review): the region and account ID below are specific to one AWS
# environment — replace them with your own before running.
%set_env MLFLOW_TRACKING_URI=arn:aws:sagemaker:eu-central-1:559317267498:mlflow-tracking-server/mlflow-experiments
%set_env MLFLOW_EXPERIMENT_NAME=anomaly-detection
# Named AWS profile for this kernel session (presumably picked up by the AWS SDK
# for credentials — confirm). NOTE(review): this is an administrator profile.
%set_env AWS_PROFILE=AWSAdministratorAccess-559317267498

In [None]:
import os
import mlflow
import logging

# Raise the verbosity of the 'mlflow' logger to DEBUG.
logger = logging.getLogger(name='mlflow')
logger.setLevel(level=logging.DEBUG)

def preprocess_data(df):
    """Engineer features, scale/encode them, and return a train/test split.

    The work is wrapped in an MLflow run named "data_preprocessing" that logs
    the input row count and the processed dataset.

    Args:
        df: DataFrame of access events. The code reads these columns:
            timestamp, user_id, lock_id, access_level, access_duration,
            failed_attempts, time_since_last_access, access_granted,
            is_weekend, is_business_hours, is_anomaly. The input is not
            modified; a copy is processed.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from an 80/20
        ``train_test_split`` with ``random_state=42``; ``y`` is the
        ``is_anomaly`` column.

    Raises:
        KeyError: if MLFLOW_TRACKING_URI or MLFLOW_EXPERIMENT_NAME is not set
            in the environment.
    """
    # Dependencies are imported inside the function so it carries everything it
    # needs with it (presumably so it can be shipped to a remote job — confirm).
    import os
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import FunctionTransformer
    import mlflow

    # Create copy to avoid modifying original data
    df_processed = df.copy()

    # set mlflow experiment and server
    mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
    mlflow.set_experiment(os.environ['MLFLOW_EXPERIMENT_NAME'])

    with mlflow.start_run(run_name="data_preprocessing") as run:

        mlflow.autolog()
        mlflow.log_param("input_rows", df.shape[0])
        # 1. Convert timestamp to datetime
        df_processed['timestamp'] = pd.to_datetime(df['timestamp'])

        # Feature Engineering
        # Time-based features.
        # NOTE(review): month/day/hour/minute are not listed in any transformer
        # column group below, so remainder='drop' discards them; only 'hour'
        # is used, to derive 'time_of_day'.
        df_processed['month'] = df_processed['timestamp'].dt.month
        df_processed['day'] = df_processed['timestamp'].dt.day
        df_processed['hour'] = df_processed['timestamp'].dt.hour
        df_processed['minute'] = df_processed['timestamp'].dt.minute

        # Create time windows for access patterns.
        # right=False makes the bins left-closed: [0,6) night, [6,12) morning,
        # [12,18) afternoon, [18,24) evening. With pd.cut's default right-closed
        # bins the lowest edge is excluded, so hour 0 would become NaN.
        df_processed['time_of_day'] = pd.cut(df_processed['hour'],
                                bins=[0, 6, 12, 18, 24],
                                labels=['night', 'morning', 'afternoon', 'evening'],
                                right=False)

        # Calculate access frequency features (row count per user / per lock)
        df_processed['user_access_frequency'] = df.groupby('user_id')['timestamp'].transform('count')
        df_processed['lock_access_frequency'] = df.groupby('lock_id')['timestamp'].transform('count')

        # Calculate average access duration per user and lock
        df_processed['user_avg_duration'] = df.groupby('user_id')['access_duration'].transform('mean')
        df_processed['lock_avg_duration'] = df.groupby('lock_id')['access_duration'].transform('mean')

        # Calculate failed attempts ratio: this event's failures over the user's
        # total failures; the +1 keeps the denominator non-zero.
        df_processed['failed_attempts_ratio'] = df['failed_attempts'] / (df.groupby('user_id')['failed_attempts'].transform('sum') + 1)

        categorical_cols = ['user_id', 'lock_id', 'access_level', 'time_of_day']

        numerical_cols = ['access_duration', 'failed_attempts', 'time_since_last_access',
                        'user_access_frequency', 'lock_access_frequency',
                        'user_avg_duration', 'lock_avg_duration', 'failed_attempts_ratio']

        boolean_cols = ['access_granted', 'is_weekend', 'is_business_hours', 'is_anomaly']

        # Apply transformations. Output columns follow the order of the
        # transformer list, which is why the DataFrame below is labelled with
        # numerical_cols + categorical_cols + boolean_cols.
        transformer = ColumnTransformer([
                ('numeric', MinMaxScaler(), numerical_cols),
                ('categorical', OrdinalEncoder(), categorical_cols),
                ('boolean', FunctionTransformer(lambda x: x.astype(int)), boolean_cols)
            ],
            remainder='drop'
        )

        # Fit and transform the data.
        # NOTE(review): the scaler/encoder (and the per-user/per-lock aggregates
        # above) are fitted on the full dataset before the train/test split.
        df_processed = pd.DataFrame(transformer.fit_transform(df_processed), columns = numerical_cols + categorical_cols + boolean_cols)

        mlflow.log_input(mlflow.data.from_pandas(df_processed), context="processed")

        print(f"Current MLflow tracking URI: {mlflow.get_tracking_uri()}")
        # Print current experiment info
        current_experiment = mlflow.get_experiment(run.info.experiment_id)
        if current_experiment:
            print(f"Current experiment name: {current_experiment.name}")
            print(f"Current experiment ID: {current_experiment.experiment_id}")
            print(f"Current experiment artifact location: {current_experiment.artifact_location}")

        # 4. Split features and target
        X = df_processed.drop(['is_anomaly'], axis=1)
        y = df_processed['is_anomaly']

        # 5. Train-test split (fixed seed for a reproducible split)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test


## Run Locally

Let's first run our data preprocessing function locally.

In [None]:
# Run the preprocessing in the local kernel and unpack the four train/test splits it returns.
X_train, X_test, y_train, y_test = preprocess_data(df)


## Run using the @remote decorator

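Now let's wrap the same preprocessing function with SageMaker's `@remote` decorator so that it, together with its dependencies, runs as a SageMaker job instead of in the local kernel.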

In [None]:
from sagemaker.remote_function import remote

# preprocess_data reads its MLflow settings from os.environ when it is called.
# Variables set with %set_env live in the local kernel process, so forward the
# two it needs to the remote job explicitly; otherwise the lookup raises KeyError.
# NOTE: this rebinds `preprocess_data`, so re-running the cell wraps the
# already-remote function again — re-run the defining cell first.
preprocess_data = remote(
    preprocess_data,
    instance_type='ml.m5.large',
    environment_variables={
        name: os.environ[name]
        for name in ('MLFLOW_TRACKING_URI', 'MLFLOW_EXPERIMENT_NAME')
    },
)

SyntaxError: invalid syntax (1566392784.py, line 2)