In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import math
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from warnings import filterwarnings
filterwarnings(action='ignore')

from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error

In [52]:
def prepare_data(data_path, replace_func=None, subset_fn=None, 
                 date_colums:list=None, drop_columns:list=None,
                 num_scaling_fn=None, categorical_encoding_fn=None,
                 train_split_size=0.8, target_transform_fn=None, target_inverse_transform_fn=None):
    
    """
    Read data from a CSV file and preprocess it for machine learning tasks.

    Parameters:
    - data_path: Path to the CSV file.
    - replace_func: Function to replace nulls.
    - subset_fn: Function to subset the data.
    - date_columns: List of columns to convert to datetime format.
    - drop_columns: List of columns to drop.
    - num_scaling_fn: Function for numerical feature scaling.
    - categorical_encoding_fn: For categorical feature encoding.
    - train_split_size: training data size.
    - target_transform_fn: Function to transform the target variable.
    - target_inverse_transform_fn: Function to inverse transform the target variable.

    Returns:
    - data: Preprocessed DataFrame.
    - X_train, X_valid: Training and validation features.
    - y_train, y_valid: Training and validation target variables.
    """

    
    data = pd.read_csv(data_path)
    
    if replace_func:
        data = replace_func(data)
        
    if subset_fn:
        data = subset_fn(data)
        
    if date_colums:
        for col in date_colums:
            data[col] = pd.to_datetime(data[col], dayfirst=True,format='%d/%m/%Y %H:%M')
    
    ## New Feature
    data['opened_at_weekday'] = data["opened_at"].apply(lambda x: datetime.weekday(x)).astype(object)

        
    ## Creating the target.
    data['time_of_completion_dt'] = data.closed_at - data.opened_at
    data['time_of_completion_m'] = data.time_of_completion_dt.apply(lambda x: x.total_seconds()/60)

    
    object_cols = []
    for col in data.columns:
        if (data[col].dtype == 'object') or (data[col].dtype == 'bool'):
            object_cols.append(col)
    
    X = data.drop(['time_of_completion_m','time_of_completion_dt'],axis=1)
    y = data.time_of_completion_m
    
    if drop_columns:
        X = X.drop(drop_columns,axis=1)
    
    
    if num_scaling_fn:
        num_cols = list(set(X.columns.values) - set(object_cols))
        X[num_cols] = num_scaling_fn.fit_transform(X[num_cols])

    if categorical_encoding_fn:
        X = categorical_encoding_fn(X)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=train_split_size)
    
    if target_transform_fn:
        y_train = y_train.apply(target_transform_fn)
        y_valid = y_valid.apply(target_transform_fn)
    
    return data, X_train, X_valid, y_train, y_valid

In [60]:
def fit_model_run_predictions(model, X_train, X_valid, y_train, y_valid):
    model.fit(X_train,y_train)
    y_train_predict = model.predict(X_train)
    y_valid_predict = model.predict(X_valid)
    
    print('Train predictions are off by', (abs(y_train-y_train_predict)).apply(lambda x: timedelta(minutes=10**(x))).mean())
    print('Valid predictions are off by', (abs(y_valid-y_valid_predict)).apply(lambda x: timedelta(minutes=10**(x))).mean())

    print('Train MAE', mean_absolute_error(y_train,y_train_predict))
    print('Valid MAE', mean_absolute_error(y_valid,y_valid_predict))
    
    train_prediction_df = pd.DataFrame({"y_train": y_train, "y_train_predict": y_train_predict})
    valid_prediction_df = pd.DataFrame({"y_valid": y_valid, "y_valid_predict": y_valid_predict})

    return model, train_prediction_df, valid_prediction_df


In [61]:
## custom functions for preprocessing the data. 
## These are moved out of prepare_data function because these can change with experiments
replace_nulls = lambda df: df.replace('?', 'unknown')
create_subset = lambda df: df[(df.incident_state == 'Closed') & (df.opened_at != df.closed_at)]
categorical_encoding = lambda df: pd.get_dummies(df)
target_transform = lambda target: math.log10(target)
target_inverse_transform = lambda target: 10**target
sscaler = StandardScaler()

features_to_drop = ['number','incident_state','active','sys_created_by', 'sys_created_at', 'cmdb_ci', 'notify', 'caused_by', 'vendor', 'rfc', 'problem_id', 
'opened_at','closed_at','resolved_by','resolved_at', 'closed_at', 'opened_at','closed_code']


clean_data, X_train, X_valid, y_train, y_valid = prepare_data(data_path = 'data/incident_event_log.csv', 
                            replace_func=replace_nulls, 
                            subset_fn=create_subset, 
                            date_colums=['opened_at', 'closed_at'],
                            drop_columns=features_to_drop,
                            num_scaling_fn=sscaler, 
                            categorical_encoding_fn=categorical_encoding,
                            train_split_size=0.8, 
                            target_transform_fn=target_transform_fn, 
                            target_inverse_transform_fn=target_inverse_transform)



In [62]:
model, train_prediction_df, valid_prediction_df = fit_model_run_predictions(XGBRegressor(max_depth=15), X_train, X_valid, y_train, y_valid)

Train predictions are off by 0 days 00:01:03.134206824
Valid predictions are off by 0 days 00:01:40.151689352
Train MAE 0.02133144457763282
Valid MAE 0.04741699527684314
