In [None]:
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import re

# Raise pandas' display limits so wide/long frames are not truncated when shown.
display_limits = {
    'display.max_columns': 999,
    'display.max_rows': 500,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

# Run multiple supervised models on static (whole) dataset

In [None]:
# Import and prepare data
#
# Reads the raw log table, removes incomplete rows, casts the numeric columns,
# min-max scales the target and the "app_" feature columns, and finally drops
# the columns that are not used for modelling. The result is left in `log_df`.

path = './Datasets/'

# Note: deleting all logs without a response time (errors) - this loses key
# information but should simplify the problem while preserving the basic relationship.
# dropna() is called without a subset, so a row with a missing value in ANY
# column is removed, not only rows lacking resp_time.
log_df = pd.read_parquet(path + "logs.parquet").dropna()

# Feature transformations

# Convert resp_time and bytes_sent into float
log_df = log_df.astype({"resp_time": "float", "bytes_sent": "float"})

# Every column whose name contains "app_" is treated as a feature to scale.
app_cols = [col for col in log_df.columns if "app_" in col]

# Scale key cols between 0 and 1
# NOTE(review): both scalers are fit on the full frame, i.e. before the
# train/test split performed in the next cell, so test-set min/max values
# influence the training data. Fit on the training rows only if unbiased
# evaluation metrics are required.
log_df["resp_time"] = MinMaxScaler().fit_transform(log_df[["resp_time"]])

# MinMaxScaler raises ValueError on a frame with zero columns, so skip the
# step when no "app_" columns are present.
if app_cols:
    log_df[app_cols] = MinMaxScaler().fit_transform(log_df[app_cols])

# Remove the columns that are not used as model inputs.
log_df = log_df.drop(columns=["bytes_sent", "resp_code", "url", "datetime"])

In [None]:
# Prepare modelling tables

# Target is the resp_time column; every other column of log_df is a feature.
y = log_df["resp_time"]
X = log_df.drop(columns="resp_time")

# 70/30 train/test split with a fixed seed. Each of the four pieces is then
# converted to a C-contiguous NumPy array (column labels are not carried over).
X_train, X_test, y_train, y_test = (
    np.ascontiguousarray(part)
    for part in train_test_split(X, y, test_size=0.30, random_state=42)
)

In [None]:
# Linear regression
#
# Fit an ordinary least-squares model on the training split, predict the
# held-out split, and collect the error metrics plus the fitted coefficients.

model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = model.predict(X_test)

# MSE is evaluated once; RMSE is its square root.
mse = mean_squared_error(y_test, y_pred)

# Timestamp of this run, with the date/time separator replaced by "_".
run_stamp = "_".join(str(datetime.datetime.now()).split())

models_dict = {
    'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
    'Mean Squared Error': mse,
    'Root Mean Squared Error': np.sqrt(mse),
    "Datetime": run_stamp,
    "Coefficients": model.coef_,
}

print(models_dict)

In [None]:
# Show every entry of the results dict: key on one line, value on the next.
for metric_name, metric_value in models_dict.items():
    print(metric_name, metric_value, sep="\n")

In [None]:
# Random forest
#
# Fit a shallow random forest on the training split, predict the held-out
# split, and collect the error metrics plus the per-feature importances.
# Note: this rebinds `model` and `models_dict`, replacing the values produced
# by the linear-regression cell.

model = RandomForestRegressor(
    max_depth=5,
    max_features="sqrt",
    n_jobs=-1,
    n_estimators=300,
    # Seed the bootstrap / feature sampling so repeated runs give the same
    # forest; matches the seed used for the train/test split.
    random_state=42,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# MSE is evaluated once; RMSE is its square root.
mse = mean_squared_error(y_test, y_pred)

models_dict = {
    'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
    'Mean Squared Error': mse,
    'Root Mean Squared Error': np.sqrt(mse),
    # Timestamp of this run, with the date/time separator replaced by "_".
    "Datetime": "_".join(str(datetime.datetime.now()).split()),
    "Feature importances": model.feature_importances_
}

print(models_dict)

In [None]:
# Show every entry of the results dict: key on one line, value on the next.
for metric_name, metric_value in models_dict.items():
    print(metric_name, metric_value, sep="\n")