In [None]:
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

import re

# Widen the notebook display limits so wide / long frames are not truncated
# when a cell shows them.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 500

In [None]:
# Load the request log from the parquet file in the working directory.
log_df = pd.read_parquet(path="logs.parquet")

In [None]:
# Feature transformations
#
# One-hot encode the requested url and collect the resulting indicator
# columns into a dedicated frame that the following cells turn into
# prediction targets.
#
# NOTE: this cell consumes the raw 'url' column of log_df, so the load cell
# has to be re-run before this cell can be executed a second time.

# Remember the pre-encoding columns so the indicator columns can be identified
# exactly instead of by a substring match on the column name.
raw_columns = set(log_df.columns)

# Create categorical variables for url requests
log_df = pd.get_dummies(log_df, columns=['url'])

# Only the columns added by the encoding are url features. Matching on "url"
# anywhere in the name would also pick up unrelated columns that merely
# contain that substring.
url_features = [col for col in log_df.columns if col not in raw_columns]

# Create url request only dataframe. Take an explicit copy: later cells add
# and overwrite columns on req_df, which must not act on a slice of log_df.
req_df = log_df[url_features].copy()

req_df.shape

In [None]:
# Create sequence prediction outcome variable -> next 250 requests

WINDOW_SIZE = 250  # number of future requests summarised in each target

# Target column names are derived directly from the feature names; matching on
# the substring "CS_" would misclassify any url feature that contains it.
cumsum_cols = ["CS_" + url for url in url_features]

# Trailing per-url request counts over WINDOW_SIZE rows, computed for all url
# columns at once rather than inserting one column at a time.
url_counts = req_df[url_features]
window_counts = url_counts.rolling(WINDOW_SIZE).sum()
window_counts.columns = cumsum_cols

# Delete NaN rows (the first WINDOW_SIZE - 1 rows have an incomplete window)
req_df = pd.concat([url_counts, window_counts], axis=1).dropna()

# Shift output variable WINDOW_SIZE rows up, so every row carries the counts
# of the WINDOW_SIZE requests that follow it
req_df[cumsum_cols] = req_df[cumsum_cols].shift(-WINDOW_SIZE)

# Delete NaN rows again (the last WINDOW_SIZE rows have no complete future window)
req_df = req_df.dropna()

# Scale CS cols between 0 and 1
# NOTE(review): the scaler is fitted on every row, before the train/test
# split further down, so test rows influence the scaling of training targets.
req_df[cumsum_cols] = MinMaxScaler().fit_transform(req_df[cumsum_cols])

In [None]:
# Interleaved 50/50 split: rows at even positions go to the training set,
# rows at odd positions go to the test set, so both halves span the whole
# log and keep the same distribution.
train_df, test_df = req_df.iloc[0::2], req_df.iloc[1::2]


In [None]:
# Trim both splits to the same fixed size: keep only the first ROW_LIMIT rows
# of each (a split that is already shorter is left unchanged).
# NOTE(review): the reason for this particular row count is not recorded here.

ROW_LIMIT = 2241000

train_df = train_df.head(ROW_LIMIT)
test_df = test_df.head(ROW_LIMIT)

In [None]:
# Persist the two splits as parquet files in the working directory.
train_df.to_parquet(path='train.parquet')
test_df.to_parquet(path='test.parquet')