In [31]:
import sys
sys.executable

'd:\\Documents\\AutoScaling Analysis\\Autoscaling-Analysis\\venv\\Scripts\\python.exe'

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from datetime import datetime

LOG_PATTERN = re.compile(r'(?P<host>\S+) - - \[(?P<timestamp>.*?)\] "(?P<request>.*?)" (?P<status>\d{3}) (?P<bytes>\S+)')

In [33]:
# Parse một dòng log raw thành dictionary
def parse_log_line(line):
    match = LOG_PATTERN.match(line)
    if match:
        data = match.groupdict()
        data['bytes'] = 0 if data['bytes'] == '-' else int(data['bytes'])
        data['status'] = int(data['status'])
        return data
    return None

# Đọc file txt và trả về DataFrame
def load_and_process_logs(file_paths):
    parsed_data = []
    
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found. Skipping.")
            continue
            
        print(f"Reading file: {file_path}...")
        with open(file_path, 'r', encoding='latin-1') as f:
            for line in f:
                parsed = parse_log_line(line)
                if parsed:
                    parsed_data.append(parsed)
    
    if not parsed_data:
        return pd.DataFrame()

    df = pd.DataFrame(parsed_data)
    
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
    
    df = df.sort_values('timestamp')
    
    return df

In [34]:
os.getcwd()

'd:\\Documents\\AutoScaling Analysis\\Autoscaling-Analysis\\notebooks\\experimental'

In [35]:
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent.parent
DATA_RAW_DIR =  PROJECT_ROOT / "data" / "raw"
DATA_CLEANED_DIR = PROJECT_ROOT / "data" / "cleaned"

In [36]:
train_log_path = DATA_RAW_DIR / "train.txt"
test_log_path  = DATA_RAW_DIR / "test.txt"

log_files = [train_log_path, test_log_path]
raw_df = load_and_process_logs(log_files)

print(f"Shape of Data: {raw_df.shape}")

Reading file: D:\Documents\AutoScaling Analysis\Autoscaling-Analysis\data\raw\train.txt...
Reading file: D:\Documents\AutoScaling Analysis\Autoscaling-Analysis\data\raw\test.txt...
Shape of Data: (3461612, 5)


In [37]:
raw_df.head(5)

Unnamed: 0,host,timestamp,request,status,bytes
0,199.72.81.55,1995-07-01 00:00:01-04:00,GET /history/apollo/ HTTP/1.0,200,6245
1,unicomp6.unicomp.net,1995-07-01 00:00:06-04:00,GET /shuttle/countdown/ HTTP/1.0,200,3985
2,199.120.110.21,1995-07-01 00:00:09-04:00,GET /shuttle/missions/sts-73/mission-sts-73.ht...,200,4085
3,burger.letters.com,1995-07-01 00:00:11-04:00,GET /shuttle/countdown/liftoff.html HTTP/1.0,304,0
4,199.120.110.21,1995-07-01 00:00:11-04:00,GET /shuttle/missions/sts-73/sts-73-patch-smal...,200,4179


In [38]:
raw_df.tail(5)

Unnamed: 0,host,timestamp,request,status,bytes
3461606,gatekeeper.uccu.com,1995-08-31 23:59:49-04:00,GET /images/kscmap-tiny.gif HTTP/1.0,304,0
3461608,gatekeeper.uccu.com,1995-08-31 23:59:49-04:00,GET /images/lc39a-logo.gif HTTP/1.0,304,0
3461610,www-c8.proxy.aol.com,1995-08-31 23:59:52-04:00,GET /icons/unknown.xbm HTTP/1.0,200,515
3461609,cys-cap-9.wyoming.com,1995-08-31 23:59:52-04:00,GET /shuttle/missions/sts-71/movies/sts-71-lau...,200,57344
3461611,cindy.yamato.ibm.co.jp,1995-08-31 23:59:53-04:00,GET /images/kscmap-small.gif HTTP/1.0,200,39017


In [39]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3461612 entries, 0 to 3461611
Data columns (total 5 columns):
 #   Column     Dtype                    
---  ------     -----                    
 0   host       object                   
 1   timestamp  datetime64[ns, UTC-04:00]
 2   request    object                   
 3   status     int64                    
 4   bytes      int64                    
dtypes: datetime64[ns, UTC-04:00](1), int64(2), object(2)
memory usage: 158.5+ MB


In [49]:
def resample_traffic(df, window='5min'):
    df_idx = df.set_index('timestamp')
    
    resampled = df_idx.resample(window).agg({
        'request': 'count',
        'bytes': 'sum',
        'host': 'nunique',
        'status': lambda x: (x >= 400).sum()
    })
    
    resampled.columns = ['requests', 'bytes', 'hosts', 'errors']
    
    # Fill 0 cho những khoảng trống 
    resampled = resampled.fillna(0)
    
    return resampled

df_5m = resample_traffic(raw_df, window='5min')
print("Resampled to 5-minute intervals:")
df_5m.head()

Resampled to 5-minute intervals:


Unnamed: 0_level_0,requests,bytes,hosts,errors
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1995-07-01 00:00:00-04:00,301,5277675,60,1
1995-07-01 00:05:00-04:00,267,5041043,53,0
1995-07-01 00:10:00-04:00,242,6111846,55,4
1995-07-01 00:15:00-04:00,282,4559748,55,4
1995-07-01 00:20:00-04:00,319,7262385,66,0


In [52]:
def add_features(df_input):
    df = df_input.copy()

    outages = [
        ('1995-08-01 14:52:02', '1995-08-03 04:36:12'),
        ('1995-07-28 13:32:26', '1995-08-01 00:00:00')
    ]

    df['is_system_down'] = 0

    for start, end in outages:
        mask = (df.index >= start) & (df.index <= end)
        df.loc[mask, 'is_system_down'] = 1

    df['requests_imputed'] = df['requests']

    # 7 * 24 * 12
    steps_7_days = 2016

    mask_down = df['is_system_down'] == 1
    df.loc[mask_down, 'requests_imputed'] = df['requests'].shift(steps_7_days)

    df['requests_imputed'] = df['requests_imputed'].fillna(method='ffill').fillna(method='bfill')

    target_for_lag = 'requests_imputed'

    df['req_lag_1']   = df[target_for_lag].shift(1)
    df['req_lag_12']  = df[target_for_lag].shift(12) 
    df['req_lag_288'] = df[target_for_lag].shift(288)

    df['rolling_mean_1h'] = df[target_for_lag].rolling(window=12).mean()
    df['rolling_std_1h']  = df[target_for_lag].rolling(window=12).std()

    df['rolling_mean_24h'] = df[target_for_lag].rolling(window=288).mean()
    
    # Tính Error Rate
    df['error_rate'] = df['errors'] / df['requests']
    df['error_rate'] = df['error_rate'].fillna(0.0)
    
    # Extract Time Features (Cyclic)
    df['hour_of_day'] = df.index.hour # không dùng với LSTM
    df['day_of_week'] = df.index.dayofweek
    df['is_weekend'] = df['day_of_week'] >= 5

    df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day'] / 24)

    df = df.drop(columns=['requests_imputed'])
    df = df.fillna(0)
    
    # Type Casting 
    df['requests'] = df['requests'].astype(int)
    df['bytes'] = df['bytes'].astype(int)
    df['hosts'] = df['hosts'].astype(int)
    df['hour_of_day'] = df['hour_of_day'].astype(int)
    df['day_of_week'] = df['day_of_week'].astype(int)
    df['is_weekend'] = df['is_weekend'].astype(int)
    df['error_rate'] = df['error_rate'].astype(float)
    
    
    return df

df_5m_after = add_features(df_5m)
df_5m_after['1995-07-28 13:32:25':'1995-08-01 00:05:01']

  df['requests_imputed'] = df['requests_imputed'].fillna(method='ffill').fillna(method='bfill')


Unnamed: 0_level_0,requests,bytes,hosts,errors,is_system_down,req_lag_1,req_lag_12,req_lag_288,rolling_mean_1h,rolling_std_1h,rolling_mean_24h,error_rate,hour_of_day,day_of_week,is_weekend,hour_sin,hour_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1995-07-28 13:35:00-04:00,0,0,0,0,1,209.0,298.0,427.0,299.083333,83.593669,200.347222,0.000000,13,4,0,-0.258819,-0.965926
1995-07-28 13:40:00-04:00,0,0,0,0,1,533.0,275.0,351.0,317.083333,99.649803,200.833333,0.000000,13,4,0,-0.258819,-0.965926
1995-07-28 13:45:00-04:00,0,0,0,0,1,491.0,325.0,376.0,324.083333,103.145492,200.947917,0.000000,13,4,0,-0.258819,-0.965926
1995-07-28 13:50:00-04:00,0,0,0,0,1,409.0,236.0,423.0,340.250000,103.287881,200.972222,0.000000,13,4,0,-0.258819,-0.965926
1995-07-28 13:55:00-04:00,0,0,0,0,1,430.0,315.0,507.0,346.750000,104.006228,200.576389,0.000000,13,4,0,-0.258819,-0.965926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995-07-31 23:45:00-04:00,0,0,0,0,1,139.0,199.0,139.0,155.416667,27.678210,223.663194,0.000000,23,0,0,-0.258819,0.965926
1995-07-31 23:50:00-04:00,0,0,0,0,1,143.0,134.0,170.0,150.833333,35.104606,223.347222,0.000000,23,0,0,-0.258819,0.965926
1995-07-31 23:55:00-04:00,0,0,0,0,1,79.0,158.0,197.0,148.666667,35.422985,223.121528,0.000000,23,0,0,-0.258819,0.965926
1995-08-01 00:00:00-04:00,220,2490673,29,0,1,132.0,157.0,144.0,150.000000,36.060555,223.222222,0.000000,0,1,0,0.000000,1.000000


In [53]:
df_5m_after.tail()

Unnamed: 0_level_0,requests,bytes,hosts,errors,is_system_down,req_lag_1,req_lag_12,req_lag_288,rolling_mean_1h,rolling_std_1h,rolling_mean_24h,error_rate,hour_of_day,day_of_week,is_weekend,hour_sin,hour_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1995-08-31 23:35:00-04:00,277,4183450,43,0,0,219.0,299.0,221.0,235.666667,32.761766,315.135417,0.0,23,3,0,-0.258819,0.965926
1995-08-31 23:40:00-04:00,170,4511364,41,4,0,277.0,239.0,291.0,229.916667,37.792395,314.715278,0.023529,23,3,0,-0.258819,0.965926
1995-08-31 23:45:00-04:00,145,2805157,36,1,0,170.0,220.0,359.0,223.666667,45.0804,313.972222,0.006897,23,3,0,-0.258819,0.965926
1995-08-31 23:50:00-04:00,174,1883814,36,0,0,145.0,232.0,293.0,218.833333,47.166693,313.559028,0.0,23,3,0,-0.258819,0.965926
1995-08-31 23:55:00-04:00,123,2315469,32,0,0,174.0,292.0,303.0,204.75,48.544497,312.934028,0.0,23,3,0,-0.258819,0.965926


In [54]:
df_5m_after.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17856 entries, 1995-07-01 00:00:00-04:00 to 1995-08-31 23:55:00-04:00
Freq: 5min
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   requests          17856 non-null  int64  
 1   bytes             17856 non-null  int64  
 2   hosts             17856 non-null  int64  
 3   errors            17856 non-null  int64  
 4   is_system_down    17856 non-null  int64  
 5   req_lag_1         17856 non-null  float64
 6   req_lag_12        17856 non-null  float64
 7   req_lag_288       17856 non-null  float64
 8   rolling_mean_1h   17856 non-null  float64
 9   rolling_std_1h    17856 non-null  float64
 10  rolling_mean_24h  17856 non-null  float64
 11  error_rate        17856 non-null  float64
 12  hour_of_day       17856 non-null  int64  
 13  day_of_week       17856 non-null  int64  
 14  is_weekend        17856 non-null  int64  
 15  hour_sin          17856 non-n

In [55]:
df_1m = resample_traffic(raw_df, window='1min')
df_1m_after = add_features(df_1m)
df_1m_after.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 89280 entries, 1995-07-01 00:00:00-04:00 to 1995-08-31 23:59:00-04:00
Freq: min
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   requests          89280 non-null  int64  
 1   bytes             89280 non-null  int64  
 2   hosts             89280 non-null  int64  
 3   errors            89280 non-null  int64  
 4   is_system_down    89280 non-null  int64  
 5   req_lag_1         89280 non-null  float64
 6   req_lag_12        89280 non-null  float64
 7   req_lag_288       89280 non-null  float64
 8   rolling_mean_1h   89280 non-null  float64
 9   rolling_std_1h    89280 non-null  float64
 10  rolling_mean_24h  89280 non-null  float64
 11  error_rate        89280 non-null  float64
 12  hour_of_day       89280 non-null  int64  
 13  day_of_week       89280 non-null  int64  
 14  is_weekend        89280 non-null  int64  
 15  hour_sin          89280 non-nu

  df['requests_imputed'] = df['requests_imputed'].fillna(method='ffill').fillna(method='bfill')


In [56]:
df_1m_after['1995-07-28 13:32:25':'1995-08-01 00:00:01']

Unnamed: 0_level_0,requests,bytes,hosts,errors,is_system_down,req_lag_1,req_lag_12,req_lag_288,rolling_mean_1h,rolling_std_1h,rolling_mean_24h,error_rate,hour_of_day,day_of_week,is_weekend,hour_sin,hour_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1995-07-28 13:33:00-04:00,0,0,0,0,1,42.0,70.0,37.0,55.500000,21.989667,55.663194,0.0,13,4,0,-0.258819,-0.965926
1995-07-28 13:34:00-04:00,0,0,0,0,1,21.0,29.0,50.0,55.666667,21.777108,55.597222,0.0,13,4,0,-0.258819,-0.965926
1995-07-28 13:35:00-04:00,0,0,0,0,1,31.0,53.0,72.0,53.916667,22.829241,55.458333,0.0,13,4,0,-0.258819,-0.965926
1995-07-28 13:36:00-04:00,0,0,0,0,1,32.0,50.0,37.0,51.916667,24.212913,55.420139,0.0,13,4,0,-0.258819,-0.965926
1995-07-28 13:37:00-04:00,0,0,0,0,1,26.0,47.0,38.0,50.083333,25.421746,55.375000,0.0,13,4,0,-0.258819,-0.965926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995-07-31 23:56:00-04:00,0,0,0,0,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,23,0,0,-0.258819,0.965926
1995-07-31 23:57:00-04:00,0,0,0,0,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,23,0,0,-0.258819,0.965926
1995-07-31 23:58:00-04:00,0,0,0,0,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,23,0,0,-0.258819,0.965926
1995-07-31 23:59:00-04:00,0,0,0,0,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,23,0,0,-0.258819,0.965926


In [57]:
df_15m = resample_traffic(raw_df, window='15min')
df_15m_after = add_features(df_15m)
df_15m_after.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5952 entries, 1995-07-01 00:00:00-04:00 to 1995-08-31 23:45:00-04:00
Freq: 15min
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   requests          5952 non-null   int64  
 1   bytes             5952 non-null   int64  
 2   hosts             5952 non-null   int64  
 3   errors            5952 non-null   int64  
 4   is_system_down    5952 non-null   int64  
 5   req_lag_1         5952 non-null   float64
 6   req_lag_12        5952 non-null   float64
 7   req_lag_288       5952 non-null   float64
 8   rolling_mean_1h   5952 non-null   float64
 9   rolling_std_1h    5952 non-null   float64
 10  rolling_mean_24h  5952 non-null   float64
 11  error_rate        5952 non-null   float64
 12  hour_of_day       5952 non-null   int64  
 13  day_of_week       5952 non-null   int64  
 14  is_weekend        5952 non-null   int64  
 15  hour_sin          5952 non-nu

  df['requests_imputed'] = df['requests_imputed'].fillna(method='ffill').fillna(method='bfill')


In [58]:
data_1m_path = DATA_CLEANED_DIR / 'data_1m.csv'
data_5m_path = DATA_CLEANED_DIR / 'data_5m.csv'
data_15m_path = DATA_CLEANED_DIR / 'data_15m.csv'

df_1m_after.to_csv(data_1m_path, index=True)
df_5m_after.to_csv(data_5m_path, index=True)
df_15m_after.to_csv(data_15m_path, index=True)