In [1]:
# Sanity check: display the path of the Python interpreter backing this kernel.
import sys
sys.executable

'd:\\Documents\\AutoScaling Analysis\\Autoscaling-Analysis\\venv\\Scripts\\python.exe'

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from datetime import datetime

# One access-log record per line, e.g.
#   199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
# Written in verbose mode so each captured field is documented next to its sub-pattern;
# literal spaces are spelled "[ ]" because verbose mode ignores bare whitespace.
LOG_PATTERN = re.compile(
    r'''
    (?P<host>\S+)                 # client host name or IP address
    [ ]-[ ]-[ ]                   # two literal dash placeholders
    \[(?P<timestamp>.*?)\]        # bracketed timestamp text
    [ ]"(?P<request>.*?)"         # quoted request line
    [ ](?P<status>\d{3})          # three-digit HTTP status code
    [ ](?P<bytes>\S+)             # response size token (parse_log_line maps "-" to 0)
    ''',
    re.VERBOSE,
)

In [None]:
# Parse one raw log line into a dictionary
def parse_log_line(line):
    """Turn a single raw access-log line into a dict of typed fields.

    Parameters
    ----------
    line : str
        One line of the log file; it is matched from its start against
        ``LOG_PATTERN``.

    Returns
    -------
    dict or None
        ``{'host', 'timestamp', 'request', 'status', 'bytes'}`` where
        ``status`` and ``bytes`` are ints and the other values are the raw
        captured strings. A size of ``'-'`` is stored as ``0``.
        ``None`` when the line does not match ``LOG_PATTERN`` or when the size
        field is neither ``'-'`` nor an integer, so callers can skip the line
        instead of aborting a multi-million-line load on one bad record.
    """
    match = LOG_PATTERN.match(line)
    if match is None:
        return None

    data = match.groupdict()

    raw_size = data['bytes']
    try:
        # The pattern captures any non-space token here, so int() can fail.
        data['bytes'] = 0 if raw_size == '-' else int(raw_size)
    except ValueError:
        return None

    # The pattern guarantees exactly three digits, so this conversion is safe.
    data['status'] = int(data['status'])
    return data

# Read the raw .txt log files and return them as one DataFrame
def load_and_process_logs(file_paths):
    """Parse one or more access-log text files into a timestamp-sorted DataFrame.

    Parameters
    ----------
    file_paths : str, os.PathLike, or iterable of those
        Log files to read. A single path is accepted as well as a collection.
        Paths that do not exist are reported with a warning and skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``host``, ``timestamp``, ``request``, ``status``, ``bytes``.
        ``timestamp`` is parsed with the format ``%d/%b/%Y:%H:%M:%S %z`` and
        rows are sorted by it; rows with equal timestamps keep the order in
        which they were read. The original row labels are kept (the index is
        not reset). When nothing could be parsed, an empty frame that still
        carries the five column names is returned.

    Notes
    -----
    Lines for which ``parse_log_line`` returns a falsy value are dropped.
    Files are decoded as latin-1.
    """
    columns = ['host', 'timestamp', 'request', 'status', 'bytes']

    # A lone path would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    parsed_data = []

    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found. Skipping.")
            continue

        print(f"Reading file: {file_path}...")
        with open(file_path, 'r', encoding='latin-1') as f:
            parsed_data.extend(row for row in map(parse_log_line, f) if row)

    if not parsed_data:
        # Keep the schema so downstream column lookups do not raise KeyError.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(parsed_data)

    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%b/%Y:%H:%M:%S %z')

    # mergesort is stable: same-second requests stay in file order, which makes
    # the result reproducible from run to run.
    return df.sort_values('timestamp', kind='mergesort')

In [4]:
# Show the kernel's working directory; PROJECT_ROOT is derived from it further down.
os.getcwd()

'd:\\Documents\\AutoScaling Analysis\\Autoscaling-Analysis\\notebooks\\experimental'

In [27]:
from pathlib import Path

# The project root is taken to be two directory levels above the current
# working directory; raw inputs and cleaned outputs live under <root>/data.
PROJECT_ROOT = Path().resolve().parent.parent
DATA_RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
DATA_CLEANED_DIR = PROJECT_ROOT.joinpath("data", "cleaned")

In [10]:
# Both raw log files are parsed into a single frame; load_and_process_logs
# sorts the combined rows by timestamp.
train_log_path, test_log_path = (
    DATA_RAW_DIR / file_name for file_name in ("train.txt", "test.txt")
)
log_files = [train_log_path, test_log_path]

raw_df = load_and_process_logs(log_files)

print(f"Shape of Data: {raw_df.shape}")

Reading file: D:\Documents\AutoScaling Analysis\Autoscaling-Analysis\data\raw\train.txt...
Reading file: D:\Documents\AutoScaling Analysis\Autoscaling-Analysis\data\raw\test.txt...
Shape of Data: (3461612, 5)


In [12]:
# Preview the first five parsed rows.
raw_df.head(5)

Unnamed: 0,host,timestamp,request,status,bytes
0,199.72.81.55,1995-07-01 00:00:01-04:00,GET /history/apollo/ HTTP/1.0,200,6245
1,unicomp6.unicomp.net,1995-07-01 00:00:06-04:00,GET /shuttle/countdown/ HTTP/1.0,200,3985
2,199.120.110.21,1995-07-01 00:00:09-04:00,GET /shuttle/missions/sts-73/mission-sts-73.ht...,200,4085
3,burger.letters.com,1995-07-01 00:00:11-04:00,GET /shuttle/countdown/liftoff.html HTTP/1.0,304,0
4,199.120.110.21,1995-07-01 00:00:11-04:00,GET /shuttle/missions/sts-73/sts-73-patch-smal...,200,4179


In [13]:
# Preview the last five rows (the latest timestamps, since the frame is sorted by timestamp).
raw_df.tail(5)

Unnamed: 0,host,timestamp,request,status,bytes
3461606,gatekeeper.uccu.com,1995-08-31 23:59:49-04:00,GET /images/kscmap-tiny.gif HTTP/1.0,304,0
3461608,gatekeeper.uccu.com,1995-08-31 23:59:49-04:00,GET /images/lc39a-logo.gif HTTP/1.0,304,0
3461610,www-c8.proxy.aol.com,1995-08-31 23:59:52-04:00,GET /icons/unknown.xbm HTTP/1.0,200,515
3461609,cys-cap-9.wyoming.com,1995-08-31 23:59:52-04:00,GET /shuttle/missions/sts-71/movies/sts-71-lau...,200,57344
3461611,cindy.yamato.ibm.co.jp,1995-08-31 23:59:53-04:00,GET /images/kscmap-small.gif HTTP/1.0,200,39017


In [29]:
# Check column dtypes and the memory footprint of the parsed log table.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3461612 entries, 0 to 3461611
Data columns (total 5 columns):
 #   Column     Dtype                    
---  ------     -----                    
 0   host       object                   
 1   timestamp  datetime64[ns, UTC-04:00]
 2   request    object                   
 3   status     int64                    
 4   bytes      int64                    
dtypes: datetime64[ns, UTC-04:00](1), int64(2), object(2)
memory usage: 158.5+ MB


In [18]:
def resample_traffic(df, window='5min'):
    """Aggregate per-request rows into fixed-width time buckets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``timestamp``, ``request``, ``bytes``,
        ``host`` and ``status``.
    window : str, default '5min'
        Bucket width, passed straight to ``DataFrame.resample``.

    Returns
    -------
    pandas.DataFrame
        Indexed by bucket start, with columns
        ``requests`` (count of ``request`` values),
        ``bytes`` (sum of ``bytes``),
        ``hosts`` (number of distinct ``host`` values) and
        ``errors`` (number of rows whose ``status`` is >= 400).
        Any missing value left by the aggregation is replaced with 0.
    """
    per_column_aggregation = {
        'request': 'count',
        'bytes': 'sum',
        'host': 'nunique',
        'status': lambda x: (x >= 400).sum(),
    }

    buckets = (
        df.set_index('timestamp')
        .resample(window)
        .agg(per_column_aggregation)
    )

    # Same order as the aggregation dict above.
    buckets.columns = ['requests', 'bytes', 'hosts', 'errors']

    # Fill gaps with 0 (e.g. the stretch with no traffic during the storm).
    return buckets.fillna(0)

# Aggregate the raw request log into 5-minute buckets and preview the first rows.
df_5m = resample_traffic(raw_df, window='5min')
print("Resampled to 5-minute intervals:")
df_5m.head()

Resampled to 5-minute intervals:


Unnamed: 0_level_0,requests,bytes,hosts,errors
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1995-07-01 00:00:00-04:00,301,5277675,60,1
1995-07-01 00:05:00-04:00,267,5041043,53,0
1995-07-01 00:10:00-04:00,242,6111846,55,4
1995-07-01 00:15:00-04:00,282,4559748,55,4
1995-07-01 00:20:00-04:00,319,7262385,66,0


In [21]:
def add_features(df_input, storm_start='1995-08-01 14:52:00', storm_end='1995-08-03 04:36:13'):
    """Return a copy of a resampled traffic frame with derived feature columns.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Time-indexed frame with at least the columns ``requests``, ``bytes``,
        ``hosts`` and ``errors``. It is not modified.
    storm_start, storm_end : str, optional
        Inclusive bounds of the outage window, compared directly against the
        frame's index. The defaults are the window this notebook previously
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        The input columns plus, in this order:
        ``is_system_down`` (1 inside the outage window, else 0),
        ``error_rate`` (``errors / requests``, 0.0 where that is undefined),
        ``hour_of_day``, ``day_of_week`` (plain integers taken from the index,
        no cyclic encoding is applied) and ``is_weekend``
        (True when ``day_of_week`` is 5 or 6).
    """
    df = df_input.copy()

    # Flag every bucket that falls inside the outage window.
    in_outage = (df.index >= storm_start) & (df.index <= storm_end)
    df['is_system_down'] = in_outage.astype('int64')

    # Empty buckets give 0 / 0 = NaN; report those as a 0.0 error rate.
    df['error_rate'] = (df['errors'] / df['requests']).fillna(0.0)

    # Calendar features read off the index.
    df['hour_of_day'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['is_weekend'] = df['day_of_week'] >= 5

    # Normalise dtypes; 'errors' is cast as well so every count column is an
    # integer regardless of what the aggregation step produced.
    df = df.astype({
        'requests': int,
        'bytes': int,
        'hosts': int,
        'errors': int,
        'hour_of_day': int,
        'day_of_week': int,
        'is_weekend': bool,
        'error_rate': float,
    })

    return df

# Add the derived features to the 5-minute series, then inspect a half-hour slice
# that lies inside the 1995-08-01 -> 1995-08-03 outage window flagged by add_features.
df_5m_after = add_features(df_5m)
df_5m_after['1995-08-02 12:00':'1995-08-02 12:30']

Unnamed: 0_level_0,requests,bytes,hosts,errors,is_system_down,error_rate,hour_of_day,day_of_week,is_weekend
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995-08-02 12:00:00-04:00,0,0,0,0,1,0.0,12,2,False
1995-08-02 12:05:00-04:00,0,0,0,0,1,0.0,12,2,False
1995-08-02 12:10:00-04:00,0,0,0,0,1,0.0,12,2,False
1995-08-02 12:15:00-04:00,0,0,0,0,1,0.0,12,2,False
1995-08-02 12:20:00-04:00,0,0,0,0,1,0.0,12,2,False
1995-08-02 12:25:00-04:00,0,0,0,0,1,0.0,12,2,False
1995-08-02 12:30:00-04:00,0,0,0,0,1,0.0,12,2,False


In [22]:
# Preview the last buckets of the 5-minute feature table.
df_5m_after.tail()

Unnamed: 0_level_0,requests,bytes,hosts,errors,is_system_down,error_rate,hour_of_day,day_of_week,is_weekend
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995-08-31 23:35:00-04:00,277,4183450,43,0,0,0.0,23,3,False
1995-08-31 23:40:00-04:00,170,4511364,41,4,0,0.023529,23,3,False
1995-08-31 23:45:00-04:00,145,2805157,36,1,0,0.006897,23,3,False
1995-08-31 23:50:00-04:00,174,1883814,36,0,0,0.0,23,3,False
1995-08-31 23:55:00-04:00,123,2315469,32,0,0,0.0,23,3,False


In [23]:
# Check row count, null counts and dtypes of the 5-minute feature table.
df_5m_after.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17856 entries, 1995-07-01 00:00:00-04:00 to 1995-08-31 23:55:00-04:00
Freq: 5min
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   requests        17856 non-null  int64  
 1   bytes           17856 non-null  int64  
 2   hosts           17856 non-null  int64  
 3   errors          17856 non-null  int64  
 4   is_system_down  17856 non-null  int64  
 5   error_rate      17856 non-null  float64
 6   hour_of_day     17856 non-null  int64  
 7   day_of_week     17856 non-null  int64  
 8   is_weekend      17856 non-null  bool   
dtypes: bool(1), float64(1), int64(7)
memory usage: 1.7 MB


In [24]:
# Same resample + feature pipeline at 1-minute resolution.
df_1m = resample_traffic(raw_df, window='1min')
df_1m_after = add_features(df_1m)
df_1m_after.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 89280 entries, 1995-07-01 00:00:00-04:00 to 1995-08-31 23:59:00-04:00
Freq: min
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   requests        89280 non-null  int64  
 1   bytes           89280 non-null  int64  
 2   hosts           89280 non-null  int64  
 3   errors          89280 non-null  int64  
 4   is_system_down  89280 non-null  int64  
 5   error_rate      89280 non-null  float64
 6   hour_of_day     89280 non-null  int64  
 7   day_of_week     89280 non-null  int64  
 8   is_weekend      89280 non-null  bool   
dtypes: bool(1), float64(1), int64(7)
memory usage: 6.2 MB


In [26]:
# Same resample + feature pipeline at 15-minute resolution.
df_15m = resample_traffic(raw_df, window='15min')
df_15m_after = add_features(df_15m)
df_15m_after.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5952 entries, 1995-07-01 00:00:00-04:00 to 1995-08-31 23:45:00-04:00
Freq: 15min
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   requests        5952 non-null   int64  
 1   bytes           5952 non-null   int64  
 2   hosts           5952 non-null   int64  
 3   errors          5952 non-null   int64  
 4   is_system_down  5952 non-null   int64  
 5   error_rate      5952 non-null   float64
 6   hour_of_day     5952 non-null   int64  
 7   day_of_week     5952 non-null   int64  
 8   is_weekend      5952 non-null   bool   
dtypes: bool(1), float64(1), int64(7)
memory usage: 424.3 KB


In [28]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout may not have data/cleaned yet).
DATA_CLEANED_DIR.mkdir(parents=True, exist_ok=True)

data_1m_path = DATA_CLEANED_DIR / 'data_1m.csv'
data_5m_path = DATA_CLEANED_DIR / 'data_5m.csv'
data_15m_path = DATA_CLEANED_DIR / 'data_15m.csv'

# index=True keeps the timestamp index as the first CSV column.
df_1m_after.to_csv(data_1m_path, index=True)
df_5m_after.to_csv(data_5m_path, index=True)
df_15m_after.to_csv(data_15m_path, index=True)