In [1]:
import pandas as pd
import re
from datetime import datetime

In [3]:
# Path of the raw access-log file, relative to the notebook's working directory.
log_file_path = "../dados/web-server-access-logs_10k.log"

# Collect every line of the file with surrounding whitespace (including the
# trailing newline) removed.
log_lines = []
with open(log_file_path, 'r') as file:
    log_lines.extend(raw_line.strip() for raw_line in file)

In [4]:
# One row per raw line, held in a single 'Log Line' column.
df = pd.DataFrame({'Log Line': log_lines})

In [5]:
# Cut every raw line at its first comma only (n=1): the left part becomes
# 'ID', everything after it becomes 'Log Entry'.
id_and_entry = df['Log Line'].str.split(',', n=1, expand=True)
df[['ID', 'Log Entry']] = id_and_entry
df = df.drop(columns=['Log Line'])

# Remove any leading/trailing double-quote characters from both columns.
# NOTE(review): strip('"') removes *all* quote characters at either end, not
# just one wrapping pair - confirm that is intended for the entry text.
for column_name in ('Log Entry', 'ID'):
    df[column_name] = df[column_name].str.strip('"')

In [6]:
# Regular expression for one access-log entry, assembled from adjacent raw
# string pieces. Capture groups, in order:
log_pattern = (
    r'(\S+) - - '                     # 1: first whitespace-free token (stored as IP address)
    r'\[([\w:/]+\s[+\-]\d{4})\] '     # 2: bracketed timestamp with a +/-HHMM offset
    r'"(\S+) (\S+) (\S+)" '           # 3-5: quoted request: method, URL, HTTP version
    r'(\d{3}) (\d+) '                 # 6-7: three-digit status, bytes transferred
    r'"([^"]*)" "([^"]*)"'            # 8-9: quoted referrer, quoted user agent
)

In [7]:
# Labels of the fields returned by parse_log_entry, in output order.
_LOG_FIELDS = ['IP Address', 'Date/Time', 'Method', 'Requested URL', 'HTTP Version', 'HTTP Status', 'Bytes Transferred', 'Referrer', 'User Agent']


def _empty_log_row():
    """Return the all-None row used for entries that cannot be parsed."""
    return pd.Series([None] * len(_LOG_FIELDS), index=_LOG_FIELDS)


def parse_log_entry(log_entry) -> pd.Series:
    """Parse one access-log entry into a Series of nine named fields.

    Doubled double-quote characters in the entry are collapsed to a single
    quote, then the entry is matched from its start against the module-level
    ``log_pattern``.

    Parameters
    ----------
    log_entry : str or any
        Text of one log entry. Non-string values (for example None or NaN,
        which an upstream split can produce for lines lacking the separator)
        are treated as unparseable instead of raising ``TypeError``.

    Returns
    -------
    pandas.Series
        Indexed by 'IP Address', 'Date/Time', 'Method', 'Requested URL',
        'HTTP Version', 'HTTP Status', 'Bytes Transferred', 'Referrer' and
        'User Agent'. 'Date/Time' is a timezone-aware ``datetime``; all other
        values are strings. Every value is None when the entry is not a
        string, does not match ``log_pattern``, or carries a timestamp that
        ``strptime`` rejects.
    """
    # Guard first: the string operations below would raise on None/NaN and
    # abort an entire DataFrame.apply over the column.
    if not isinstance(log_entry, str):
        return _empty_log_row()

    # Literal replacement, so str.replace is sufficient (no regex needed).
    match = re.match(log_pattern, log_entry.replace('""', '"'))
    if match is None:
        return _empty_log_row()

    (ip_address, date_time_str, method, requested_url, http_version,
     http_status, bytes_transferred, referrer, user_agent) = match.group(*range(1, 10))

    try:
        date_time = datetime.strptime(date_time_str, '%d/%b/%Y:%H:%M:%S %z')
    except ValueError:
        # The regex only checks the timestamp's rough shape; an impossible
        # date (e.g. day 32) is handled like any other unparseable entry.
        return _empty_log_row()

    return pd.Series([
        ip_address.strip('"'),
        date_time,
        method,
        requested_url,
        http_version,
        http_status,
        bytes_transferred,
        referrer.strip('"'),
        user_agent.strip('"'),
    ], index=_LOG_FIELDS)


In [8]:
# Parse every entry; the Series returned per row are expanded by `apply`
# into columns, which are stored in `df` under the labels below.
parsed_column_names = [
    'IP Address', 'Date/Time', 'Method',
    'Requested URL', 'HTTP Version', 'HTTP Status',
    'Bytes Transferred', 'Referrer', 'User Agent',
]
df[parsed_column_names] = df['Log Entry'].apply(parse_log_entry)

In [9]:
# The raw entry text is no longer needed once its fields have been extracted.
df = df.drop(columns=['Log Entry'])

In [10]:
# Persist the parsed table as CSV (to_csv defaults apply, so the DataFrame
# index is written as the first column).
csv_output_path = '../dados/logs.csv'
df.to_csv(csv_output_path)