In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Pattern for one line in the nginx/Apache "combined" access-log layout.
# Compiled once at import time instead of on every call.
#   ip          client address: dotted IPv4 or colon-separated IPv6
#   \S+ \S+     ident and remote-user fields; matched but not captured, so
#               requests from authenticated users are no longer discarded
#   size        must be numeric; a "-" size does not match
_LOG_PATTERN = re.compile(
    r'(?P<ip>[0-9A-Fa-f:.]+) \S+ \S+ '
    r'\[(?P<time>.*?)\] "(?P<request>.*?)" '
    r'(?P<status>\d+) (?P<size>\d+) '
    r'"(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
)


# Function to parse nginx log lines
def parse_log_line(line):
    """Parse one access-log line into its named fields.

    Args:
        line: A single raw line from the log file; a trailing newline or
            extra trailing fields are ignored because the pattern is only
            anchored at the start of the line.

    Returns:
        A dict with the string-valued keys ``ip``, ``time``, ``request``,
        ``status``, ``size``, ``referrer`` and ``user_agent``, or ``None``
        when the line does not match the expected layout.
    """
    match = _LOG_PATTERN.match(line)
    if match is None:
        return None
    return match.groupdict()

# Read and parse the log file
log_file_path = 'samples/access_wordpress.log'
# Decode explicitly rather than relying on the platform default encoding, and
# replace undecodable bytes so one bad byte cannot abort the whole analysis.
with open(log_file_path, 'r', encoding='utf-8', errors='replace') as file:
    log_lines = file.readlines()

# Parse every line exactly once, then drop the lines that did not match.
parsed_lines = (parse_log_line(line) for line in log_lines)
log_entries = [entry for entry in parsed_lines if entry is not None]
if not log_entries:
    # Without this check the column lookups below fail with an opaque KeyError.
    raise ValueError(
        'No parsable log entries found in {!r}'.format(log_file_path)
    )
df = pd.DataFrame(log_entries)

# Convert time to datetime (e.g. "10/Oct/2023:13:55:36 +0000")
df['time'] = pd.to_datetime(df['time'], format='%d/%b/%Y:%H:%M:%S %z')

# Convert status and size to numeric
df['status'] = pd.to_numeric(df['status'])
df['size'] = pd.to_numeric(df['size'])

# Plotting: four summary charts laid out as a 2x2 grid on a single figure.
# Each chart is drawn on an explicit Axes rather than the implicit
# "current axes", so reordering the sections cannot redirect a plot.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Number of requests per status code
status_ax = axes[0, 0]
sns.countplot(x='status', data=df, ax=status_ax)
status_ax.set_title('Number of Requests per Status Code')

# Plot 2: Number of requests over time, in hourly buckets.
# Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in pandas 2.2+.
timeline_ax = axes[0, 1]
df.set_index('time').resample('h').size().plot(ax=timeline_ax)
timeline_ax.set_title('Number of Requests Over Time')

# Plot 3: Top 10 IPs by number of requests
ip_ax = axes[1, 0]
df['ip'].value_counts().head(10).plot(kind='bar', ax=ip_ax)
ip_ax.set_title('Top 10 IPs by Number of Requests')

# Plot 4: Top 10 requested URLs
# The request field holds "<method> <url> <protocol>"; count only the URL
# token so the same URL is not split by method or protocol version. Requests
# without a second token become NaN and are skipped by value_counts().
url_ax = axes[1, 1]
requested_urls = df['request'].str.split().str[1]
requested_urls.value_counts().head(10).plot(kind='bar', ax=url_ax)
url_ax.set_title('Top 10 Requested URLs')

fig.tight_layout()
plt.show()