#### Setup and ingest data

In [1]:
import os
from pathlib import Path

# Define directories
base_dir = Path('./data_lake')
raw_dir = base_dir / 'raw'
processed_dir = base_dir / 'processed'
cleaned_dir = base_dir / 'cleaned'

# Create directories
for dir in [raw_dir, processed_dir, cleaned_dir]:
    dir.mkdir(parents=True, exist_ok=True)

# Simulate ingesting data
sample_logs = ["2024-08-24 12:00:00 INFO User logged in",
               "2024-08-24 12:01:00 ERROR Failed login attempt",
               "2024-08-24 12:02:00 INFO User logged out"]

with open(raw_dir / 'logs.txt', 'w') as file:
    file.write("\n".join(sample_logs))


#### Process data (ETL)

In [2]:
import pandas as pd

# Load raw data
with open(raw_dir / 'logs.txt', 'r') as file:
    logs = file.readlines()

# Process data: Filter only ERROR logs
error_logs = [log for log in logs if 'ERROR' in log]

# Save processed data
with open(processed_dir / 'error_logs.txt', 'w') as file:
    file.write("".join(error_logs))


#### Clean Data

In [3]:
# Further clean: Extract timestamp and log level
cleaned_logs = []

for log in error_logs:
    parts = log.split(" ")
    timestamp = " ".join(parts[:2])
    level = parts[2]
    message = " ".join(parts[3:])
    cleaned_logs.append({"timestamp": timestamp, "level": level, "message": message})

# Convert to DataFrame for better manipulation
df = pd.DataFrame(cleaned_logs)
df.to_csv(cleaned_dir / 'cleaned_logs.csv', index=False)


#### Query the data

In [4]:
# Query the cleaned data
df_cleaned = pd.read_csv(cleaned_dir / 'cleaned_logs.csv')

# Example query: Count number of errors
error_count = df_cleaned.shape[0]
print(f"Total number of errors: {error_count}")

# Example query: Errors in the last minute
recent_errors = df_cleaned[df_cleaned['timestamp'] > '2024-08-24 12:00:00']
print(recent_errors)


Total number of errors: 1
             timestamp  level                 message
0  2024-08-24 12:01:00  ERROR  Failed login attempt\n
