# LAB 8 Simulating MapReduce (Log Analysis - Count Requests per Status Code)

## Creating a simple dataset.

In [2]:
%%writefile weblogs.txt
# Date, Time, IP, Method, URL, Status, ResponseSize
2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024
2025-10-10,12:01:33,192.168.1.3,GET,/products.html,200,850
2025-10-10,12:01:35,192.168.1.4,GET,/contact.html,404,512
2025-10-10,12:01:38,192.168.1.5,POST,/checkout,500,128
2025-10-10,12:01:41,192.168.1.6,GET,/index.html,200,1024
2025-10-10,12:01:45,192.168.1.7,GET,/images/logo.png,200,256
2025-10-10,12:01:48,192.168.1.8,GET,/about.html,404,512
2025-10-10,12:01:53,192.168.1.9,POST,/login,403,64
2025-10-10,12:02:01,192.168.1.10,GET,/index.html,200,1024
2025-10-10,12:02:07,192.168.1.11,POST,/checkout,500,128
2025-10-10,12:02:12,192.168.1.12,GET,/contact.html,404,512
2025-10-10,12:02:15,192.168.1.13,GET,/index.html,200,1024
2025-10-10,12:02:21,192.168.1.14,GET,/products.html,200,850
2025-10-10,12:02:23,192.168.1.15,GET,/about.html,404,512
2025-10-10,12:02:29,192.168.1.16,POST,/checkout,500,128
2025-10-10,12:02:31,192.168.1.17,GET,/images/logo.png,200,256
2025-10-10,12:02:34,192.168.1.18,GET,/contact.html,404,512
2025-10-10,12:02:38,192.168.1.19,POST,/login,403,64
2025-10-10,12:02:41,192.168.1.20,GET,/index.html,200,1024
2025-10-10,12:02:47,192.168.1.21,GET,/products.html,200,850


Writing weblogs.txt


## Implement the Mapper

In [3]:
# Mapper: emit one (status_code, 1) pair per valid log record
def mapper(line):
    """Map a single raw log line to a list of ``(status, 1)`` pairs.

    The line is stripped and split on commas. It counts as a record only
    when it yields exactly seven columns and the first column does not
    start with ``#`` (which is how the header/comment line is skipped).

    Args:
        line: One line of text from the log file, trailing newline allowed.

    Returns:
        ``[(status, 1)]`` where ``status`` is the sixth column (index 5) as a
        whitespace-stripped string, or ``[]`` when the line is not a record.
    """
    columns = line.strip().split(",")
    is_record = len(columns) == 7 and not columns[0].startswith('#')
    if not is_record:
        return []
    return [(columns[5].strip(), 1)]

## Shuffle Phase

In [4]:
from collections import defaultdict

def shuffle(mapped_data):
    """Group mapper output by key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of its values,
        in the order the pairs were encountered.
    """
    buckets = defaultdict(list)
    for pair in mapped_data:
        k, v = pair
        buckets[k].append(v)
    return buckets

## Reducer Phase

In [5]:
from collections import defaultdict

def reducer(mapped_data):
    """Sum the values of ``(key, value)`` pairs per key.

    The pairs are consumed directly; grouping and summation happen in the
    same pass.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs with numeric values.

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values.
    """
    totals = defaultdict(int)
    for key, amount in mapped_data:
        totals[key] = totals[key] + amount
    return totals



## Combine the Phases

In [6]:
# Map phase: pass every line of the log file through the currently defined
# `mapper` and flatten the emitted pairs into a single list.
with open("weblogs.txt", "r") as f:
    mapped = [pair for line in f for pair in mapper(line)]

# Reduce phase: collapse the pairs into one total per key.
reduced = reducer(mapped)

# Report the totals in ascending key order.
for code, count in sorted(reduced.items()):
    print(f"HTTP {code}: {count} requests")

HTTP 200: 10 requests
HTTP 403: 2 requests
HTTP 404: 5 requests
HTTP 500: 3 requests


## Bonus Exploration
### 1- Count requests per URL instead of per status code

In [7]:
def mapper(line):
    """Map a single raw log line to a list of ``(url, 1)`` pairs.

    A line is treated as a record only when, after stripping, it splits into
    exactly seven comma-separated columns and the first column does not
    start with ``#``.

    Args:
        line: One line of text from the log file.

    Returns:
        ``[(url, 1)]`` where ``url`` is the fifth column (index 4), stripped
        of surrounding whitespace, or ``[]`` for non-record lines.
    """
    parts = line.strip().split(',')
    if len(parts) == 7 and not parts[0].startswith('#'):
        # Key on the URL column rather than the status column.
        return [(parts[4].strip(), 1)]
    return []

In [8]:
# Map phase: collect every pair the currently defined `mapper` emits for the log file.
with open("weblogs.txt", "r") as f:
    mapped = [pair for line in f for pair in mapper(line)]

# Reduce phase: one total per key.
reduced = reducer(mapped)

# Print the totals sorted by key.
for url, count in sorted(reduced.items()):
    print(f"{url}: {count} requests")

/about.html: 2 requests
/checkout: 3 requests
/contact.html: 3 requests
/images/logo.png: 2 requests
/index.html: 5 requests
/login: 2 requests
/products.html: 3 requests


### 2- Compute total response size per status code

In [9]:
def mapper(line):
    """Map a single raw log line to a list of ``(status, response_size)`` pairs.

    A line is treated as a record only when, after stripping, it splits into
    exactly seven comma-separated columns and the first column does not
    start with ``#``.

    Args:
        line: One line of text from the log file.

    Returns:
        ``[(status, size)]`` where ``status`` is the stripped sixth column
        (index 5) as a string and ``size`` is the seventh column (index 6)
        parsed with ``int``. Returns ``[]`` for non-record lines and for
        records whose size column cannot be parsed as an integer.
    """
    parts = line.strip().split(',')
    if len(parts) != 7 or parts[0].startswith('#'):
        return []

    status_text, size_text = parts[5], parts[6]
    try:
        size = int(size_text.strip())
    except ValueError:
        # Unparseable size: drop the record instead of failing the whole run.
        return []

    return [(status_text.strip(), size)]

In [10]:
def reducer(mapped_data):
    """Sum the values of ``(key, value)`` pairs per key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs with numeric values.

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values.
    """
    # Imported locally so this definition does not depend on an earlier cell.
    from collections import defaultdict

    totals = defaultdict(int)
    for key, amount in mapped_data:
        totals[key] = totals[key] + amount
    return totals

In [11]:
# Map phase: collect every pair the currently defined `mapper` emits for the log file.
with open("weblogs.txt", "r") as f:
    mapped = [pair for line in f for pair in mapper(line)]

# Reduce phase: add up the values that share a key.
reduced = reducer(mapped)

# Print one line per key, in ascending key order.
for status, total_size in sorted(reduced.items()):
    print(f"HTTP {status}: {total_size} total bytes")

HTTP 200: 8182 total bytes
HTTP 403: 128 total bytes
HTTP 404: 2560 total bytes
HTTP 500: 384 total bytes


### 3- Filter out successful responses (status 200) → only errors

In [12]:
def mapper(line):
    """Map a single raw log line to ``[(status, 1)]`` for error responses only.

    A line is treated as a record only when, after stripping, it splits into
    exactly seven comma-separated columns and the first column does not
    start with ``#``.

    Only HTTP client errors (4xx) and server errors (5xx) are emitted.
    Comparing against the literal ``"200"`` alone would let other
    non-error codes (e.g. 201, 204, 301, 304) and malformed status values
    through, and they would then be reported as errors.

    Args:
        line: One line of text from the log file.

    Returns:
        ``[(status, 1)]`` where ``status`` is the stripped sixth column
        (index 5) as a string, or ``[]`` when the line is not a record, the
        status is not an integer, or the status is outside 400-599.
    """
    fields = line.strip().split(',')
    if len(fields) != 7 or fields[0].startswith('#'):
        return []

    status = fields[5].strip()
    try:
        code = int(status)
    except ValueError:
        return []   # skip records whose status is not a number

    # Filter: keep only 4xx/5xx; everything else is not an error.
    if not 400 <= code <= 599:
        return []

    return [(status, 1)]

In [13]:
# Map phase: collect every pair the currently defined `mapper` emits for the log file.
with open("weblogs.txt", "r") as f:
    mapped = [pair for line in f for pair in mapper(line)]

# Reduce phase: one total per key.
reduced = reducer(mapped)

# Print the totals sorted by key.
for status, count in sorted(reduced.items()):
    print(f"HTTP {status}: {count} error requests")


HTTP 403: 2 error requests
HTTP 404: 5 error requests
HTTP 500: 3 error requests
