# Part 1 – TCP/IP encapsulation + generating traffic

This notebook does **three** things (as required in the project instructions):

1. **Loads** `group06_http_input.csv` (the *input* CSV that contains **application-layer messages**).
2. **Visualizes encapsulation** (Application → TCP → IP → Ethernet) by calculating payload + header sizes.
3. **Generates real TCP traffic** based on the messages, so you can **capture it live in Wireshark** and save a `.pcap`.

> If your file is actually a Wireshark “packet list” export (columns like `No., Time, Source, Destination, Protocol...`), this notebook will **auto-convert** it into the required input format.

In [None]:
from pathlib import Path
import pandas as pd

CSV_NAME = "group06_http_input.csv"

# Locations probed for the input CSV, listed in priority order.
candidates = [
    folder / CSV_NAME
    for folder in (
        Path.cwd(),
        Path.cwd() / "data",
        Path.home() / "Desktop",
        Path.home() / "Downloads",
    )
]

# Keep the first candidate that exists on disk; stop with an error listing
# every probed location when none of them does.
for csv_path in candidates:
    if csv_path.exists():
        break
else:
    csv_path = None
    tried = "\n".join(str(p) for p in candidates)
    raise FileNotFoundError(
        f"Couldn't find {CSV_NAME}. Put it next to the notebook.\n"
        f"Tried:\n" + tried
    )

print("Using CSV:", csv_path)

# Load the file untouched; format detection happens in the next cell.
df_raw = pd.read_csv(csv_path)
print("Loaded shape:", df_raw.shape)
print("Columns:", list(df_raw.columns))
df_raw.head(10)

In [None]:
# Column names the project's INPUT CSV has to provide.
REQUIRED = {
    "msg_id",
    "app_protocol",
    "src_app",
    "dst_app",
    "message",
    "timestamp",
}

def is_project_format(df: pd.DataFrame) -> bool:
    """Return True when ``df`` contains every column named in ``REQUIRED``.

    Extra columns are allowed; only missing ones make the check fail.
    """
    missing = REQUIRED.difference(df.columns)
    return not missing

def is_wireshark_packet_list(df: pd.DataFrame) -> bool:
    """Return True when ``df`` has all columns of a Wireshark packet-list export.

    Extra columns are tolerated; every expected column must be present.
    """
    expected = ("No.", "Time", "Source", "Destination", "Protocol", "Length", "Info")
    present = set(df.columns)
    return all(name in present for name in expected)

# Normalise the loaded CSV into the project INPUT format (`df`).
#   * already in project format  -> used as is
#   * Wireshark packet-list CSV  -> HTTP rows are converted and saved
#   * anything else              -> ValueError
if is_project_format(df_raw):
    df = df_raw.copy()
    print("✅ CSV already in the required project input format.")
elif is_wireshark_packet_list(df_raw):
    print("⚠️ Detected Wireshark packet-list CSV. Converting to the required INPUT format...")

    # Only rows whose Protocol column reads HTTP (case-insensitive) are kept.
    http_rows = df_raw[df_raw["Protocol"].astype(str).str.upper().eq("HTTP")].copy()
    if http_rows.empty:
        raise ValueError(
            "No rows with Protocol == HTTP were found.\n"
            "Capture more HTTP traffic (or generate it from the notebook), then export again."
        )

    http_rows = http_rows.reset_index(drop=True)
    df = pd.DataFrame({
        "msg_id": range(1, len(http_rows)+1),
        "app_protocol": "HTTP",
        "src_app": "client",
        "dst_app": "server",
        "message": http_rows["Info"].astype(str),
        "timestamp": http_rows["Time"].astype(float),
    })

    out_path = csv_path.parent / CSV_NAME
    # Every candidate path ends in CSV_NAME, so out_path is the very file that
    # was just loaded. Keep a byte-for-byte copy of the raw Wireshark export
    # before it gets overwritten, otherwise the original capture data is lost.
    if out_path.exists():
        backup_path = out_path.with_name(out_path.stem + "_wireshark_raw.csv")
        backup_path.write_bytes(out_path.read_bytes())
        print(f"📦 Backed up the original Wireshark export to: {backup_path}")

    df.to_csv(out_path, index=False)
    print(f"✅ Converted and saved INPUT CSV to: {out_path}")
else:
    raise ValueError(
        "CSV format not recognized.\n"
        "Either provide the project INPUT format columns:\n"
        "  msg_id, app_protocol, src_app, dst_app, message, timestamp\n"
        "or export the Wireshark packet list with columns:\n"
        "  No., Time, Source, Destination, Protocol, Length, Info"
    )

df.head(10)

In [None]:
# Header sizes in bytes that are added on top of the payload at each layer.
ETH_HDR = 14
IP_HDR  = 20
TCP_HDR = 20

# Each layer's length is the previous layer's length plus that layer's header.
# `assign` evaluates the keyword arguments in order, so later columns can be
# derived from the ones defined just above them.
df2 = df.assign(
    message=lambda frame: frame["message"].astype(str),
    app_len_bytes=lambda frame: frame["message"].apply(lambda text: len(text.encode("utf-8"))),
    tcp_len_bytes=lambda frame: TCP_HDR + frame["app_len_bytes"],
    ip_len_bytes=lambda frame: IP_HDR + frame["tcp_len_bytes"],
    eth_len_bytes=lambda frame: ETH_HDR + frame["ip_len_bytes"],
)

df2[
    [
        "msg_id",
        "app_protocol",
        "message",
        "timestamp",
        "app_len_bytes",
        "tcp_len_bytes",
        "ip_len_bytes",
        "eth_len_bytes",
    ]
].head(20)

In [None]:
# Walk through the encapsulation of the first message, one layer at a time.
row = df2.iloc[0]

# Each section is (title, [print-argument tuples]); sections are separated
# by one empty line in the output.
layer_report = [
    ("Application Layer (HTTP):", [
        ("  payload:", row["message"]),
        ("  payload length:", row["app_len_bytes"], "bytes"),
    ]),
    ("Transport Layer (TCP):", [
        ("  TCP header:", TCP_HDR, "bytes"),
        ("  TCP segment length:", row["tcp_len_bytes"], "bytes"),
    ]),
    ("Network Layer (IPv4):", [
        ("  IP header:", IP_HDR, "bytes"),
        ("  IP packet length:", row["ip_len_bytes"], "bytes"),
    ]),
    ("Link Layer (Ethernet):", [
        ("  Ethernet header:", ETH_HDR, "bytes"),
        ("  Ethernet frame length:", row["eth_len_bytes"], "bytes"),
    ]),
]

for position, (title, entries) in enumerate(layer_report):
    if position > 0:
        print()
    print(title)
    for entry in entries:
        print(*entry)

## Generate traffic (so Wireshark can capture it)

1. Start capture in **Wireshark** on the correct interface (Wi‑Fi/Ethernet for internet traffic).
2. Filter example: `tcp.port == 80`
3. Run the next cell. It will open TCP connections and send HTTP requests.

If port **80** is blocked for you, set `PORT = 8080` and filter `tcp.port == 8080` (make sure `HOST` points at a server that actually listens on that port).

In [None]:
import socket, time, re

# Server that receives the generated requests; also used as the Host header value.
HOST = "example.com"
# Destination TCP port; keep the Wireshark filter (`tcp.port == ...`) in sync with it.
PORT = 80

def to_http_request(line: str, host: str) -> bytes:
    """Turn a CSV ``message`` value into the bytes of a complete HTTP request.

    Three input shapes are handled, after stripping surrounding whitespace:

    * Multi-line message (contains CRLF): sent as given. A ``Host`` header is
      inserted right after the request line when none is present. The
      terminating blank line is appended only when the message has no blank
      line yet, so a message that already carries a body is left intact.
    * Single request line such as ``GET /index.html HTTP/1.1``: expanded with
      ``Host``, ``User-Agent`` and ``Connection: close`` headers. The version
      defaults to ``HTTP/1.1`` when it is missing.
    * Anything else: replaced by a plain ``GET /`` request.

    Args:
        line: The raw message text from the CSV.
        host: Value for the ``Host`` header.

    Returns:
        The UTF-8 encoded request.
    """
    line = line.strip()

    # If it already looks like multi-line HTTP
    if "\r\n" in line:
        req = line
        if re.search(r"(?im)^Host:\s", req) is None:
            # Insert the Host header directly after the request line.
            req = req.replace("\r\n", f"\r\nHost: {host}\r\n", 1)
        # The input was stripped, so a headers-only message never ends with a
        # blank line and needs one. A message that already contains the
        # header/body separator must not get extra bytes after its body.
        if "\r\n\r\n" not in req:
            req += "\r\n\r\n"
        return req.encode("utf-8", errors="replace")

    parts = line.split()
    if len(parts) >= 2 and parts[0].isalpha():
        method = parts[0].upper()
        path = parts[1]
        version = parts[2] if len(parts) >= 3 else "HTTP/1.1"
        req = (
            f"{method} {path} {version}\r\n"
            f"Host: {host}\r\n"
            f"User-Agent: Jupyter-TCPIP-Project\r\n"
            f"Connection: close\r\n\r\n"
        )
        return req.encode("utf-8", errors="replace")

    # Not recognisable as a request line: fall back to fetching the root page.
    return f"GET / HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n".encode("utf-8")

def send_once(payload: bytes, host: str, port: int, timeout=5):
    """Open one IPv4 TCP connection, send ``payload`` and read one reply chunk.

    The socket is used as a context manager so it is closed on every path,
    including when ``connect`` or ``sendall`` raises.

    Args:
        payload: Bytes to send over the connection.
        host: Host name or address to connect to.
        port: Destination TCP port.
        timeout: Socket timeout in seconds for connect, send and receive.

    Raises:
        OSError: If connecting or sending fails (timeouts included).
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(timeout)
        s.connect((host, port))
        s.sendall(payload)
        try:
            # The reply content is not needed; reading it just lets the
            # response packets show up in the capture.
            s.recv(4096)
        except OSError:
            # Best effort: a timeout or reset while waiting for the reply
            # must not count as a failed send.
            pass

print(f"Sending {len(df2)} messages to {HOST}:{PORT} ...")

# One TCP connection per CSV message; a failed send is reported and the loop
# moves on to the next message.
for msg_id, raw_message in zip(df2["msg_id"], df2["message"]):
    msg = str(raw_message)
    preview = msg[:60]
    payload = to_http_request(msg, HOST)
    try:
        send_once(payload, HOST, PORT)
        print(f"  [{int(msg_id)}] sent {len(payload)} bytes: {preview}")
    except Exception as e:
        print(f"  [{int(msg_id)}] FAILED: {e} | message: {preview}")
    # Short pause between connections.
    time.sleep(0.5)

print("Done. Stop capture in Wireshark and save as .pcap")