In [6]:
import csv
import time
import requests
import re
import json
import random

In [10]:
class LogEntry:
    def __init__(self, timestamp, interface_in, mac, src_ip, dst_ip, length, tos, prec, ttl, id, proto, src_port, dst_port, window, res, urgp):
        self.timestamp = timestamp
        self.interface_in = interface_in
        self.mac = mac
        self.src_ip = src_ip
        self.dst_ip = dst_ip
        self.length = length
        self.tos = tos
        self.prec = prec
        self.ttl = ttl
        self.id = id
        self.proto = proto
        self.src_port = src_port
        self.dst_port = dst_port
        self.window = window
        self.res = res
        self.urgp = urgp
        self.continent = "N/A"
        self.country = "N/A"
        self.asn = "N/A"
        self.as_name = "N/A"
        self.as_domain = "N/A"
    def __str__(self):
        return f"Log: {self.src_ip} Country: {self.country} Domain: {self.as_domain}"

In [None]:
logList = []

In [15]:
with open("./logs/ufw.log") as file:
    for line in file:
        # Separate log line into components
        components = line.strip().split(" ")
        if len(components) < 17:
            continue
        log_entry = LogEntry(
            timestamp=components[0],
            # For rest of components, attempt to find the value after the '=' sign using regex in the original line, if not found, set to "N/A"
            interface_in=re.search(r"IN=(\S+)", line).group(1) if re.search(r"IN=(\S+)", line) else "N/A",
            mac=re.search(r"MAC=(\S+)", line).group(1) if re.search(r"MAC=(\S+)", line) else "N/A",
            src_ip=re.search(r"SRC=(\S+)", line).group(1) if re.search(r"SRC=(\S+)", line) else "N/A",
            dst_ip=re.search(r"DST=(\S+)", line).group(1) if re.search(r"DST=(\S+)", line) else "N/A",
            length=re.search(r"LEN=(\S+)", line).group(1) if re.search(r"LEN=(\S+)", line) else "N/A",
            tos=re.search(r"TOS=(\S+)", line).group(1) if re.search(r"TOS=(\S+)", line) else "N/A",
            prec=re.search(r"PREC=(\S+)", line).group(1) if re.search(r"PREC=(\S+)", line) else "N/A",
            ttl=re.search(r"TTL=(\S+)", line).group(1) if re.search(r"TTL=(\S+)", line) else "N/A",
            id=re.search(r"ID=(\S+)", line).group(1) if re.search(r"ID=(\S+)", line) else "N/A",
            proto=re.search(r"PROTO=(\S+)", line).group(1) if re.search(r"PROTO=(\S+)", line) else "N/A",
            src_port=re.search(r"SPT=(\S+)", line).group(1) if re.search(r"SPT=(\S+)", line) else "N/A",
            dst_port=re.search(r"DPT=(\S+)", line).group(1) if re.search(r"DPT=(\S+)", line) else "N/A",
            window=re.search(r"WINDOW=(\S+)", line).group(1) if re.search(r"WINDOW=(\S+)", line) else "N/A",
            res=re.search(r"RES=(\S+)", line).group(1) if re.search(r"RES=(\S+)", line) else "N/A",
            urgp=re.search(r"URGP=(\S+)", line).group(1) if re.search(r"URGP=(\S+)", line) else "N/A"
        )
        logList.append(log_entry)
    print(f"Parsed {len(logList)} log entries.")

Parsed 14077 log entries.


In [4]:
unique_log_entries = []

In [5]:
for log in logList:
    if log.src_ip not in [entry.src_ip for entry in unique_log_entries]:
        unique_log_entries.append(log)
print(f"Reduced to {len(unique_log_entries)} unique log entries based on source IP.")

Reduced to 2855 unique log entries based on source IP.


# For each log entry, add datapoints provided by IPinfo.io using their API, and my Token f16eb2292770cb
print("Enriching log entries with IPinfo.io data...")
for log in unique_log_entries:
    # Make a request to the IPinfo.io API
    response = requests.get(f"https://api.ipinfo.io/lite/{log.src_ip}?token=f16eb2292770cb")
    print(f"Requesting data for IP {log.src_ip}, status code: {response.status_code}")
    if response.status_code == 200:
        data = response.json()
        print(data)
        log.continent = data.get("continent", "N/A")
        log.country = data.get("country", "N/A")
        log.asn = data.get("asn", "N/A")
        log.as_name = data.get("as_name", "N/A")
        log.as_domain = data.get("as_domain", "N/A")
        print(f"Enriched log for IP {log.src_ip}: {log.continent}, {log.country}, {log.asn}, {log.as_name}, {log.as_domain}")
    else:
        print(f"Failed to retrieve data for IP {log.src_ip}, status code: {response.status_code}")
        log.continent = "N/A"
        log.country = "N/A"
        log.asn = "N/A"
        log.as_name = "N/A"
        log.as_domain = "N/A"
    # To avoid hitting the rate limit, add a delay of 1 second between requests
    time.sleep(random.uniform(1, 2))

In [13]:
for log in unique_log_entries:
    print(f"IP: {log.src_ip}, Country: {log.country}")

IP: 64.62.197.234, Country: United States
IP: 43.130.72.177, Country: United States
IP: 45.79.109.193, Country: United States
IP: 196.251.87.35, Country: The Netherlands
IP: 196.251.71.143, Country: The Netherlands
IP: 196.251.72.177, Country: The Netherlands
IP: 18.162.120.131, Country: Hong Kong
IP: 54.80.215.124, Country: United States
IP: 135.148.168.151, Country: United States
IP: 118.194.231.231, Country: Japan
IP: 3.68.53.133, Country: Germany
IP: 63.176.22.69, Country: Germany
IP: 45.33.78.24, Country: United States
IP: 18.175.136.85, Country: United Kingdom
IP: 54.78.131.68, Country: Ireland
IP: 3.115.23.90, Country: Japan
IP: 88.214.25.123, Country: Germany
IP: 54.234.104.125, Country: United States
IP: 216.218.206.124, Country: United States
IP: 52.213.121.14, Country: Ireland
IP: 196.251.83.215, Country: The Netherlands
IP: 3.70.55.65, Country: Germany
IP: 45.142.193.141, Country: The Netherlands
IP: 196.251.67.42, Country: The Netherlands
IP: 173.255.223.89, Country: Unite

In [14]:
# Save the enriched data into a file just as a safeguard
with open("enriched_logs.json", mode="w") as jsonfile:
    json.dump([log.__dict__ for log in unique_log_entries], jsonfile)

In [16]:
# Merge the enriched data back into the original logList, so that each log entry has the enriched data corresponding to its source IP
for log in logList:
    enriched_log = next((entry for entry in unique_log_entries if entry.src_ip == log.src_ip), None)
    if enriched_log:
        log.continent = enriched_log.continent
        log.country = enriched_log.country
        log.asn = enriched_log.asn
        log.as_name = enriched_log.as_name
        log.as_domain = enriched_log.as_domain

In [20]:
# Export the enriched log entries to a CSV file
with open("enriched_logs.csv", mode="w", newline="") as csvfile:
    fieldnames = ["timestamp", "interface_in", "mac", "src_ip", "dst_ip", "length", "tos", "prec", "ttl", "id", "proto", "src_port", "dst_port", "window", "res", "urgp", "continent", "country", "asn", "as_name", "as_domain"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for log in logList:
        writer.writerow({
            "timestamp": log.timestamp,
            "interface_in": log.interface_in,
            "mac": log.mac,
            "src_ip": log.src_ip,
            "dst_ip": log.dst_ip,
            "length": log.length,
            "tos": log.tos,
            "prec": log.prec,
            "ttl": log.ttl,
            "id": log.id,
            "proto": log.proto,
            "src_port": log.src_port,
            "dst_port": log.dst_port,
            "window": log.window,
            "res": log.res,
            "urgp": log.urgp,
            "continent": log.continent,
            "country": log.country,
            "asn": log.asn,
            "as_name": log.as_name,
            "as_domain": log.as_domain
        })
print("Enriched logs have been exported to enriched_logs.csv")

ValueError: dict contains fields not in fieldnames: 'asn', 'as_domain', 'as_name', 'continent'