In [None]:
from syslogmp import parse


def load_syslog(log_message: "bytes | str") -> dict:
    """Parse one raw syslog message with syslogmp and return its key fields.

    Args:
        log_message: The raw syslog message. ``syslogmp.parse`` operates on
            bytes (the example usage in this notebook passes bytes); a ``str``
            is accepted as a convenience and is encoded as UTF-8 first.

    Returns:
        A dict with the ``timestamp``, ``hostname`` and ``message`` attributes
        of the parsed message object, unchanged.

    Raises:
        Whatever ``syslogmp.parse`` raises for input it cannot parse; errors
        are deliberately not swallowed here.
    """
    if isinstance(log_message, str):
        # The signature historically advertised ``str``; honour it by encoding
        # instead of letting the bytes-oriented parser fail.
        log_message = log_message.encode("utf-8")

    message = parse(log_message)

    return {
        "timestamp": message.timestamp,
        "hostname": message.hostname,
        "message": message.message,
    }


# Example usage: encode a sample syslog line as UTF-8 bytes, parse it, and
# print the extracted fields.
log_message = "<133>Feb 25 14:09:07 webserver syslogd: restart".encode("utf-8")
log_data = load_syslog(log_message)
print(log_data)

In [None]:
import re


def process_syslog(log_message: str) -> "dict | None":
    """Split a syslog line into its fields with a regular expression.

    The expected layout is::

        <Mon> <day> <HH:MM:SS> <hostname> <application>: [<pid>] <message>

    The day of month may be padded with extra whitespace (``"Feb  5"``), which
    is how single-digit days are conventionally written in BSD-style syslog
    timestamps.

    Args:
        log_message: One log line as text.

    Returns:
        A dict with ``timestamp``, ``hostname``, ``application``, ``pid``
        (converted to ``int``) and ``message``, or ``None`` when the line does
        not match the expected layout.
    """
    log_regex = (
        r"^(\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2})"  # 1. Timestamp (day may be space-padded)
        r"\s(\S+)"  # 2. Hostname
        r"\s(\S+):"  # 3. Application
        r"\s\[(\d+)\]"  # 4. PID
        r"\s(.*)$"  # 5. Message
    )

    match = re.match(log_regex, log_message)
    if match is None:
        return None

    timestamp, hostname, application, pid, message = match.groups()

    return {
        "timestamp": timestamp,
        "hostname": hostname,
        "application": application,
        "pid": int(pid),
        "message": message,
    }


# Example usage: the sample line follows the
# "<timestamp> <hostname> <application>: [<pid>] <message>" layout that the
# regex in process_syslog matches, so a dict (not None) is printed.
log_message = "Feb 28 14:21:30 example-hostname kernel: [12345] This is a log message"
log_data = process_syslog(log_message)
print(log_data)

In [None]:
import tldextract
import pandas as pd

# Load the list of known-good domains; the CSV has no header row, so the
# column name is supplied explicitly.
legit_domains = pd.read_csv("../data/blog_show_data/top-1m.csv", names=["domain"])

# NOTE(review): the column is called "tld" but it stores tldextract's
# ``.domain`` part, not the suffix -- confirm which one downstream code expects.
legit_domains["tld"] = list(
    map(lambda name: tldextract.extract(name).domain, legit_domains["domain"])
)
legit_domains

In [None]:
import urlextract

# Create the URLExtract instance used for the extraction below
url_extractor = urlextract.URLExtract()

# Sample text to scan: it contains a bare hostname (no scheme) and an IP address
text_with_urls = "www.googleadservices.com: type A, class IN, addr 142.251.32.194"

# Collect whatever URLs urlextract detects in the sample text
extracted_urls = url_extractor.find_urls(text_with_urls)

print("Extracted URLs: ", extracted_urls)

In [None]:
import pandas as pd

# Load mirai.csv from the blog's data directory into a DataFrame.
mirai_data = pd.read_csv("../data/blog_show_data/mirai.csv")

In [None]:
from scapy.all import rdpcap

In [None]:
# Read the capture file with scapy's rdpcap.
pcap = rdpcap("../data/blog_show_data/2023-01-23-Google-ad-to-possible-TA505-activity.pcap")
# Show what kind of object rdpcap returned (displayed as the cell output).
type(pcap)

In [None]:
from scapy.all import *
import urllib.parse


def parse_http_payload(packet, verbose=True):
    """Extract HTTP details from the TCP payload of one scapy packet.

    ``TCP``, ``Raw`` and ``IP`` are the scapy layer classes brought in by the
    notebook's ``from scapy.all import *``.

    Args:
        packet: A packet object supporting ``haslayer()``, layer indexing
            (``packet[IP]``) and a ``time`` attribute, e.g. an item of the
            list returned by ``rdpcap``.
        verbose: When true (the default), print the decoded payload and the
            parsed request line while processing.

    Returns:
        ``None`` when the packet has no IPv4/TCP/Raw layers, when the payload
        is not valid UTF-8, or when it does not contain ``"HTTP/"``.

        For a payload whose first line is a request line
        (``<method> <target> HTTP/<version>``) a dict with ``timestamp``,
        ``source_ip``, ``dest_ip``, ``source_port``, ``dest_port``, ``method``,
        ``path``, ``query_string``, ``protocol``, ``headers`` and ``body``.

        For anything else that mentions ``"HTTP/"`` (status lines such as
        ``HTTP/1.1 302 Found``, or a non-standard first line) a reduced dict
        with only the addressing fields and ``body``.

        ``body`` is the text after the first blank line, or after
        ``<!DOCTYPE html>`` if there is no blank line, or ``None`` if neither
        separator is present.
    """
    # The record below needs IPv4 addresses and TCP ports, and there must be
    # application data to inspect.
    if not (packet.haslayer(TCP) and packet.haslayer(Raw) and packet.haslayer(IP)):
        return None

    try:
        payload = str(packet[Raw].load, "utf-8")
    except (UnicodeDecodeError, TypeError):
        # Binary payloads (e.g. encrypted or compressed data) are not text.
        return None

    if "HTTP/" not in payload:
        return None
    if verbose:
        print(payload)

    # Split once at the first blank line so a body that itself contains
    # "\r\n\r\n" is kept whole and body lines are never read as headers.
    head, blank_line, body = payload.partition("\r\n\r\n")
    if not blank_line:
        head, doctype, body = payload.partition("<!DOCTYPE html>")
        if not doctype:
            body = None

    record = {
        "timestamp": str(packet.time),
        "source_ip": packet[IP].src,
        "dest_ip": packet[IP].dst,
        "source_port": packet[TCP].sport,
        "dest_port": packet[TCP].dport,
    }

    head_lines = head.split("\r\n")
    start_line = head_lines[0].split(" ")
    # A request line is exactly "<method> <target> HTTP/<version>". Status
    # lines ("HTTP/1.1 302 Found") and non-standard first lines get the
    # reduced record instead of being misread as a request.
    if len(start_line) != 3 or not start_line[2].startswith("HTTP/"):
        record["body"] = body
        return record

    method, path, protocol = start_line
    if verbose:
        print(method, path, protocol)

    try:
        parsed_path = urllib.parse.urlparse(path)
    except ValueError:
        # urlparse rejects some malformed targets (e.g. a broken IPv6 literal).
        record["body"] = body
        return record

    headers = {}
    for line in head_lines[1:]:
        name, colon, value = line.partition(":")
        if colon:  # skip empty lines and lines that are not "Name: value"
            headers[name] = value.strip()

    record.update(
        method=method,
        path=parsed_path.path,
        query_string=parsed_path.query,
        protocol=protocol,
        headers=headers,
        body=body,
    )
    return record


# Load a malicious packet capture (downloaded from https://malware-traffic-analysis.net/).
pcap = rdpcap("../data/blog_show_data/2023-01-23-Google-ad-to-possible-TA505-activity.pcap")

# Print the parsed record for every packet that parse_http_payload returned a result for.
for packet in pcap:
    parsed_payload = parse_http_payload(packet)
    if parsed_payload is None:
        continue
    print(parsed_payload)