In [1]:
from syslogmp import parse

def load_syslog(log_message: bytes) -> dict:
    """Parse one syslog datagram with ``syslogmp`` and return selected fields.

    Args:
        log_message: The raw syslog message. ``bytes`` is what the parser is
            fed; a ``str`` is also accepted and is UTF-8 encoded first, so
            callers that relied on the previously advertised ``str``
            signature keep working.

    Returns:
        A dict with the keys ``'timestamp'``, ``'hostname'`` and
        ``'message'`` copied from the parsed message object. In the sample
        run shown in this notebook, ``'timestamp'`` is a ``datetime`` and
        ``'message'`` is still ``bytes``.

    Raises:
        Whatever ``syslogmp.parse`` raises for malformed input; nothing is
        caught here.
    """
    if isinstance(log_message, str):
        # The parser is given bytes in the example usage; encode text input
        # so it is handled the same way.
        log_message = log_message.encode('utf-8')

    message = parse(log_message)

    return {
        'timestamp': message.timestamp,
        'hostname': message.hostname,
        'message': message.message,
    }

# Example usage: the sample line is encoded to bytes before it is parsed.
log_message = '<133>Feb 25 14:09:07 webserver syslogd: restart'.encode('utf-8')
log_data = load_syslog(log_message)
print(log_data)


{'timestamp': datetime.datetime(2023, 2, 25, 14, 9, 7), 'hostname': 'webserver', 'message': b'syslogd: restart'}


In [2]:
import re

def process_syslog(log_message):
    """Split one syslog-style text line into its fields with a regex.

    The expected layout is::

        <Mon> <day> <HH:MM:SS> <hostname> <application>: [<pid>] <message>

    Args:
        log_message: The log line as a ``str``.

    Returns:
        ``None`` when the line does not match the layout, otherwise a dict
        with the keys ``'timestamp'``, ``'hostname'``, ``'application'``,
        ``'pid'`` (converted to ``int``) and ``'message'``.
    """
    # Capture groups, in order: timestamp, hostname, application, PID, message.
    # One or two whitespace characters are allowed between month and day,
    # because single-digit days are space-padded in BSD syslog timestamps
    # ("Feb  5 14:21:30"); a single `\s` rejected those lines.
    log_regex = r'^(\w{3}\s{1,2}\d{1,2}\s\d{2}:\d{2}:\d{2})\s(\S+)\s(\S+):\s\[(\d+)\]\s(.*)$'

    match = re.match(log_regex, log_message)
    if match is None:
        return None

    timestamp, hostname, application, pid, message = match.groups()

    return {
        'timestamp': timestamp,
        'hostname': hostname,
        'application': application,
        'pid': int(pid),
        'message': message,
    }

# Example usage: a line in the "<timestamp> <host> <app>: [<pid>] <text>"
# layout that process_syslog's regex expects.
log_message = 'Feb 28 14:21:30 example-hostname kernel: [12345] This is a log message'
log_data = process_syslog(log_message)
print(log_data)


{'timestamp': 'Feb 28 14:21:30', 'hostname': 'example-hostname', 'application': 'kernel', 'pid': 12345, 'message': 'This is a log message'}


In [3]:
import tldextract
import pandas as pd

# Read the top-1m list and derive one extra column per row in a single chain.
# Only one column name is supplied; judging by the rendered output, the
# file's leading column presumably ends up as the index -- TODO confirm.
# NOTE(review): the new column is named "tld" but holds tldextract's
# `.domain` part (e.g. "google" for "google.com"), not the suffix.
legit_domains = pd.read_csv(
    "../data/blog3/top-1m.csv", names=["domain"]
).assign(
    tld=lambda frame: [tldextract.extract(name).domain for name in frame["domain"]]
)
legit_domains

Unnamed: 0,domain,tld
1,google.com,google
2,youtube.com,youtube
3,baidu.com,baidu
4,bilibili.com,bilibili
5,facebook.com,facebook
...,...,...
864548,zoofilia-achtung.top,zoofilia-achtung
864549,zoolzool.co.nz,zoolzool
864550,zoomy.co.nz,zoomy
864551,zooporn.ws,zooporn


In [4]:
import urlextract

# Build one URLExtract instance to run the extraction below.
url_extractor = urlextract.URLExtract()

# Sample text: a DNS-answer style line holding a host name and an IPv4 address.
text_with_urls = "www.googleadservices.com: type A, class IN, addr 142.251.32.194"

# find_urls() returns the matches as a list; in the run recorded below the
# bare IPv4 address is returned alongside the host name.
extracted_urls = url_extractor.find_urls(text_with_urls)

print("Extracted URLs: ", extracted_urls)

Extracted URLs:  ['www.googleadservices.com', '142.251.32.194']


In [5]:
import pandas as pd

# Load mirai.csv into a DataFrame with pandas' default parsing options.
# NOTE(review): the column schema is not visible in this notebook section.
mirai_data = pd.read_csv("../data/blog3/mirai.csv")


In [6]:
from scapy.all import rdpcap



In [7]:
# Read the whole capture file with scapy's rdpcap; the recorded output of
# type(pcap) below shows the result is a scapy.plist.PacketList.
pcap = rdpcap("../data/blog3/2023-01-23-Google-ad-to-possible-TA505-activity.pcap")
type(pcap)

scapy.plist.PacketList

In [8]:
from scapy.all import *
import urllib.parse

def parse_http_payload(packet):
    """Extract HTTP request details from a packet's TCP payload, best effort.

    Args:
        packet: A scapy packet (anything offering ``haslayer``, layer
            indexing and a ``time`` attribute).

    Returns:
        * ``None`` when the packet lacks a TCP, Raw or IP layer, when the
          payload is not valid UTF-8, or when it does not contain ``'HTTP/'``.
        * A dict with only ``'timestamp'``, ``'source_ip'``, ``'dest_ip'``,
          ``'source_port'`` and ``'dest_port'`` when the first line is not
          exactly three space-separated tokens or a header line has no ``:``.
        * Otherwise that dict extended with ``'method'``, ``'path'``,
          ``'query_string'``, ``'protocol'``, ``'headers'`` and ``'body'``.

    NOTE(review): a three-token status line such as ``HTTP/1.1 200 OK`` also
    unpacks successfully, so it is reported with ``'method'`` set to
    ``'HTTP/1.1'``. That behavior is kept as it was.
    """
    if not (packet.haslayer(TCP) and packet.haslayer(Raw)):
        return None
    # Packets without an IP layer produced None before (the failed layer
    # lookup was swallowed); make that outcome explicit.
    if not packet.haslayer(IP):
        return None

    try:
        payload = str(packet[Raw].load, 'utf-8')
    except (UnicodeDecodeError, TypeError):
        # Payloads that cannot be decoded as text are skipped, as before.
        return None

    if 'HTTP/' not in payload:
        return None

    record = {
        'timestamp': str(packet.time),
        'source_ip': packet[IP].src,
        'dest_ip': packet[IP].dst,
        'source_port': packet[TCP].sport,
        'dest_port': packet[TCP].dport,
    }

    # The header section ends at the first blank line. Splitting only once
    # keeps body lines out of the header loop and stops a body that itself
    # contains a blank line from being truncated.
    head, terminator, remainder = payload.partition('\r\n\r\n')
    head_lines = head.split('\r\n')

    try:
        # Raises ValueError unless the start line has exactly three tokens.
        method, path, protocol = head_lines[0].split(' ')
        headers = {}
        for line in head_lines[1:]:
            if line:
                # Raises ValueError when a header line has no colon.
                header_name, header_value = line.split(':', maxsplit=1)
                headers[header_name] = header_value.strip()
        parsed_path = urllib.parse.urlparse(path)
    except ValueError:
        # Non-standard start line or header: report addressing info only.
        return record

    if terminator:
        body = remainder
    elif '<!DOCTYPE html>' in payload:
        # No blank line found: fall back to the text after the HTML marker.
        body = payload.split('<!DOCTYPE html>', 1)[1]
    else:
        # Neither separator is present, so there is no body to report.
        body = None

    return {
        **record,
        'method': method,
        'path': parsed_path.path,
        'query_string': parsed_path.query,
        'protocol': protocol,
        'headers': headers,
        'body': body,
    }
        

# Example usage:
def print_parsed_payload(packet):
    """Print the parsed HTTP details of ``packet``, if there are any.

    Args:
        packet: The packet handed on to ``parse_http_payload``.

    Returns:
        None. Nothing is printed when ``parse_http_payload`` returns ``None``.
    """
    parsed_payload = parse_http_payload(packet)
    # Identity check: only the "nothing parsed" sentinel is skipped.
    if parsed_payload is not None:
        print(parsed_payload)

# Walk every packet of the loaded capture in order and print the ones that
# yield a parsed result.
for packet in pcap:
    print_parsed_payload(packet)

GET /download/AnyDeskSetup_26b30163.msi HTTP/1.1
Host: anydeskcloud.tech
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.55
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: en


{'timestamp': '1674496371.956557', 'source_ip': '10.1.23.101', 'dest_ip': '191.101.13.129', 'source_port': 49845, 'dest_port': 80, 'method': 'GET', 'path': '/download/AnyDeskSetup_26b30163.msi', 'query_string': '', 'protocol': 'HTTP/1.1', 'headers': {'Host': 'anydeskcloud.tech', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.55', 'Accept': 'text/html,application/xhtml+xml,application/xml