In [5]:
import os
import re
import pandas as pd
import requests
from datetime import datetime

log_dir = "logs"

def parse_clean_log_line(line):
    pairs = [item.strip() for item in line.strip().split(",") if item.strip()]
    entry = {}
    for pair in pairs:
        if ": " in pair:
            key, value = pair.split(": ", 1)
            entry[key.strip().lower()] = value.strip()  # lowercase keys for uniformity
    return entry

parsed_entries = []
for filename in os.listdir(log_dir):
    filepath = os.path.join(log_dir, filename)
    if os.path.isfile(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    parsed = parse_clean_log_line(line)
                    if parsed:
                        parsed_entries.append(parsed)


df_net_clean = pd.DataFrame(parsed_entries)
df_net_clean['timestamp'] = pd.to_datetime(df_net_clean['timestamp'], errors='coerce')
df_net_clean = df_net_clean.fillna("")
net = pd.DataFrame(parsed_entries)
net['timestamp'] = pd.to_datetime(net['timestamp'], errors='coerce')
net = net.fillna("")

pd.set_option('display.max_columns', None)      # Show all columns
pd.set_option('display.max_colwidth', None)     # Don't truncate column content
pd.set_option('display.max_rows', 10)  # Prevent wrapping to new lines
net

Unnamed: 0,timestamp,hostname,event,status,pid,process,dnsname,sourceip,sourceport,destip,destport,asname
0,2025-04-16 20:13:20,DESKTOP-IAGNT81,connection created,ESTABLISHED,8776,firefox.exe,ec2-3-208-232-251.compute-1.amazonaws.com,192.168.1.234,44065,3.208.232.251,443,skipped-dns
1,2025-04-16 20:13:20,DESKTOP-IAGNT81,connection created,CLOSE_WAIT,7836,LockApp.exe,a104-124-105-66.deploy.static.akamaitechnologies.com,192.168.1.234,49747,104.124.105.66,443,skipped-dns
2,2025-04-16 20:13:20,DESKTOP-IAGNT81,connection created,ESTABLISHED,10324,slack.exe,ec2-54-224-85-111.compute-1.amazonaws.com,192.168.1.234,46477,54.224.85.111,443,skipped-dns
3,2025-04-16 20:13:20,DESKTOP-IAGNT81,connection created,ESTABLISHED,8776,firefox.exe,ec2-44-224-88-118.us-west-2.compute.amazonaws.com,192.168.1.234,43491,44.224.88.118,443,skipped-dns
4,2025-04-16 20:13:21,DESKTOP-IAGNT81,connection created,ESTABLISHED,8776,firefox.exe,none,192.168.1.234,45421,172.64.41.4,443,CLOUDFLARENET
...,...,...,...,...,...,...,...,...,...,...,...,...
759,2025-04-16 20:22:24,DESKTOP-IAGNT81,connection created,LAST_ACK,6000,Code.exe,lb-140-82-112-22-iad.github.com,192.168.1.234,46987,140.82.112.22,443,skipped-dns
760,2025-04-16 20:22:29,DESKTOP-IAGNT81,connection created,SYN_SENT,11892,svchost.exe,none,192.168.1.234,46988,20.190.190.194,443,MICROSOFT-CORP-MSN-AS-BLOCK
761,2025-04-16 20:22:35,DESKTOP-IAGNT81,connection created,ESTABLISHED,11892,svchost.exe,none,192.168.1.234,46988,20.190.190.194,443,MICROSOFT-CORP-MSN-AS-BLOCK
762,2025-04-16 20:22:35,DESKTOP-IAGNT81,connection created,TIME_WAIT,0,System Idle Process,rdap.arin.net,192.168.1.234,47006,199.5.26.160,443,skipped-dns


In [9]:
import tldextract

# Extract parent domain + suffix, e.g., 'google.com'
net['tld'] = net['dnsname'].apply(
    lambda x: ".".join([tldextract.extract(x).domain, tldextract.extract(x).suffix]) if x else ""
)
tld_counts = net['tld'].value_counts().reset_index()
tld_counts.columns = ['tld', 'count']
tld_counts

Unnamed: 0,tld,count
0,none.,260
1,amazonaws.com,186
2,akamaitechnologies.com,101
3,1e100.net,93
4,arin.net,41
...,...,...
10,adobedc.net,3
11,opendr.io,2
12,rtbhouse.net,2
13,cdn77.com,2


In [11]:
# Resolve DNS domain ages

API_KEY = "INSERT-WHOISXML-API-KEY"

def get_domain_age(domain):
    url = f"https://www.whoisxmlapi.com/whoisserver/WhoisService?apiKey={API_KEY}&domainName={domain}&outputFormat=JSON"
    try:
        response = requests.get(url)
        data = response.json()

        creation_raw = (
            data.get("WhoisRecord", {}).get("createdDate") or
            data.get("WhoisRecord", {}).get("registryData", {}).get("createdDate")
        )

        if creation_raw:
            creation_date = datetime.strptime(creation_raw[:10], "%Y-%m-%d")
            age_days = (datetime.now() - creation_date).days
            age_years = age_days // 365
            return {
                "domain": domain,
                "creation_date": creation_date,
                "age_days": age_days,
                "age_years": age_years,
                "status": "OK"
            }
        else:
            return None  # No creation date, exclude

    except Exception:
        return None  # On error, also exclude

# List of domains to skip
skip_domains = {
    "adobedc.net",
    "amazonaws.com",
    "cdn77.com",
    "none.",
    "akamaitechnologies.com",
    "1e100.net",
    "github.com",
    "arin.net",
    "googleusercontent.com",
    "a-msedge.net",
    "cloudfront.net",
    "liveperson.net",
    "rtbhouse.net",
    "web-hosting.com"
}
skip_domains = {d.lower() for d in skip_domains}
unique_domains = net['tld'].dropna().unique()

# WHOIS lookups (excluding skipped)
domain_age_data = [
    result
    for domain in unique_domains
    if domain.lower() not in skip_domains
    and (result := get_domain_age(domain)) is not None
]

df_domain_age = pd.DataFrame(domain_age_data)
df_domain_age.head()


In [23]:
# Process outliers
process = (
    net
    .groupby([ 'process', 'event'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=True)
)
processhunt = process[
    (process['count'] > 1) & 
    (process['event'] == 'connection created').head(10)
]
processhunt


  processhunt = process[


Unnamed: 0,process,event,count
15,sublime_text.exe,connection created,2
12,pythonw.exe,connection created,4
7,curl.exe,connection created,4
14,smartscreen.exe,connection created,5
3,MpDefenderCoreService.exe,connection created,5
6,backgroundTaskHost.exe,connection created,6
16,svchost.exe,connection created,8
2,LockApp.exe,connection created,8
8,explorer.exe,connection created,10
13,slack.exe,connection created,13


In [25]:
# DNS Outliers
dns = (
    net
    .groupby([ 'tld', 'event'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=True)
)

# Discard rows where count is 1 and event is not 'connection created'
dnshunt = dns[
    (dns['count'] > 1) & 
    (dns['event'] == 'connection created').head(10)
]
dnshunt


  dnshunt = dns[


Unnamed: 0,tld,event,count
6,cdn77.com,connection created,2
12,opendr.io,connection created,2
13,rtbhouse.net,connection created,2
2,adobedc.net,connection created,3
10,liveperson.net,connection created,6
1,a-msedge.net,connection created,12
7,cloudfront.net,connection created,13
8,github.com,connection created,17
9,googleusercontent.com,connection created,25


In [27]:
# ASN Outliers
asn = (
    net
    .groupby([ 'process', 'asname', 'event'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=True)
    .head(10)
)

# Discard rows where count is 1 and event is not 'connection created'
asnhunt = asn[
    (asn['count'] > 1) & 
    (asn['event'] == 'connection created')
]
asnhunt


Unnamed: 0,process,asname,event,count
36,sublime_text.exe,DIGITALOCEAN-ASN,connection created,2
22,firefox.exe,MICROSOFT-CORP-MSN-AS-BLOCK,connection created,2
