In [None]:
import pandas as pd 
import os
import re
from datetime import datetime
import requests

# Notebook-wide pandas display settings.
# A value of None lifts the limit on column count and on per-cell text width.
for option_name in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)
# Keep wide frames on a single row of output rather than wrapping them.
pd.set_option('display.expand_frame_repr', False)
# The log directories used later are relative paths, so show where they resolve from.
print("Current working directory:", os.getcwd())


In [None]:

# Directories (relative to the working directory) scanned for process and network logs
process_logs_dir = "tmp-process"
network_logs_dir = "tmp-network"

# Helper: turn one "key: value | key: value" log line into a dict
def parse_log_line(line):
    """Extract ``key: value`` pairs from a pipe-delimited log line.

    A key is a run of word characters followed by a colon and one whitespace
    character; its value is everything up to the next ``|`` (or end of line).
    Keys and values are whitespace-stripped. If a key repeats, the last
    occurrence wins. A line with no recognisable pairs yields an empty dict.
    """
    fields = {}
    for found in re.finditer(r'(\w+):\s([^|]+)', line):
        key, value = found.groups()
        fields[key.strip()] = value.strip()
    return fields

# Helper: read and parse all logs in a given directory
def read_logs_from_directory(log_dir):
    """Parse every non-blank line of every regular file directly inside ``log_dir``.

    Args:
        log_dir: Path of the directory to scan. Sub-directories are skipped
            (the scan is not recursive).

    Returns:
        A list with one dict per non-blank line, as produced by
        ``parse_log_line``. Files are visited in sorted filename order so the
        result is the same from run to run.

    Raises:
        OSError: if ``log_dir`` cannot be listed or a file cannot be opened
            (e.g. ``FileNotFoundError`` when the directory is missing).
    """
    parsed_logs = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(log_dir)):
        filepath = os.path.join(log_dir, filename)
        if not os.path.isfile(filepath):
            continue
        # errors='replace' substitutes U+FFFD for undecodable bytes so a single
        # bad byte in one log does not abort the whole load.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if line:
                    parsed_logs.append(parse_log_line(line))
    return parsed_logs


# Load both log sources and merge them into one frame.
process_events = read_logs_from_directory(process_logs_dir)
network_events = read_logs_from_directory(network_logs_dir)
df_process = pd.DataFrame(process_events)
df_network = pd.DataFrame(network_events)
df_combined = pd.concat([df_process, df_network], ignore_index=True)

# If no parsed line carried a 'timestamp' field (e.g. both directories are
# empty) the column does not exist yet; create it so the steps below do not
# raise KeyError.
if 'timestamp' not in df_combined.columns:
    df_combined['timestamp'] = pd.NaT

# Unparseable timestamps become NaT (errors='coerce'); sort_values places them last.
df_combined['timestamp'] = pd.to_datetime(df_combined['timestamp'], errors='coerce')
df_combined = df_combined.sort_values('timestamp').reset_index(drop=True)

In [None]:
# Fixed presentation order for the combined event table.
column_order = [
    "timestamp", "event", "hostname", "username",
    "pid", "process", "ppid", "parent",
    "sourceip", "sourceport", "destip", "destport",
    "status", "sid",
]

# Add any expected column the logs never produced (as empty strings), keep
# only the listed columns in the listed order, and blank out missing values.
df_combined = (
    df_combined
    .assign(**{name: "" for name in column_order if name not in df_combined.columns})
    .loc[:, column_order]
    .fillna("")
)
df_combined

In [None]:
# Interactive search: case-insensitive literal substring match on one column.
print("Available fields:\n", list(df_combined.columns))

field = input("\nEnter field to search (e.g., username, process, destip): ").strip()
if field not in df_combined.columns:
    print(f"Field '{field}' not found. Please choose from the list above.")
else:
    value = input("Enter value to search for (partial match okay): ").strip()
    # Cast to str so the .str accessor also works on a non-string column
    # (the timestamp column can still be datetime dtype at this point).
    searchable = df_combined[field].astype(str)
    # regex=False: treat the input literally, so characters such as '.', '(' or
    # '\' in process names and paths neither raise nor match unintended text.
    result = df_combined[searchable.str.contains(value, case=False, na=False, regex=False)]
    print(f"\nFound {len(result)} matching rows:\n")
    display(result)


In [None]:
# Process outliers: the ten least frequent (process, parent) pairings
# among 'process created' events.
pair_counts = (
    df_combined.groupby(['process', 'parent', 'event']).size().reset_index(name='count')
)
# Rarest combinations first.
process = pair_counts.sort_values(by='count', ascending=True)
created_pairs = process['event'] == 'process created'
processhunt = process.loc[created_pairs].head(10)
processhunt

In [None]:
# User outliers: the ten least frequent (username, process) combinations
# among 'process created' events.
user_process_counts = (
    df_combined.groupby(['username', 'process', 'event']).size().reset_index(name='count')
)
# Rarest combinations first.
users = user_process_counts.sort_values(by='count', ascending=True)
created_by_user = users['event'] == 'process created'
userhunt = users.loc[created_by_user].head(10)
userhunt

In [None]:
# Build the LLM prompt from the two outlier tables.
process_sample = processhunt
users_sample = userhunt

# Embed the tables as CSV text (without the index column).
process_csv = process_sample.to_csv(index=False)
users_csv = users_sample.to_csv(index=False)

# Prompt wording fixes: "The field are" -> "The fields are", "minf" -> "mind",
# and the broken sentence about which log lines to investigate was rejoined.
prompt = f"""
Process data contains process parent and child relationship outliers. User data contains user and process
outliers. These are the outliers from a set of process logs. The fields are as follows:
process is the name of the process that ran. parent is the name of the parent process that created it.
username is the user context that the process ran under.
event is the event type; this is less important.
As a threat hunter, analyze the following process and user log data for suspicious or anomalous behavior.
Highlight any unusual process creations, rare parent-child relationships, or frequencies, considering how
rare the outlier events may be. Keep in mind that the 'process' field is the process name and the 'parent' field
is the name of the parent process that started the process in the 'process' field. Analyze every line
and suggest up to three log lines to investigate, choosing those
which are the most anomalous, and give suggested investigation steps. Do not tell me about normal or
unremarkable events.

Process Data:
{process_csv}

User Data:
{users_csv}
"""


In [None]:

# Send the prompt to Groq's OpenAI-compatible chat-completions endpoint.
# NOTE: the prompt embeds values taken from the logs (process names, usernames),
# so this cell sends that data to an external service.
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"  # Replace with actual endpoint if different
# Prefer the GROQ_API_KEY environment variable so a real key never has to be
# saved in the notebook; the placeholder remains as the fallback.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "INSERT-KEY-HERE")

headers = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}

data = {
    "model": "gemma2-9b-it",  # Replace with your desired Groq model
    "messages": [
        {"role": "user", "content": prompt}
    ],
    #"max_tokens": 1024  # Adjust as needed
}

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=60)
# Surface HTTP errors (bad key, unknown model, rate limit) as an HTTPError
# instead of an opaque KeyError on 'choices' below.
response.raise_for_status()
result = response.json()
print(result['choices'][0]['message']['content'])