In [None]:
import os
import re
from ast import literal_eval
from datetime import datetime

import pandas as pd
import requests

# Notebook-wide pandas display settings, so wide log frames render in full.
pd.options.display.max_columns = None          # show every column
pd.options.display.max_colwidth = None         # never truncate cell contents
pd.options.display.expand_frame_repr = False   # keep each row on a single line
# The log directories used later are relative paths, so report where we are.
print("Current working directory:", os.getcwd())


In [None]:

# Directories (relative to the working directory) holding the raw log files.
process_logs_dir = "process"   # process event logs
network_logs_dir = "network"   # network event logs

def parse_log_line(line):
    """Parse one pipe-delimited log line into a dict of field name -> value.

    Every occurrence of ``<word>: <text up to the next '|'>`` becomes one
    entry. Keys and values are whitespace-stripped; if a key occurs more than
    once, the last occurrence wins. A line with no such pairs yields ``{}``.
    """
    record = {}
    for field in re.finditer(r'(\w+):\s([^|]+)', line):
        key, value = field.groups()
        record[key.strip()] = value.strip()
    return record

def read_logs_from_directory(log_dir):
    """Parse every non-blank line of every regular file directly in ``log_dir``.

    Sub-directories are skipped (no recursion). Files are read as UTF-8 and
    each stripped, non-empty line is passed to ``parse_log_line``. Returns the
    parsed records as one flat list, in the order the files were listed.
    """
    records = []
    for entry_name in os.listdir(log_dir):
        entry_path = os.path.join(log_dir, entry_name)
        if not os.path.isfile(entry_path):
            continue
        with open(entry_path, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            records.extend(parse_log_line(text) for text in stripped_lines if text)
    return records


# Parse both log sources into lists of records, then into one frame per source.
process_events = read_logs_from_directory(process_logs_dir)
network_events = read_logs_from_directory(network_logs_dir)
df_process = pd.DataFrame(process_events)
df_network = pd.DataFrame(network_events)

# Stack the two sources, turn 'timestamp' into real datetimes (values that
# cannot be parsed become NaT) and order the events by time.
df_combined = (
    pd.concat([df_process, df_network], ignore_index=True)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], errors='coerce'))
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Re-apply the display settings so this cell renders wide frames in full.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.expand_frame_repr = False

In [None]:
# Display the frame built from the parsed process logs.
df_process

In [None]:
# Canonical column layout for the combined view.
column_order = [
    "timestamp", "event", "hostname", "username", "pid", "process", "ppid",
    "parent", "cmdline", "sourceip", "sourceport",
    "destip", "destport", "status", "sid"
]

# Keep exactly these columns in this order: columns absent from the logs are
# created filled with "", columns not listed are dropped, and any remaining
# missing values are blanked out as well.
df_combined = (
    df_combined
    .reindex(columns=column_order, fill_value="")
    .fillna("")
)
df_combined

In [None]:
# Interactive search: pick a column of df_combined and list the rows whose
# value contains the entered text (case-insensitive).
print("Available fields:\n", list(df_combined.columns))

# The examples in the hint are all members of column_order, so each of them is
# a valid answer.
field = input("\nEnter field to search (e.g., process, username, destip): ").strip()
if field not in df_combined.columns:
    print(f"Field '{field}' not found. Please choose from the list above.")
else:
    value = input("Enter value to search for (partial match okay): ").strip()
    # regex=False: the input is a literal substring, so characters such as
    # '.', '(' or '\' in IPs, paths and command lines neither over-match nor
    # raise a regex error.
    # astype(str): lets non-string cells (e.g. Timestamp objects in
    # 'timestamp') take part in the search instead of silently never matching.
    matches = df_combined[field].astype(str).str.contains(
        value, case=False, na=False, regex=False
    )
    result = df_combined[matches]
    print(f"\nFound {len(result)} matching rows:\n")
    display(result)


In [None]:
# Display the parsed process-log frame again before exploring its columns.
df_process

In [None]:
# Print the distinct values found in the 'parent' column of the process logs.
print(df_process['parent'].unique())


In [None]:
# Print the distinct values found in the 'name' column of the process logs.
print(df_process['name'].unique())


In [None]:
# Count how often each (name, username, cmdline, event) combination occurs.
# The counts are built here, from df_process, instead of relying on a
# `process` frame left behind by a later cell: that made this cell fail with a
# NameError on a fresh kernel (Restart & Run All).
process = (
    df_process
    .groupby(['name', 'username', 'cmdline', 'event'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=True)
    # Normalize 'name' to catch all variations of bash
    .assign(name_clean=lambda counts: counts['name'].str.strip().str.lower())
)

# Filter for combinations where:
# - the event is 'process created'
# - OR the name ends with 'bash' (e.g., '/bin/bash')
filtered = process[
    (process['event'] == 'process created') |
    (process['name_clean'].str.endswith('bash'))
]

# Sort by count to get the rarest combinations
processhunt = filtered.sort_values(by='count', ascending=True).head(10)

# Show the results
processhunt


In [None]:
# Process outliers: occurrence count of every (name, username, cmdline, event)
# combination, rarest first.
process = (
    df_process
    .groupby(['name', 'username', 'cmdline', 'event'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=True)
)

# Keep rows that are 'process created' events OR whose name is exactly 'bash',
# then take the ten rarest. The two conditions are OR-ed as boolean masks over
# the rows; OR-ing two already-filtered DataFrames (as this cell did before)
# operates on the frames' values instead of selecting rows, and applied
# .head(10) to only one of them.
processhunt = process[
    (process['event'] == 'process created') |
    (process['name'] == 'bash')
].head(10)
processhunt

In [None]:
# User outliers: how often each (username, process, event) combination shows
# up in the combined log view, rarest first.
user_counts = df_combined.groupby(['username', 'process', 'event']).size()
users = user_counts.reset_index(name='count').sort_values(by='count', ascending=True)

# Ten rarest combinations among 'process created' events.
userhunt = users.loc[users['event'].eq('process created')].head(10)
userhunt

In [None]:
# Outlier tables that are handed to the model.
process_sample = processhunt
users_sample = userhunt

# Serialize as CSV (without the index) so the tables can be embedded as text.
process_csv = process_sample.to_csv(index=False)
users_csv = users_sample.to_csv(index=False)

# The field glossary in the prompt has to describe the columns that are really
# in the two CSVs: the process table is grouped by name/username/cmdline/event
# and the user table by username/process/event, each with a 'count' column.
# Neither table carries a 'parent' column, so the prompt must not ask the
# model to reason about parent-child relationships.
prompt = f"""
Process data contains rare process and command-line combinations. User data contains rare user and process
combinations. These are the outliers from a set of process logs. The fields are as follows:
name (in Process Data) and process (in User Data) are the name of the process that ran.
name_clean, when present, is the trimmed, lower-cased form of name.
cmdline is the command line of the process.
username is the user context that the process ran under.
event is the event type; this is less important.
count is how many times that exact combination of values was seen; a lower count means a rarer event.
As a threat hunter, analyze the following process and user log data for suspicious or anomalous behavior.
Highlight any unusual process creations, rare user-process or process-command-line combinations, or
frequencies, considering how rare the outlier events may be. Analyze every line and suggest up to three
log lines to investigate: those that are the most anomalous, with suggested investigation steps for each.
Do not tell me about normal or unremarkable events.

Process Data:
{process_csv}

User Data:
{users_csv}
"""


In [None]:

GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"  # Replace with actual endpoint if different
# Read the key from the GROQ_API_KEY environment variable so a real credential
# never has to be saved inside the notebook; the placeholder is only a fallback.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "INSERT-KEY-HERE")

headers = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}

data = {
    "model": "gemma2-9b-it",  # Replace with your desired Groq model
    "messages": [
        {"role": "user", "content": prompt}
    ],
    #"max_tokens": 1024  # Adjust as needed
}

# Without a timeout a stalled connection would block the kernel indefinitely.
response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=60)

if response.ok:
    result = response.json()
    print(result['choices'][0]['message']['content'])
else:
    # Do not index into the body of a failed request (bad key, unknown model,
    # rate limit, ...): show the status and raw body instead of an opaque
    # KeyError on 'choices'.
    print("Error:", response.status_code)
    print(response.text)

In [None]:
# Location of the services log, relative to the working directory.
logfile = "services"

def parse_log_line(line):
    """Parse one ``key: value | key: value | ...`` services log line into a dict.

    NOTE: this intentionally keeps the name ``parse_log_line`` and therefore
    replaces the regex-based helper of the same name defined in the
    process/network loading cell once this cell has run.

    Rules:
      * Parts are separated by ``|``; each part is split on its first ``": "``.
      * A part that is just ``key:`` (empty value) is stored as ``key -> ""``.
      * Parts with no key/value separator (including empty parts) are skipped.
      * The ``uuid`` value is evaluated with ``ast.literal_eval``; if it is
        not a valid Python literal the raw text is kept instead.

    Returns:
        dict mapping field names to string values (``uuid`` may be any literal).
    """
    record = {}
    for part in (piece.strip() for piece in line.strip().split('|')):
        key, sep, val = part.partition(": ")
        if not sep:
            # Stripping the part turns "key: " into "key:", which has no
            # ": " left to split on. Treat that as an empty value.
            if part.endswith(":") and len(part) > 1:
                key, val = part[:-1], ""
            else:
                continue
        if key == "uuid":
            try:
                # literal_eval only accepts Python literals; it never runs code.
                val = literal_eval(val)
            except (ValueError, TypeError, SyntaxError):
                pass  # not a literal: keep the raw string
        record[key] = val
    return record

# Open file and parse lines (blank lines are skipped). The encoding is stated
# explicitly, matching how the process/network logs are read.
with open(logfile, 'r', encoding='utf-8') as f:
    parsed_logs = [parse_log_line(line) for line in f if line.strip()]

# Convert to DataFrame
services = pd.DataFrame(parsed_logs)

# Flatten the 'uuid' column into one column per key. This must read from
# `services`; the cell previously referenced `df`, which is not defined
# anywhere in this notebook.
# NOTE(review): assumes the parsed uuid values are dict-like - confirm against
# the actual log format.
if 'uuid' in services.columns:
    uuid_df = services['uuid'].apply(pd.Series)
    services = services.drop(columns=['uuid']).join(uuid_df)

# Display or use the DataFrame
services


In [126]:
import requests
import pandas as pd

# Assume your DataFrame is called `services`
# Convert to a readable string (e.g., table or JSON)
data_str = services.to_string(index=False)

# Define your prompt
prompt = f"""
You are a cybersecurity threat hunter. First, tell me how many unique services you see.
Next, consider each line in the log data. Given the following list of Linux service events, 
identify ONE service that is the most inexplicable or indicative of potential compromise. 
Provide reasoning for any services you highlight and explain why they are unusual. Ignore services
you recognize as legitimate.

Service Data:
{data_str}
"""

# GROQ API request setup
# Read the key from the GROQ_API_KEY environment variable so a real credential
# never has to be saved inside the notebook; "KEY" is only a placeholder fallback.
groq_api_key = os.environ.get("GROQ_API_KEY", "KEY")
model = "llama-3.1-8b-instant"  # or llama3-70b or gemma-7b depending on your setup

url = "https://api.groq.com/openai/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {groq_api_key}",
    "Content-Type": "application/json"
}
payload = {
    "model": model,
    "messages": [
        {"role": "system", "content": "You are a cybersecurity threat hunter."},
        {"role": "user", "content": prompt}
    ],
    "temperature": 0.3
}

# Send request; the timeout keeps a stalled connection from hanging the kernel.
response = requests.post(url, headers=headers, json=payload, timeout=60)

from IPython.display import display, Markdown

# Render the answer as Markdown on success; otherwise show status and raw body.
if response.ok:
    content = response.json()["choices"][0]["message"]["content"]
    display(Markdown(f"### 🧠 LLM Response\n\n{content}"))
else:
    print("❌ Error:", response.status_code)
    print(response.text)




### 🧠 LLM Response

**Unique Services:**
There are 64 unique services listed in the provided log data.

**Most Inexplicable or Indicative of Potential Compromise:**
After reviewing the log data, I would highlight the `iocontrol` service as the most inexplicable or indicative of potential compromise.

**Reasoning:**
The `iocontrol` service is listed as "loaded (failed)" twice, with the same timestamp and UUID. This suggests that the service failed to start or load, but the exact reason for the failure is not provided. The fact that the service is listed as "failed" twice, with the same timestamp, raises suspicions about its legitimacy.

Additionally, the service name "iocontrol" is not a standard or well-known service on Linux systems. This could indicate that the service is custom or malicious, and its presence on the system may be a sign of compromise.

**Why it's unusual:**
The `iocontrol` service is unusual because:

1. It's not a standard or well-known service on Linux systems.
2. It failed to start or load twice, with the same timestamp and UUID.
3. The exact reason for the failure is not provided.
4. The service name "iocontrol" is not descriptive or informative.

Overall, the presence of the `iocontrol` service, with its unusual name and failure to start or load, raises suspicions about its legitimacy and may indicate potential compromise.