In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# dataset folder referenced by this notebook is reachable as a normal path.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Folder on the mounted Drive that holds the CSV log files used by this notebook.
file_path = '/content/drive/MyDrive/235kaggle'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from datetime import datetime, date
import warnings
import os
from functools import reduce
import re

In [None]:
# Show the names of the entries in the data folder (rendered as the cell output).
os.listdir(file_path)

# Opening the datasets

In [None]:
# Read each raw CSV export from the data folder into its own DataFrame.
logon = pd.read_csv(f'{file_path}/logon.csv')
device = pd.read_csv(f'{file_path}/device.csv')
email = pd.read_csv(f'{file_path}/email.csv')
file_data = pd.read_csv(f'{file_path}/file.csv')
psychometric = pd.read_csv(f'{file_path}/psychometric.csv')
decoys = pd.read_csv(f'{file_path}/decoy_file.csv')

# Report (rows, columns) for every table, one line per table.
print("Shapes of the files:")
for table_label, table in (
    ("logon", logon),
    ("device", device),
    ("email", email),
    ("file_data", file_data),
    ("psychometric", psychometric),
    ("decoys", decoys),
):
    print(f"{table_label}: {table.shape}")

In [None]:
# Display name -> DataFrame, used only to drive the inspection loop below.
datasets = dict(
    Logon=logon,
    Device=device,
    Email=email,
    File=file_data,
    Psychometric=psychometric,
    Decoys=decoys,
)

# For every table show its shape, per-column dtypes and a five-row preview.
for label, frame in datasets.items():
    print(f"======{label} Dataset=====")
    print(f"Shape: {frame.shape}")
    print("\nColumns and Data Types:")
    display(frame.dtypes)
    print("\nFirst 5 rows:")
    display(frame.head())
    print("\n")


# Data Preprocessing

In [None]:
def process_datetime_columns(df, date_column):
    """Parse a timestamp column in place and derive hour / weekday-name columns.

    Args:
        df: DataFrame to enrich. It is modified in place and also returned.
        date_column: Name of the column holding '%m/%d/%Y %H:%M:%S' timestamps.

    Returns:
        The same DataFrame object, with ``date_column`` converted to datetime
        and two added columns: ``hour`` (hour of day) and ``day_of_week``
        (English day name such as 'Monday').

    Raises:
        Whatever ``pd.to_datetime`` raises for values that do not match the
        format (no ``errors='coerce'`` is used here).
    """
    timestamps = pd.to_datetime(df[date_column], format='%m/%d/%Y %H:%M:%S')
    df[date_column] = timestamps
    df['hour'] = timestamps.dt.hour
    df['day_of_week'] = timestamps.dt.day_name()
    return df

In [None]:
# Parse the timestamps of the two tables previewed in this cell.
email = process_datetime_columns(email, 'date')
file_data = process_datetime_columns(file_data, 'date')

# For each processed table: heading, first five rows, then column dtypes.
for heading, processed in (
    ("\nEmail DataFrame after processing:", email),
    ("\nFile Data DataFrame after processing:", file_data),
):
    print(heading)
    display(processed.head())
    display(processed.dtypes)

In [None]:
# Helper Functions
def get_time_features_flags(df, date_column, internal_domain='dtaa.com',
                            work_start_hour=7, work_end_hour=19):
    """Add time-derived columns and after-hours / external-recipient flags in place.

    Args:
        df: DataFrame to enrich. It is modified in place and also returned,
            so pass a copy if the original must stay untouched.
        date_column: Name of the timestamp column ('%m/%d/%Y %H:%M:%S' strings
            or an already-parsed datetime column).
        internal_domain: Substring that marks a recipient string as internal.
            Defaults to 'dtaa.com', the value previously hard-coded.
        work_start_hour: First hour (inclusive) that counts as working time.
        work_end_hour: First hour (inclusive) that counts as after hours again.

    Returns:
        The same DataFrame with these columns added or overwritten:
        ``hour``, ``day`` (calendar date), ``day_of_week`` (0 = Monday),
        ``is_after_hours`` (0/1) and, only when a ``to`` column exists,
        ``is_external`` (0/1).
    """
    # Unparseable timestamps become NaT instead of raising.
    df[date_column] = pd.to_datetime(df[date_column], format='%m/%d/%Y %H:%M:%S', errors='coerce')
    timestamps = df[date_column].dt
    df['hour'] = timestamps.hour
    df['day'] = timestamps.date
    df['day_of_week'] = timestamps.dayofweek

    # After hours: before work_start_hour or at/after work_end_hour.
    # Rows with a NaT timestamp compare False on both sides and get 0.
    df['is_after_hours'] = ((df['hour'] < work_start_hour) | (df['hour'] >= work_end_hour)).astype(int)

    if 'to' in df.columns:
        # 0 when the `to` string mentions the internal domain anywhere, else 1
        # (non-string values such as NaN are therefore flagged 1).
        # NOTE(review): a message addressed to both internal and outside
        # recipients is flagged 0 by this rule, and cc/bcc are not inspected —
        # confirm this is the intended definition of "external".
        df['is_external'] = df['to'].apply(
            lambda x: 0 if isinstance(x, str) and internal_domain in x else 1
        )

    return df

# Counting email recipients
def count_recipients(row):
    """Count the addresses listed in a message's ``to``, ``cc`` and ``bcc`` fields.

    Args:
        row: Mapping-like record (for example a DataFrame row or a dict) whose
            recipient fields hold ';'-separated address strings.

    Returns:
        Total number of non-blank address segments across the three fields.
        A field that is absent, None, NaN or empty contributes 0.
    """
    total = 0
    for field in ('to', 'cc', 'bcc'):
        # .get keeps the function usable on records that lack one of the fields.
        value = row.get(field)
        if value is None or pd.isna(value) or not value:
            continue
        # Skip blank segments so a trailing or doubled ';' is not counted.
        total += sum(1 for address in str(value).split(';') if address.strip())
    return total

# Enrich every log table with hour / day / day_of_week / is_after_hours
# (plus is_external where a `to` column exists). The helper mutates the frame
# it is given, so each call works on a copy and the name is rebound to it.
# For email and file_data this overwrites the `hour` and `day_of_week`
# columns written earlier by process_datetime_columns.
logon = get_time_features_flags(logon.copy(), 'date')
device = get_time_features_flags(device.copy(), 'date')
email = get_time_features_flags(email.copy(), 'date')
file_data = get_time_features_flags(file_data.copy(), 'date')

# Feature Engineering

In [None]:
# PC Mapping
# A user's "primary" PC is the machine that appears most often in that user's
# rows of the logon table (every activity value is counted). When several PCs
# tie, the one idxmax reports first is used.
user_pc_counts = logon.groupby(['user', 'pc']).size().reset_index(name='count')
user_primary_pc = user_pc_counts.loc[user_pc_counts.groupby('user')['count'].idxmax()][['user', 'pc']]
user_primary_pc = dict(zip(user_primary_pc['user'], user_primary_pc['pc']))

# Vectorised replacement for a row-wise apply: look up each row's primary PC
# by user and compare it with the PC on that row. Users missing from the
# mapping map to NaN, which compares unequal, so the flag is False for them.
logon['is_primary_pc'] = logon['pc'].eq(logon['user'].map(user_primary_pc))
print("Mapped primary PC for each user.")

# Per-(user, day) aggregation of each log source starts here.
print("Aggregating Features by User-Day")
print("==============")

# Logon features: only rows whose activity is 'Logon' are counted.
print("Using logon features...")
is_logon_event = logon['activity'] == 'Logon'

logon_agg = (
    logon[is_logon_event]
    .groupby(['user', 'day'])
    .agg(
        daily_logon_count=('id', 'count'),
        daily_after_hours_logons=('is_after_hours', 'sum'),
        daily_unique_pcs=('pc', 'nunique'),
    )
    .reset_index()
)

# Logons made on a machine other than the user's primary PC.
foreign_logons = (
    logon[is_logon_event & ~logon['is_primary_pc']]
    .groupby(['user', 'day'])
    .size()
    .reset_index(name='daily_foreign_pc_logons')
)

# User-days without any foreign logon have no match in the left join; fill with 0.
logon_agg = logon_agg.merge(foreign_logons, on=['user', 'day'], how='left').fillna(0)


# File features: per-(user, day) counts of each file activity label.
# NOTE(review): the labels compared here ('Read', 'Write', 'Delete') must match
# the values in file_data['activity'] exactly, otherwise every count is 0 —
# check file_data['activity'].unique() to confirm.
print("Using file features...")
file_agg = (
    file_data
    .groupby(['user', 'day'])
    .agg(
        daily_file_reads=('activity', lambda acts: acts.eq('Read').sum()),
        daily_file_writes=('activity', lambda acts: acts.eq('Write').sum()),
        daily_file_deletes=('activity', lambda acts: acts.eq('Delete').sum()),
    )
    .reset_index()
)


# Device features: per-(user, day) counts over rows whose activity is 'Connect'.
print("Using device features...")
device_agg = (
    device.loc[device['activity'].eq('Connect')]
    .groupby(['user', 'day'])
    .agg(
        daily_device_connects=('id', 'count'),
        daily_after_hours_device=('is_after_hours', 'sum'),
    )
    .reset_index()
)


# Email features
# Number of to/cc/bcc addresses on every message (computed row by row).
email['recipient_count'] = email.apply(count_recipients, axis=1)

# Only messages whose activity is 'Send' feed the daily aggregates.
email_sent = email.loc[email['activity'].eq('Send')]
email_agg = (
    email_sent
    .groupby(['user', 'day'])
    .agg(
        daily_emails_sent=('id', 'count'),
        daily_max_recipients=('recipient_count', 'max'),
        daily_total_recipients=('recipient_count', 'sum'),
        daily_external_emails=('is_external', 'sum'),
    )
    .reset_index()
)

print("Individual log aggregation complete.")

In [None]:
# Merging features into one DataFrame
daily_agg_dfs = [logon_agg, file_agg, device_agg, email_agg]

# Fold the per-source tables together with successive outer joins on
# (user, day), so a user-day present in any source is kept. Sources with no
# row for a user-day leave NaN, which is replaced by 0 before indexing.
features_df = (
    reduce(
        lambda merged, nxt: merged.merge(nxt, on=['user', 'day'], how='outer'),
        daily_agg_dfs,
    )
    .fillna(0)
    .set_index(['user', 'day'])
)

print("Feature Matrix For GMM")
print("---------------")
print(f"Feature Matrix Shape: {features_df.shape}")
print("\nList of Features in Matrix:")
print(features_df.columns.tolist())
print("\nSample Data:")
display(features_df.head())

# Principal Component Analysis

In [None]:
# Feature scaling: standardise every feature column before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_df)
X_scaled_df = pd.DataFrame(X_scaled, index=features_df.index, columns=features_df.columns)

# Dimensionality reduction: keep as many components as needed to retain
# 90% of the variance.
pca = PCA(n_components=0.90)
X_pca = pca.fit_transform(X_scaled)
optimal = X_pca.shape[1]

# Label the retained components PC1..PCk and keep the (user, day) index.
component_names = [f'PC{k}' for k in range(1, optimal + 1)]
pca_df = pd.DataFrame(X_pca, index=features_df.index, columns=component_names)
print(f"Going from {features_df.shape[1]} features to {optimal} dimensions.")

# Plain array of the component columns, captured before any score columns
# are added to pca_df.
X_full_pca = pca_df.to_numpy()

# GMM Model

In [None]:
# Gaussian Mixture Model (GMM) Anomaly Scoring
# Number of mixture components, fixed at 5.
# NOTE(review): the earlier comment said this "was found from the PCA", but the
# PCA cell only yields the number of retained dimensions (`optimal`), which is
# a different quantity from the number of mixture components — confirm how 5
# was chosen.
N_optimal = 5
n_components = N_optimal
gmm = GaussianMixture(n_components=N_optimal, random_state=42)
gmm.fit(X_full_pca)

# score_samples returns the per-row log-likelihood under the fitted mixture;
# negating it makes larger values mean "less likely", i.e. more anomalous.
log_likelihoods = gmm.score_samples(X_full_pca)
pca_df['GMM_Anomaly_Score'] = -log_likelihoods

# Rank 1 = most anomalous. method='min' gives tied scores the same whole-number
# rank; the default 'average' produces fractional ranks that astype(int) would
# silently truncate.
pca_df['GMM_Rank'] = pca_df['GMM_Anomaly_Score'].rank(ascending=False, method='min').astype(int)
print(f"GMM (n_components={n_components}) scoring complete.")
print(f"\nFinal ranked dataframe shape: {pca_df.shape}")

In [None]:
import pandas as pd
from datetime import datetime

# Insider threat incident dates, as (user, day) keys matching pca_df's index.
target_1 = ('PLJ1771', datetime(2010, 8, 12).date())
target_2 = [
    ('CDE1846', datetime(2011, 4, 21).date()),
    ('CDE1846', datetime(2011, 4, 25).date()),
]
target_events = [target_1] + target_2
TOTAL_EVENTS = pca_df.shape[0]

# A target user-day with no row in pca_df would make .loc[list] raise
# KeyError, so select the rows that exist and report the ones that do not.
missing_events = [event for event in target_events if event not in pca_df.index]
if missing_events:
    print(f"Warning: no scored row for target event(s): {missing_events}")
filtered_results = pca_df[pca_df.index.isin(target_events)]

target_output = filtered_results[['GMM_Anomaly_Score', 'GMM_Rank']].sort_values(by='GMM_Rank', ascending=True)

# Rank expressed as a percentage of all scored user-days (smaller = more anomalous).
target_output['GMM_Percentile_Rank'] = (target_output['GMM_Rank'] / TOTAL_EVENTS) * 100

# Formatting for display/export: percentile becomes a string with a '%' suffix.
target_output['GMM_Percentile_Rank'] = target_output['GMM_Percentile_Rank'].round(4).astype(str) + '%'
target_output['GMM_Anomaly_Score'] = target_output['GMM_Anomaly_Score'].round(4)

print(f"GMM Anomaly Ranks and Percentiles (based on N={TOTAL_EVENTS} total events):")
print("----------------")
target_output.to_csv("insider_event_ranks_targets_with_percentile.csv")
display(target_output)

In [None]:
# Top 20 anomaly lookup: the twenty user-days with the highest anomaly score,
# reduced to the score and rank columns.
top_20_output = (
    pca_df
    .sort_values(by='GMM_Anomaly_Score', ascending=False)
    .head(20)
    [['GMM_Anomaly_Score', 'GMM_Rank']]
)

print("--- Top 20 Overall Highest GMM Anomaly Scores ---")
print(f"Total Events Scored: {pca_df.shape[0]}")

# Save the table to CSV, then display it.
top_20_output.to_csv("top_20_gmm_anomalies.csv")
display(top_20_output)

# Graphing Results

In [None]:
def plot_anomaly_trend(user_id, start_date, end_date, anomaly_df):
    """Plot and save one user's GMM anomaly score over a date range.

    Args:
        user_id: Value of the first index level to select.
        start_date: First day (inclusive) of the window.
        end_date: Last day (inclusive) of the window.
        anomaly_df: DataFrame indexed by (user, day) that contains the
            'GMM_Anomaly_Score' and 'GMM_Rank' columns.

    Returns:
        None. Writes '<user_id>_anomaly_trend.png' to the working directory and
        prints the rows that were plotted. If the user is unknown or has no
        rows in the window, only a message is printed and nothing is plotted.
    """
    # An unknown user should produce the same friendly message as an empty
    # window rather than a KeyError traceback.
    try:
        user_df = anomaly_df.loc[user_id]
    except KeyError:
        print(f"No scored events found for {user_id} in the time period.")
        return

    # Label slicing with bounds that are not themselves in the index needs a
    # sorted index, so sort by day before cutting the window.
    user_events = user_df.sort_index().loc[start_date:end_date].copy()

    if user_events.empty:
        print(f"No scored events found for {user_id} in the time period.")
        return

    # Plotting (explicit figure/axes instead of the pyplot state machine).
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(user_events.index, user_events['GMM_Anomaly_Score'], marker='o', linestyle='-', color='red', linewidth=2)

    ax.set_title(f'GMM Anomaly Score Trend for User {user_id}')
    ax.set_xlabel('Date')
    ax.set_ylabel('GMM Anomaly Score (Higher is More Anomalous)')

    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()

    # Save plot
    file_name = f'{user_id}_anomaly_trend.png'
    fig.savefig(file_name)
    print(f"Graph saved as {file_name}. You can see the file above.")
    print("Data points used for the plot:")
    print(user_events[['GMM_Anomaly_Score', 'GMM_Rank']])


# Plot the score trend for each known insider over the same window
# (CDE1846 first, then PLJ1771).
for insider_id in ('CDE1846', 'PLJ1771'):
    plot_anomaly_trend(
        user_id=insider_id,
        start_date=date(2010, 1, 2),
        end_date=date(2011, 6, 1),
        anomaly_df=pca_df,
    )
