In [None]:

# Mount Google Drive at /content/drive so files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd


# Directory on the mounted Drive that every CSV in this notebook is read from.
data_folder = '/content/drive/MyDrive/235Kaggle/235kaggle'

# Sanity check: show what is actually in the folder before loading anything.
print("Files in data folder:")
print(os.listdir(data_folder))

def read_csv(file_name):
    """Load the CSV called `file_name` from `data_folder` into a DataFrame."""
    csv_path = os.path.join(data_folder, file_name)
    return pd.read_csv(csv_path)

# Read every raw table; the names on the left are bound in the same order
# as the file names on the right.
logon, emails, files, decoys, device, psychometric, insiders = (
    read_csv(csv_name)
    for csv_name in (
        'logon.csv',
        'email.csv',
        'file.csv',
        'decoy_file.csv',
        'device.csv',
        'psychometric.csv',
        'insiders.csv',
    )
)

# Preview the first rows of the logon, device and email tables.
for preview_frame in (logon, device, emails):
    display(preview_frame.head())


In [None]:
# Work on copies so the raw tables stay untouched.
logon_df, device_df, email_df = logon.copy(), device.copy(), emails.copy()

for frame in (logon_df, device_df, email_df):
    # Parse the timestamp column, then derive calendar/time-of-day helpers.
    frame['date'] = pd.to_datetime(frame['date'])
    timestamps = frame['date']

    frame['day'] = timestamps.dt.date
    frame['hour'] = timestamps.dt.hour

    # After hours = before 06:00 or from 18:00 onwards.
    frame['is_after_hours'] = frame['hour'].lt(6) | frame['hour'].ge(18)

    # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    frame['is_weekend'] = timestamps.dt.dayofweek.ge(5)

print("Date parsing complete")
for parsed_frame in (logon_df, device_df, email_df):
    display(parsed_frame.head())


In [None]:


# Distinct users that appear in the logon log.
all_users = set(logon_df['user'].unique())
print(f"Total unique users: {len(all_users)}")

# Number of logon-log rows per (user, pc) pair.
user_pc_counts = logon_df.groupby(['user', 'pc']).size().reset_index(name='count')

# For each user keep the row with the highest count; idxmax returns the
# first such row when several PCs tie.
busiest_row_per_user = user_pc_counts.groupby('user')['count'].idxmax()
user_primary_pc_df = user_pc_counts.loc[busiest_row_per_user, ['user', 'pc']]

# Plain dict lookup: user -> most frequently used PC.
user_primary_pc = user_primary_pc_df.set_index('user')['pc'].to_dict()

print("User-PC mapping complete. Sample:")
list(user_primary_pc.items())[:10]


In [None]:

# Every (user, day) pair that appears in any of the three event logs.
user_days = set()
for df in [logon_df, device_df, email_df]:
    user_days.update(zip(df['user'], df['day']))
print(f"Total user-day combinations: {len(user_days):,}")


# Per user-day logon features, computed from 'Logon' events only.
logon_agg = (
    logon_df[logon_df['activity'] == 'Logon']
    .groupby(['user', 'day'])
    .agg(
        logon_count=('id', 'count'),
        after_hours_logons=('is_after_hours', 'sum'),
        unique_pcs=('pc', 'nunique'),
    )
    .reset_index()
)


# True where the event happened on the user's primary PC.
# Vectorised replacement for a row-wise apply(): map each user to their
# primary PC and compare column-to-column. Users missing from the mapping
# become NaN, which never equals a PC name, so they are flagged False.
logon_df['is_primary_pc'] = logon_df['pc'] == logon_df['user'].map(user_primary_pc)

# Per user-day count of 'Logon' events on a PC other than the primary one.
foreign_logons = (
    logon_df[(logon_df['activity'] == 'Logon') & (~logon_df['is_primary_pc'])]
    .groupby(['user', 'day'])
    .size()
    .reset_index(name='foreign_pc_logons')
)


# Per user-day device features, computed from 'Connect' events only.
device_agg = (
    device_df[device_df['activity'] == 'Connect']
    .groupby(['user', 'day'])
    .agg(
        device_connects=('id', 'count'),
        after_hours_device=('is_after_hours', 'sum'),
    )
    .reset_index()
)


def count_recipients(row):
    """Count the recipients of one email across its 'to', 'cc' and 'bcc' fields.

    Each field is treated as a ';'-separated list of addresses. Missing
    (NaN/None) or empty fields contribute nothing, and blank entries produced
    by stray or trailing separators (e.g. "a@x.com;b@x.com;") are ignored so
    they do not inflate the count.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support ``row['to']``, ``row['cc']`` and ``row['bcc']``.

    Returns
    -------
    int
        Total number of non-blank addresses over the three fields.
    """
    total = 0
    for field in ('to', 'cc', 'bcc'):
        value = row[field]
        if pd.isna(value) or not value:
            continue
        # Only count entries that still contain text after trimming whitespace.
        total += sum(1 for address in str(value).split(';') if address.strip())
    return total

# Recipient count per email (to + cc + bcc).
email_df['recipient_count'] = email_df.apply(count_recipients, axis=1)

# Only outgoing mail contributes to the email features.
email_sent = email_df[email_df['activity'] == 'Send']

# Per user-day email features.
email_agg = (
    email_sent
    .groupby(['user', 'day'])
    .agg(
        emails_sent=('id', 'count'),
        max_recipients=('recipient_count', 'max'),
        total_recipients=('recipient_count', 'sum'),
        avg_recipients=('recipient_count', 'mean'),
    )
    .reset_index()
)


# Start from EVERY observed (user, day) pair rather than from logon_agg.
# Left-joining onto logon_agg silently dropped user-days that had device or
# email activity but no 'Logon' event, so the matrix did not match the
# "Total user-day combinations" count printed from user_days.
# Sorting makes the row order reproducible (set iteration order is not).
features_df = (
    pd.DataFrame(list(user_days), columns=['user', 'day'])
    .dropna(subset=['user', 'day'])
    .sort_values(['user', 'day'])
    .reset_index(drop=True)
)
for daily_agg in (logon_agg, foreign_logons, device_agg, email_agg):
    features_df = features_df.merge(daily_agg, on=['user', 'day'], how='left')

# A user-day with no events of a given kind has a count of zero for it.
features_df = features_df.fillna(0)

# Each aggregate has one row per (user, day), so the joins must not fan out.
assert not features_df.duplicated(['user', 'day']).any(), "duplicate user-day rows"

features_df['day'] = pd.to_datetime(features_df['day'])

print(f"Feature matrix shape: {features_df.shape}")
print("\nFeatures:")
print(features_df.columns.tolist())
print("\nSample data:")
display(features_df.head())


In [None]:
# STEP 5: LOCAL OUTLIER FACTOR (LOF) MODEL

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
import numpy as np

# Columns fed to the model, in the order they appear in the design matrix.
feature_columns = [
    'logon_count', 'after_hours_logons', 'unique_pcs', 'foreign_pc_logons',
    'device_connects', 'after_hours_device',
    'emails_sent', 'max_recipients', 'total_recipients', 'avg_recipients',
]

X = features_df[feature_columns].to_numpy()
print(f"Total rows going into LOF: {X.shape[0]:,}")

# Standardise each feature before the neighbour search.
lof_scaled = StandardScaler()
X_scaled = lof_scaled.fit_transform(X)

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.02)

# fit_predict labels outliers -1 and inliers 1.
labels_inLOF = lof.fit_predict(X_scaled)
lof_score_raw = lof.negative_outlier_factor_

# Flip the sign so that larger means more anomalous.
anomaly_strength = -lof_score_raw
minimum_value, maximum_value = anomaly_strength.min(), anomaly_strength.max()

features_df['lof_label'] = labels_inLOF
features_df['lof_score_raw'] = lof_score_raw

# Min-max scaled score; the epsilon avoids division by zero if all scores are equal.
features_df['lof_score'] = (
    (anomaly_strength - minimum_value) / (maximum_value - minimum_value + 1e-12)
)

# Rank-based score: fraction of user-days that are at most this anomalous.
features_df['lof_percentile'] = pd.Series(anomaly_strength).rank(pct=True).to_numpy()

LOF_ANOMALIES = features_df.loc[features_df['lof_label'] == -1]

print(f"\nTotal records (all user-days): {len(features_df):,}")
print(f"LOF anomalies detected: {len(LOF_ANOMALIES):,} "
      f"({100*len(LOF_ANOMALIES)/len(features_df):.2f}%)")

print("\nTop 20 most anomalous user-days (LOF):")
top_columns = ['user', 'day', 'lof_score', 'lof_score_raw'] + feature_columns
top_anomalies = features_df.nlargest(20, 'lof_score').loc[:, top_columns]
display(top_anomalies)



In [None]:

# Full feature row for user PLJ1771 on 2010-08-12.
is_target_user = features_df['user'] == 'PLJ1771'
is_target_day = features_df['day'] == '2010-08-12'
scene3_user = features_df.loc[is_target_user & is_target_day]

display(scene3_user)


In [None]:
# Same user-day (PLJ1771 on 2010-08-12), restricted to the LOF score columns.
is_target_user = features_df['user'] == 'PLJ1771'
is_target_day = features_df['day'] == '2010-08-12'
scene3_user = features_df.loc[
    is_target_user & is_target_day,
    ['user', 'day', 'lof_score', 'lof_percentile'],
]

display(scene3_user)


In [None]:

import pandas as pd

# Users whose `scenario` value in the insiders table is 4. Any failure is
# reported and treated as "no users" so the rest of the cell still runs.
try:
    is_scenario_4 = insiders['scenario'] == 4
    scene4_user = insiders.loc[is_scenario_4, 'user'].dropna().unique().tolist()
    print("Scenario 4 insider users:", scene4_user)
except Exception as e:
    print("Could not load scenario 4 insiders:", e)
    scene4_user = []

if not scene4_user:
    print("Scenario 4 insiders not provided.")
else:
    report_columns = ['user', 'day', 'lof_score', 'lof_percentile']

    # All user-days belonging to those users, in chronological order per user.
    s4_rows = features_df.loc[features_df['user'].isin(scene4_user)].copy()
    s4_rows['day'] = pd.to_datetime(s4_rows['day'])
    s4_rows = s4_rows.sort_values(['user', 'day'])

    print("\nScenario 4 insider user-days (LOF):")
    display(s4_rows[report_columns])

    # The 20 highest-scoring days among them.
    print("\nTop Scenario 4 anomalies (LOF):")
    s4_top = s4_rows.sort_values('lof_score', ascending=False).head(20)
    display(s4_top[report_columns])

    # Days ranked in the top 1% of all user-days.
    suspicious = s4_rows.loc[s4_rows['lof_percentile'] >= 0.99]
    print("\nScenario 4 days with LOF percentile ≥ 0.99:")
    if suspicious.empty:
        print("No Scenario 4 insider days above 99th percentile.")
    else:
        display(suspicious[report_columns])


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plotting_lofAnomalies(features_df, user_id):
    """Plot one user's LOF anomaly score over time, rescaled to 0-1 for that user.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Must provide the columns ``user``, ``day`` and ``lof_score``.
    user_id
        Value matched against the ``user`` column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If the user has no rows a
        message is printed and nothing is drawn.
    """
    user_df = features_df[features_df["user"] == user_id].copy()
    user_df = user_df.sort_values("day")

    # Guard against unknown users: min()/max() on an empty array raise
    # ValueError, so report and stop instead.
    if user_df.empty:
        print(f"No data for {user_id}")
        return

    scores = user_df["lof_score"].values.astype(float)

    # Per-user min-max scaling so the shape of the curve is visible even when
    # the user's scores occupy a narrow band of the global range.
    min_s, max_s = scores.min(), scores.max()
    if max_s - min_s < 1e-12:
        # All scores are (numerically) identical: avoid dividing by zero and
        # draw a flat line just above zero.
        scaled = (scores - min_s) + 1e-6
    else:
        scaled = (scores - min_s) / (max_s - min_s)

    user_df["scaled_score"] = scaled

    # Plot
    plt.figure(figsize=(12, 4))
    plt.plot(user_df["day"], user_df["scaled_score"], marker="o", ms=4)
    plt.title(f"Anomaly scores over time — {user_id}")
    plt.xlabel("Date")
    plt.ylabel("Scaled anomaly score (0–1 per user)")
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()




# Draw the per-user anomaly-score timeline for each user of interest.
for user_id in ("PLJ1771", "CDE1846"):
    plotting_lofAnomalies(features_df, user_id)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def percentiles_plot(features_df, user_id):
    """Plot the `lof_percentile` column over time for the rows of one user.

    Rows are selected by matching `user_id` against the `user` column and
    ordered by `day`. When no rows match, a message is printed and no figure
    is created.
    """
    user_rows = (
        features_df.loc[features_df['user'] == user_id]
        .copy()
        .sort_values("day")
    )

    if user_rows.empty:
        print(f"No data for {user_id}")
        return

    fig, ax = plt.subplots(figsize=(14,4))
    ax.plot(
        user_rows['day'],
        user_rows['lof_percentile'],
        marker='o',
        markersize=4,
        linewidth=1,
    )
    ax.set_title(f"Anomaly Percentiles Over Time — {user_id}", fontsize=15)
    ax.set_xlabel("Date")
    ax.set_ylabel("LOF Percentile (0–1 scale)")
    ax.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    # Fix the y-axis to 0-1 so plots for different users are comparable.
    ax.set_ylim(0, 1)
    plt.tight_layout()
    plt.show()


# Draw the percentile timeline for each user of interest.
for user_id in ("PLJ1771", "CDE1846"):
    percentiles_plot(features_df, user_id)
