In [None]:
import os

from google.colab import drive

# Mount Google Drive so the competition files are reachable from the Colab VM.
drive.mount('/content/drive')

# Folder on the mounted drive that read_csv() resolves file names against.
data_folder = '/content/drive/MyDrive/235Kaggle/235kaggle'

# Fail fast with a readable message when the folder is missing. The previous
# bare os.listdir() call served the same purpose only implicitly: its return
# value was never shown because it was not the last expression of the cell.
if not os.path.isdir(data_folder):
    raise FileNotFoundError(f"Data folder not found: {data_folder}")

import pandas as pd
def read_csv(file_name):
    """Read one CSV from the shared data folder.

    Parameters
    ----------
    file_name : str
        File name (or relative path) joined onto the module-level
        ``data_folder``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the resolved path, using its
        default options.
    """
    csv_path = os.path.join(data_folder, file_name)
    return pd.read_csv(csv_path)

In [None]:
# Show the entries of the data folder (rendered as this cell's output).
os.listdir(data_folder)

In [None]:
# Load every raw table and preview its first rows.
# A notebook cell only renders its *last* bare expression, so each preview is
# passed to display() explicitly; a bare `.head()` in the middle of the cell
# would be evaluated and silently thrown away.
logon = read_csv('logon.csv')
display(logon.head())

emails = read_csv('email.csv')
display(emails.head())

files = read_csv('file.csv')
display(files.head())

decoys = read_csv('decoy_file.csv')
display(decoys.head())

device = read_csv('device.csv')
display(device.head())

psychometric = read_csv('psychometric.csv')
display(psychometric.head())

insiders = read_csv('insiders.csv')
display(insiders.head())

In [None]:
# Work on copies so the raw tables loaded earlier stay untouched.
logon_df, device_df, email_df = logon.copy(), device.copy(), emails.copy()

# Parse the timestamp column of each event table.
for df in (logon_df, device_df, email_df):
    df['date'] = pd.to_datetime(df['date'])

# Derive the calendar/time-of-day helper columns used by the aggregations.
for df in (logon_df, device_df, email_df):
    stamp = df['date']
    df['day'] = stamp.dt.date
    df['hour'] = stamp.dt.hour
    # "After hours" means before 06:00 or from 18:00 onwards.
    df['is_after_hours'] = (df['hour'] < 6) | (df['hour'] >= 18)
    # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    df['is_weekend'] = stamp.dt.dayofweek >= 5

print("Date parsing complete.")


In [None]:
# Every user that appears at least once in the logon table.
all_users = set(logon_df['user'].unique())
print(f"Total unique users: {len(all_users)}")

# Count logon-table rows per (user, pc) pair ...
user_pc_counts = (
    logon_df
    .groupby(['user', 'pc'])
    .size()
    .reset_index(name='count')
)

# ... and keep, for each user, the pc with the highest count. idxmax returns
# the first maximum, so ties resolve to the first pc in group order.
busiest_rows = user_pc_counts.groupby('user')['count'].idxmax()
user_primary_pc = (
    user_pc_counts
    .loc[busiest_rows]
    .set_index('user')['pc']
    .to_dict()
)

print("User-PC mapping complete.")


In [None]:
# Logon aggregates: one row per (user, day), counting only "Logon" events.
is_logon = logon_df['activity'] == "Logon"

logon_agg = logon_df[is_logon].groupby(['user','day']).agg(
    logon_count        = ('id','count'),
    after_hours_logons = ('is_after_hours','sum'),
    unique_pcs         = ('pc','nunique')
).reset_index()

# Foreign PC logons.
# Vectorised lookup of each row's primary pc instead of a Python-level
# DataFrame.apply(axis=1) over the whole logon table. Users without a mapping
# become NaN, which never compares equal to a pc name, so they are flagged as
# "not primary" exactly like the previous `.get(user, '')` fallback.
logon_df['is_primary_pc'] = logon_df['pc'] == logon_df['user'].map(user_primary_pc)

foreign_logons = (
    logon_df[is_logon & (~logon_df['is_primary_pc'])]
    .groupby(['user','day']).size().reset_index(name='foreign_pc_logons')
)

# Device aggregates: one row per (user, day), counting only "Connect" events.
device_agg = device_df[device_df['activity']=="Connect"].groupby(['user','day']).agg(
    device_connects    = ('id','count'),
    after_hours_device = ('is_after_hours','sum')
).reset_index()

# Email aggregates
def count_recipients(row):
    """Count the addresses listed in an email row's to/cc/bcc fields.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        May contain any of the keys ``'to'``, ``'cc'`` and ``'bcc'``; each
        value is a ``;``-separated address list, or missing/empty.

    Returns
    -------
    int
        Total number of non-empty addresses across the three fields.
    """
    total = 0
    for col in ['to','cc','bcc']:
        if col in row and pd.notna(row[col]) and row[col]:
            # Skip blank fragments so a trailing or doubled ';' (e.g. "a;b;")
            # is not counted as an extra recipient.
            total += sum(1 for addr in str(row[col]).split(';') if addr.strip())
    return total

# Per-message recipient total, computed row by row with count_recipients().
email_df['recipient_count'] = email_df.apply(count_recipients, axis=1)

# Only "Send" events contribute to the daily email features.
email_sent = email_df.loc[email_df['activity'].eq("Send")]

# Output column -> (source column, aggregation) for the per-(user, day) summary.
recipient_stats = {
    'emails_sent':      ('id', 'count'),
    'max_recipients':   ('recipient_count', 'max'),
    'total_recipients': ('recipient_count', 'sum'),
    'avg_recipients':   ('recipient_count', 'mean'),
}
email_agg = (
    email_sent
    .groupby(['user', 'day'], as_index=False)
    .agg(**recipient_stats)
)

print("Daily feature aggregation complete.")


In [None]:
# Assemble the per-(user, day) feature matrix.
# The logon aggregate is the left side of every join, so only days that have
# at least one "Logon" event end up in the matrix; counts missing from the
# other tables are filled with 0.
merge_keys = ['user', 'day']

features_df = (
    logon_agg
    .merge(foreign_logons, on=merge_keys, how='left')
    .merge(device_agg, on=merge_keys, how='left')
    .merge(email_agg, on=merge_keys, how='left')
    .fillna(0)
    .assign(day=lambda frame: pd.to_datetime(frame['day']))
    .sort_values(merge_keys)
    .reset_index(drop=True)
)

print("FEATURE MATRIX SHAPE:", features_df.shape)
features_df.head()


In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Columns fed to the model, in the order the network sees them.
feature_cols = [
    # logon behaviour
    'logon_count', 'after_hours_logons', 'unique_pcs', 'foreign_pc_logons',
    # removable-device behaviour
    'device_connects', 'after_hours_device',
    # email behaviour
    'emails_sent', 'max_recipients', 'total_recipients', 'avg_recipients',
]

# Standardise each feature using statistics fitted on the full feature matrix.
scaler = StandardScaler().fit(features_df[feature_cols])

# Keep the unscaled matrix intact; only the copy receives the scaled values.
features_scaled = features_df.copy()
features_scaled[feature_cols] = scaler.transform(features_df[feature_cols])

features_scaled.head()


In [None]:
import numpy as np

In [None]:
# Window length: each sequence covers 30 consecutive rows of one user.
# Rows are per observed day, so this is a 30 *calendar* day window only when
# the input has one row for every calendar day.
SEQ_LENGTH = 30

def generate_seq(df_scaled, feature_cols, seq_len=SEQ_LENGTH):
    """Build overlapping fixed-length windows of feature rows for each user.

    Parameters
    ----------
    df_scaled : pandas.DataFrame
        Must contain the columns ``'user'``, ``'day'`` and every name in
        ``feature_cols``.
    feature_cols : list of str
        Feature columns to include, in output order.
    seq_len : int, default SEQ_LENGTH
        Number of consecutive rows (sorted by ``'day'``) in each window.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_windows, seq_len, len(feature_cols))``. When no user has at
        least ``seq_len`` rows the array is empty but keeps all three axes.
    meta : pandas.DataFrame
        One row per window with columns ``'user'`` and ``'day'``, where
        ``'day'`` is the day of the window's last row. The two columns are
        present even when there are no windows.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    X_list = []
    meta_rows = []

    for user, user_df in df_scaled.groupby('user'):
        user_df = user_df.sort_values('day')
        vals = user_df[feature_cols].values
        days = user_df['day'].values

        # Users with fewer than seq_len rows yield a non-positive count and
        # therefore contribute no windows.
        n_windows = len(vals) - seq_len + 1
        for start in range(max(n_windows, 0)):
            end = start + seq_len
            X_list.append(vals[start:end])
            meta_rows.append({'user': user, 'day': days[end-1]})

    if not X_list:
        # np.array([]) would have shape (0,) and pd.DataFrame([]) no columns,
        # which breaks downstream indexing and groupby(['user', 'day']).
        return (
            np.empty((0, seq_len, len(feature_cols))),
            pd.DataFrame(columns=['user', 'day']),
        )

    return np.array(X_list), pd.DataFrame(meta_rows)

# Windowed training tensor plus, for every window, its user and last day.
X_train, meta_train = generate_seq(
    df_scaled=features_scaled,
    feature_cols=feature_cols,
)

print("X_train shape:", X_train.shape)
meta_train.head()


In [None]:
!pip install tensorflow

from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.models import Model


In [None]:
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow generators before any layer is created
# so weight initialisation and batch shuffling repeat across
# Restart & Run All. Without it every run produces different scores.
SEED = 42
set_random_seed(SEED)

t_steps = SEQ_LENGTH
features_N = len(feature_cols)

# Encoder: two stacked LSTMs that compress a (t_steps, features_N) window
# into a single 32-dimensional vector.
inputs = Input(shape=(t_steps, features_N))
x = LSTM(64, return_sequences=True)(inputs)
x = LSTM(32, return_sequences=False)(x)

# Decoder: repeat that vector once per time step and unroll it back into a
# sequence with one features_N-wide output per step.
x = RepeatVector(t_steps)(x)
x = LSTM(64, return_sequences=True)(x)
outputs = TimeDistributed(Dense(features_N))(x)

# Trained to reproduce its own input; the loss is mean squared error.
LSTM_AUENCO = Model(inputs, outputs)
LSTM_AUENCO.compile(optimizer='adam', loss='mse')
LSTM_AUENCO.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Training options, collected in one place for readability.
fit_options = dict(
    epochs=20,
    batch_size=256,
    validation_split=0.1,
    shuffle=True,
    callbacks=[stopping],
    verbose=1,
)

# Autoencoder: the input windows are also the reconstruction target.
history = LSTM_AUENCO.fit(X_train, X_train, **fit_options)


In [None]:
# Scoring uses exactly the windows the model was trained on, so reuse them
# instead of calling generate_seq() again with identical arguments, which
# would rebuild the whole window tensor and hold two copies in memory.
# The metadata is copied because a 'recon_error' column is added to it later
# and meta_train should stay unchanged.
all_x = X_train
all_META = meta_train.copy()
print("X_all shape:", all_x.shape)


In [None]:
# Reconstruct every window with the trained autoencoder.
X_all_pred = LSTM_AUENCO.predict(all_x, batch_size=256, verbose=1)

# One score per window: mean squared error over both the time-step axis and
# the feature axis.
squared_diff = np.square(all_x - X_all_pred)
recon_errors = squared_diff.mean(axis=(1, 2))
all_META['recon_error'] = recon_errors

all_META.head()


In [None]:
# Collapse window scores to one value per (user, day), keeping the largest.
daily_sco = (
    all_META
    .groupby(['user', 'day'], as_index=False)
    .agg(lstm_error=('recon_error', 'max'))
)

error_by_day = daily_sco['lstm_error']
# Rank 1 is the largest error; the percentile grows with the error.
daily_sco['lstm_rank'] = error_by_day.rank(ascending=False)
daily_sco['lstm_percentile'] = error_by_day.rank(pct=True)

print(daily_sco.head())


In [None]:
# Attach the daily scores to the unscaled feature matrix.
lstm_feat = features_df.merge(daily_sco, on=['user', 'day'], how='left')

# Days with no score get neutral values: zero error, zero percentile, and a
# rank one place behind the worst ranked day.
worst_rank = lstm_feat['lstm_rank'].max() + 1
lstm_feat = lstm_feat.fillna({
    'lstm_error': 0,
    'lstm_rank': worst_rank,
    'lstm_percentile': 0,
})

lstm_feat.head()


In [None]:
# Scenario 3 check: look up the score of one specific user on one specific day.
s3_user = "PLJ1771"
s3_day  = pd.to_datetime("2010-08-12")

matches_user = lstm_feat['user'] == s3_user
matches_day = lstm_feat['day'] == s3_day
s3_row = lstm_feat[matches_user & matches_day]

display(s3_row)

if s3_row.empty:
    print("Scenario 3 row not found.")
else:
    # Report the first matching row.
    row = s3_row.iloc[0]
    print("\nScenario 3 Evaluation:")
    print("User:", s3_user, "| Attack day:", s3_day.date())
    print("LSTM Error:", row['lstm_error'])
    print("Rank (1 = most anomalous):", int(row['lstm_rank']))
    print("Percentile:", round(row['lstm_percentile'],4))


In [None]:
# Compact view of the same Scenario 3 lookup: identifiers and rank columns only.
s3_user = "PLJ1771"
s3_day  = pd.to_datetime("2010-08-12")

summary_cols = ['user','day','lstm_rank','lstm_percentile']
s3_mask = (lstm_feat['user'] == s3_user) & (lstm_feat['day'] == s3_day)
s3_row = lstm_feat.loc[s3_mask, summary_cols]

display(s3_row)


In [None]:
# Users flagged as scenario 4 in the insiders table.
# Best-effort by design: if the table was not loaded or lacks the expected
# columns, fall back to an empty list. Only those two failure modes are
# caught, and the reason is printed, so an unrelated error (or a keyboard
# interrupt) is no longer swallowed silently.
try:
    scenario4_users = insiders[insiders['scenario']==4]['user'].unique().tolist()
except (KeyError, NameError) as err:
    print(f"Could not read scenario 4 insiders ({type(err).__name__}: {err}); continuing without them.")
    scenario4_users = []


In [None]:
# Show the 20 highest-error days among the scenario 4 users, if any are known.
if not scenario4_users:
    print("Scenario 4 insiders not provided.")
else:
    belongs_to_s4 = lstm_feat['user'].isin(scenario4_users)
    s4_rows = lstm_feat[belongs_to_s4]
    s4_top = s4_rows.sort_values('lstm_error', ascending=False).head(20)
    print("Top Scenario 4 anomalies:")
    display(s4_top[['user','day','lstm_rank','lstm_percentile']])


In [None]:
import matplotlib.pyplot as plt

def gen_plots(lstm_feat, user_id):
    """Plot one user's LSTM error over time, min-max scaled to the 0-1 range.

    Parameters
    ----------
    lstm_feat : pandas.DataFrame
        Must contain the columns ``'user'``, ``'day'`` and ``'lstm_error'``.
    user_id : str
        Value of the ``'user'`` column to plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If ``user_id`` has no rows, a
        message is printed and nothing is drawn.
    """
    user_df = lstm_feat[lstm_feat['user'] == user_id].sort_values('day')

    # An unknown user yields an empty selection; min()/max() on an empty array
    # raise ValueError, so bail out with a readable message instead.
    if user_df.empty:
        print(f"No rows found for user {user_id}; nothing to plot.")
        return

    err = user_df['lstm_error'].values
    # Min-max scale per user; the tiny epsilon avoids dividing by zero when
    # every error value is identical.
    err_scaled = (err - err.min()) / (err.max() - err.min() + 1e-12)

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(user_df['day'], err_scaled, marker='o', markersize=3, linewidth=1)

    ax.set_title(f"Anomaly scores over time — {user_id}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Scaled LSTM error (0–1)")
    ax.grid(True, alpha=0.3)

    fig.autofmt_xdate()
    plt.tight_layout()
    plt.show()


# Draw the error timeline for each user of interest, one figure per user.
for plotted_user in ("PLJ1771", "CDE1846"):
    gen_plots(lstm_feat, plotted_user)

