# Preprocessing

### Project Setup - Environment and File checks

In [None]:
import os
import sys
import glob

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Report interpreter and core library versions so runs can be reproduced.
print("Environment check:")
for component, version_string in (
    ("Python:", sys.version),
    ("Pandas:", pd.__version__),
    ("NumPy:", np.__version__),
):
    print(component, version_string)


In [None]:
# Resolve the project root: when Jupyter was launched from inside notebooks/,
# the root is the parent directory; otherwise it is the working directory.
cwd = os.getcwd()
project_root = os.path.dirname(cwd) if os.path.basename(cwd) == "notebooks" else cwd

print(f"Project root: {project_root}")

# Input folder (read) and output folders (created later when saving).
data_dir = os.path.join(project_root, "data")
processed_dir = os.path.join(data_dir, "processed")
extracted_dir = os.path.join(project_root, "ExtractedData")

# Collect one message per missing required input so they are all reported
# together before the notebook stops.
missing = (
    []
    if os.path.exists(extracted_dir)
    else ["Missing folder: ExtractedData/ (copy it into the project root after running feature_extraction.py)"]
)

if missing:
    print("\n".join(missing))
    raise SystemExit("\nError: Required input folders are missing. Fix these before running the notebook.")

print(f"Found ExtractedData folder: {extracted_dir}")


### Choose granularity level and load data

In [None]:
# File-name prefix of the extracted CSVs to load.
GRANULARITY = "week"   # can be "day" or "subsession"

pattern = os.path.join(extracted_dir, f"{GRANULARITY}*.csv")
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# the concatenated row order -- and therefore the seeded train/val/test split
# made from it -- is the same on every machine and every run.
files = sorted(glob.glob(pattern))

print(f"Looking for files: {pattern}")
print(f"Found {len(files)} file(s).")

if not files:
    raise FileNotFoundError(f"No CSV files found matching '{GRANULARITY}*.csv' in {extracted_dir}")

# Stack every matching file into one frame with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
print(f"Loaded data shape: {df.shape}")
df.head()


### Clean data and create labels

In [None]:
# Show a sample of the column names so the loaded schema can be checked by eye.
print("Columns (first 20):")
print(list(df.columns)[:20])

# Identifier / bookkeeping columns that are excluded from the feature matrix.
meta_cols = [
    "user",
    "week",
    "day",
    "sessionid",
    "starttime",
    "endtime",
    "subs_ind",
]

if "insider" not in df.columns:
    raise KeyError("Expected column 'insider' not found in the extracted data.")

# Build the exclusion set once instead of re-concatenating a list per column.
excluded_cols = set(meta_cols) | {"insider"}
feature_cols = [c for c in df.columns if c not in excluded_cols]

# fillna already returns a new frame, so no explicit copy is needed first.
X = df[feature_cols].fillna(0)

# Binary label, insider vs normal: 1 for any non-zero 'insider' value, else 0.
# Vectorized comparison replaces the per-row Python lambda.
# NOTE: a missing (NaN) 'insider' value compares unequal to 0 and is therefore
# labelled 1, exactly as the previous row-wise version did.
y_binary = df["insider"].ne(0).astype("int64")

# Features plus the raw 'insider' value and the derived binary 'label'.
full_df = X.assign(insider=df["insider"], label=y_binary)

print(f"\nFinal feature matrix shape: {X.shape}")
print("Label distribution (binary):")
print(full_df["label"].value_counts())


### Save cleaned data and create train/validation/test splits

In [None]:
# Ensure both output folders exist before anything is written.
for output_folder in (data_dir, processed_dir):
    os.makedirs(output_folder, exist_ok=True)

# Write the full labelled dataset (features + 'insider' + 'label') as one CSV.
features_path = os.path.join(data_dir, "features.csv")
full_df.to_csv(features_path, index=False)

print(f"Saved unified features dataset to: {features_path}")


In [None]:
# Step 1: hold out 20% of all rows as the test set, stratified on the label.
train_val_df, test_df = train_test_split(
    full_df, test_size=0.20, random_state=42, stratify=full_df["label"]
)

# Step 2: split the remaining 80% into train/val.
# 25% of 80% = 20% of the total, giving a 60/20/20 split overall.
train_df, val_df = train_test_split(
    train_val_df, test_size=0.25, random_state=42, stratify=train_val_df["label"]
)

print("Split shapes:")
for shape_prefix, split_frame in (
    ("Train:", train_df),
    ("Val:  ", val_df),
    ("Test: ", test_df),
):
    print(shape_prefix, split_frame.shape)

print("\nLabel distribution by split:")
for dist_prefix, split_frame in (
    ("Train:\n", train_df),
    ("Val:\n", val_df),
    ("Test:\n", test_df),
):
    print(dist_prefix, split_frame["label"].value_counts(), "\n")


In [None]:
# Destination files for the three splits inside data/processed/.
train_path = os.path.join(processed_dir, "train.csv")
val_path = os.path.join(processed_dir, "val.csv")
test_path = os.path.join(processed_dir, "test.csv")

# Write each split without the pandas index column.
for split_frame, destination in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    split_frame.to_csv(destination, index=False)

print("Saved processed splits:")
print(f"  Train: {train_path}")
print(f"  Val:   {val_path}")
print(f"  Test:  {test_path}")

# Point the reader at the notebooks that consume these files.
print("\nPreprocessing complete. You can now use these files in:")
for notebook_line in (
    "  - 02_model_comparison.ipynb",
    "  - 03_model_tuning.ipynb",
    "  - 04_feature_selection.ipynb",
    "  - 05_threshold_evaluation.ipynb",
):
    print(notebook_line)
