# === Cell 1: Markdown ===
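# 📊 Dataset Exploration for Limbic LLM
This notebook loads the training and validation splits for the Limbic LLM project, previews a few training samples, and summarizes and plots the prompt and response lengths (in characters) of the training set.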

---

In [None]:
### Cell 2: Code
import pandas as pd
import json
from pathlib import Path

# Load training and validation data.
# NOTE: despite the ".json" extension, both files are parsed as JSON Lines
# (one JSON document per line), not as a single JSON document.
train_path = Path("../data/processed/train.json")
val_path = Path("../data/processed/val.json")


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the file to read (str or Path).

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(path, encoding="utf-8") as f:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]


train_data = load_jsonl(train_path)
val_data = load_jsonl(val_path)

# Convert to DataFrame
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)

print(f"Train samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")


In [None]:
### Cell 3: Code
# Display sample training data.
# A fixed random_state keeps the preview identical across "Restart & Run All";
# min() keeps sample() from raising ValueError when the frame has fewer than
# 3 rows.
train_df.sample(n=min(3, len(train_df)), random_state=42)

In [None]:
### Cell 4: Code
# Basic stats: add a "<field>_length" column for each text field, then print
# summary statistics for each new column.
# Assumes both columns hold strings, so .str.len() is a character count —
# TODO confirm against the dataset schema.
for field in ("prompt", "response"):
    train_df[f"{field}_length"] = train_df[field].str.len()

for heading, length_column in (
    ("Prompt length stats:", "prompt_length"),
    ("\nResponse length stats:", "response_length"),
):
    print(heading)
    print(train_df[length_column].describe())

In [None]:

### Cell 5: Code
# Distribution plots
import matplotlib.pyplot as plt
import seaborn as sns

def plot_length_distribution(lengths, title, bins=50):
    """Show a histogram with a KDE overlay for a series of text lengths.

    Args:
        lengths: Values to plot (e.g. a DataFrame column of lengths).
        title: Title displayed above the figure.
        bins: Number of histogram bins.
    """
    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so each call draws on its own axes.
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.histplot(lengths, bins=bins, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Characters")
    plt.show()


plot_length_distribution(train_df['prompt_length'], "Prompt Length Distribution")
plot_length_distribution(train_df['response_length'], "Response Length Distribution")