# Dataset exploration
This notebook runs the project pipeline steps in order:
1. fetch raw data
2. extract structure
3. compute features
4. build dataset

It demonstrates a mocked version of the dataset extraction process.

In [None]:
from pathlib import Path

import pandas as pd

from src.data.build_dataset import build_dataset
from src.data.compute_features import compute_features
from src.data.extract_structure import extract_structure
from src.data.fetch_raw import fetch_raw
from src.utils.config import load_config

# Load the "data" and "seeds" configuration sections and echo both so the
# settings used for this run are recorded in the cell output.
data_cfg = load_config("data")
seeds_cfg = load_config("seeds")

print("[Notebook] Configs loaded:")
for loaded_cfg in (data_cfg, seeds_cfg):
    print(loaded_cfg)

# Pipeline stages in execution order, each paired with the message announced
# right before the stage is invoked.
pipeline_steps = (
    ("[Notebook] Fetching raw data...", fetch_raw),
    ("[Notebook] Extracting structure...", extract_structure),
    ("[Notebook] Computing features...", compute_features),
    ("[Notebook] Building dataset...", build_dataset),
)

for announcement, run_step in pipeline_steps:
    print(announcement)
    run_step()

print("[Notebook] Pipeline completed (or attempted).")

[Notebook] Configs loaded:
{'raw_dir': 'data/raw/', 'interim_dir': 'data/interim/', 'processed_dir': 'data/processed/', 'dataset_filename': 'dataset.csv', 'overwrite': False}
{'data_seed': 90, 'sampling_seed': 90, 'training_seed': 90, 'evaluation_seed': 90, 'deterministic': {'force_hash_sorting': True, 'enforce_timestamp_free_ops': True}}
[Notebook] Fetching raw data...
[fetch_raw] Saved mock raw metadata to data/raw/raw_metadata.json
[Notebook] Extracting structure...
[extract_structure] Saved mock structure to data/interim/structure.json
[Notebook] Computing features...
[compute_features] Saved mock features to data/interim/features.json
[Notebook] Building dataset...
[build_dataset] Mock dataset saved to data/processed/dataset.csv
[Notebook] Pipeline completed (or attempted).


In [None]:
# Resolve the dataset path from the loaded data config; the literal defaults
# are used when a key is missing from the config.
dataset_file = Path(data_cfg.get("processed_dir", "data/processed")) / data_cfg.get(
    "dataset_filename", "dataset.csv"
)
# Make sure the containing directory exists (no-op when it is already there).
dataset_file.parent.mkdir(parents=True, exist_ok=True)

if not dataset_file.exists():
    print(
        "[Notebook] Dataset not found after pipeline. Check previous cells for errors."
    )
else:
    print(f"[Notebook] Dataset exists at {dataset_file}")

    df = pd.read_csv(dataset_file)
    display(df.head())
    print(df.describe())

    # The "stars" column is optional; only scale and plot it when present.
    if "stars" in df.columns:
        max_stars = df["stars"].max()
        # Dividing by the maximum is only meaningful when it is positive:
        # a zero maximum yields NaN/inf values, and the maximum itself is NaN
        # when the column has no non-missing values (NaN > 0 is False).
        if max_stars > 0:
            df["stars_scaled"] = df["stars"] / max_stars
            df.hist(column="stars_scaled")
        else:
            print(
                "[Notebook] Skipping stars_scaled: 'stars' has no positive values."
            )

[Notebook] Dataset exists at data/processed/dataset.csv


Unnamed: 0,repo_url,num_py_files,num_js_files,num_notebooks,avg_files_per_dir
0,https://github.com/user/repo1,4,5,0,2
1,https://github.com/user/repo2,8,5,2,1


       num_py_files  num_js_files  num_notebooks  avg_files_per_dir
count      2.000000           2.0       2.000000           2.000000
mean       6.000000           5.0       1.000000           1.500000
std        2.828427           0.0       1.414214           0.707107
min        4.000000           5.0       0.000000           1.000000
25%        5.000000           5.0       0.500000           1.250000
50%        6.000000           5.0       1.000000           1.500000
75%        7.000000           5.0       1.500000           1.750000
max        8.000000           5.0       2.000000           2.000000
