In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Use the seaborn-flavoured matplotlib theme for every figure in this notebook.
plt.style.use("seaborn-v0_8")

# Base data directory (relative to your project root)
DATA_DIR = os.path.join("..", "data", "plasticc")

# Both raw training files live directly under DATA_DIR.
META_PATH, LC_PATH = (
    os.path.join(DATA_DIR, filename)
    for filename in ("training_set_metadata.csv", "training_set.csv")
)

# Echo the resolved paths as the cell output.
(META_PATH, LC_PATH)

In [2]:
# One row per astronomical object; the `target` column holds the class code.
metadata = pd.read_csv(META_PATH)
print(f"Total objects in metadata: {len(metadata)}")

# Show the first rows to inspect the available columns.
metadata.head()

Total objects in metadata: 7848


Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [3]:
# Number of objects per target code, ordered by the code itself rather than
# by frequency.
class_counts = (
    metadata["target"]
    .value_counts()
    .sort_index()
)
class_counts

target
6      151
15     495
16     924
42    1193
52     183
53      30
62     484
64     102
65     981
67     208
88     370
90    2313
92     239
95     175
Name: count, dtype: int64

In [4]:
# Maps the numeric `target` codes found in the metadata to readable class
# labels. Only the codes listed here are renamed by `.rename(index=class_map)`;
# any other code would be left as-is.
class_map = {
    90: "SNIa",        # Type Ia Supernova
    67: "SNIa-91bg",   # Subluminous Ia
    52: "SNIax",
    42: "SNII",        # Type II Supernova
    62: "SNIbc",
    95: "SLSN-I",      # Superluminous SN
    15: "TDE",
    64: "KN",          # Kilonova
    88: "AGN",
    92: "RRL",
    65: "M-dwarf",
    16: "EB",
    53: "Mira",
    # The label starts with the micro sign (U+00B5). It is written as an
    # escape because the literal character had been corrupted into a
    # two-character mojibake sequence by a bad UTF-8 -> Latin-1 round trip.
    6:  "\u00b5Lens-Single",
}

# Replace the numeric target codes in the index with their labels, then
# flatten the Series into a two-column ("class", "count") table.
labelled_counts = class_counts.rename(index=class_map)
readable_counts = labelled_counts.rename_axis("class").reset_index(name="count")

readable_counts

Unnamed: 0,class,count
0,ÂµLens-Single,151
1,TDE,495
2,EB,924
3,SNII,1193
4,SNIax,183
5,Mira,30
6,SNIbc,484
7,KN,102
8,M-dwarf,981
9,SNIa-91bg,208


In [5]:
# Binary task: keep only Type Ia (90) and Type II (42) supernovae.
SELECTED_CLASSES = [90, 42]

# .copy() detaches the subset from `metadata`, so later edits cannot write
# through to (or warn about) the full table.
binary_metadata = metadata.loc[metadata["target"].isin(SELECTED_CLASSES)].copy()
print(f"Total selected objects (SNIa + SNII): {len(binary_metadata)}")

# Per-class totals of the subset, shown with readable labels.
(
    binary_metadata["target"]
    .value_counts()
    .rename(index=class_map)
)

Total selected objects (SNIa + SNII): 3506


target
SNIa    2313
SNII    1193
Name: count, dtype: int64

np.random.seed(42)

N_PER_CLASS = 300  # you can adjust this later

# Candidate pools, one per class.
snia_all = binary_metadata.loc[binary_metadata["target"].eq(90)]
snii_all = binary_metadata.loc[binary_metadata["target"].eq(42)]

# Never request more rows than a pool actually holds.
n_snia, n_snii = (min(N_PER_CLASS, len(pool)) for pool in (snia_all, snii_all))

# Fixed random_state makes each draw repeatable regardless of global RNG state.
snia_sample = snia_all.sample(n=n_snia, random_state=42)
snii_sample = snii_all.sample(n=n_snii, random_state=42)

# Stack the two draws (SNIa first) and renumber the rows from zero.
sample_metadata = pd.concat([snia_sample, snii_sample], ignore_index=True)

print(f"Sample size: {len(sample_metadata)}")
print(f"SNIa: {sample_metadata['target'].eq(90).sum()}")
print(f"SNII: {sample_metadata['target'].eq(42).sum()}")

sample_metadata.head()

In [7]:
# Persist the sampled metadata next to the raw files; the positional row
# index is not written.
SAMPLE_META_PATH = os.path.join(DATA_DIR, "sample_metadata.csv")
sample_metadata.to_csv(path_or_buf=SAMPLE_META_PATH, index=False)

# Echo the destination as the cell output.
SAMPLE_META_PATH

'../data/plasticc/sample_metadata.csv'

In [8]:
# Peek at the light-curve file: parse just its first 10,000 rows so the
# column layout can be inspected without loading the whole CSV.
lightcurves_preview = pd.read_csv(LC_PATH, nrows=10_000)
print(lightcurves_preview.shape)

lightcurves_preview.head()

(10000, 6)


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [9]:
# Object ids of the sampled rows, as a NumPy array ...
sample_ids = sample_metadata.loc[:, "object_id"].values
# ... and as a set, which is what the chunked filter below matches against.
sample_ids_set = {object_id for object_id in sample_ids}

# Number of distinct sampled ids.
len(sample_ids_set)

600

In [10]:
# Stream the full light-curve CSV in fixed-size chunks and keep only the
# observations whose object_id belongs to the sampled set, so the whole file
# never has to be held in memory at once.
chunksize = 100000  # rows per chunk
collected_chunks = []
rows_read = 0  # running total of rows actually parsed so far

print("Building sample_lightcurves.csv ...")

for i, chunk in enumerate(pd.read_csv(LC_PATH, chunksize=chunksize)):
    rows_read += len(chunk)

    # Filter rows where object_id is in our sample set
    mask = chunk["object_id"].isin(sample_ids_set)
    sub = chunk[mask]
    if not sub.empty:
        collected_chunks.append(sub)

    # Report every 10th chunk. The message uses the running total, so it
    # counts the chunk just handled and stays correct for a short final chunk
    # (the previous `i * chunksize` was always one chunk behind).
    if i % 10 == 0:
        print(f"Processed {rows_read:,} rows...")

# Concatenate all matching rows
if collected_chunks:
    sample_lightcurves = pd.concat(collected_chunks, ignore_index=True)
else:
    # Nothing matched: fall back to a zero-row frame that still carries the
    # source columns. A column-less pd.DataFrame() would be saved as an empty
    # file that pd.read_csv cannot parse back.
    sample_lightcurves = pd.read_csv(LC_PATH, nrows=0)

print("Total rows for sample objects:", len(sample_lightcurves))
sample_lightcurves.head()

Building sample_lightcurves.csv ...
Processed 0 rows...
Processed 1,000,000 rows...
Total rows for sample objects: 114598


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,2300,59770.3662,2,3.291617,0.627422,1
1,2300,59770.374,1,-1.090546,0.609956,0
2,2300,59770.3817,3,18.823622,1.074484,1
3,2300,59770.3928,4,18.912529,1.824963,1
4,2300,59770.4039,5,14.538424,4.636243,0


In [11]:
# Persist the filtered light curves next to the raw files; the positional row
# index is not written.
SAMPLE_LC_PATH = os.path.join(DATA_DIR, "sample_lightcurves.csv")
sample_lightcurves.to_csv(path_or_buf=SAMPLE_LC_PATH, index=False)

# Echo the destination as the cell output.
SAMPLE_LC_PATH

'../data/plasticc/sample_lightcurves.csv'

In [12]:
import pandas as pd

print("Checking saved sample files...\n")

# Reload both files from disk so the check covers what was actually written,
# not the in-memory frames.
meta = pd.read_csv(SAMPLE_META_PATH)
lc = pd.read_csv(SAMPLE_LC_PATH)

print(f"Metadata shape: {meta.shape}")
print(f"Lightcurve shape: {lc.shape}")

print("\nClass counts:")
print(meta["target"].value_counts())

print("\nExample entries:")
# Rich-render the first rows of each table, metadata first.
display(meta.head(), lc.head())

Checking saved sample files...

Metadata shape: (600, 12)
Lightcurve shape: (114598, 6)

Class counts:
target
90    300
42    300
Name: count, dtype: int64

Example entries:


Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,73799799,64.511719,-38.300922,241.101834,-45.720753,0,0.3025,0.3156,0.0446,41.0827,0.022,90
1,215282,349.429535,-62.508568,320.039643,-51.393745,1,0.2733,0.2727,1.2432,40.7176,0.02,90
2,92999561,346.992188,-13.094776,57.913773,-62.04333,0,0.2192,0.2299,0.0223,40.2955,0.029,90
3,19866,359.814819,-44.399834,330.775011,-69.801007,1,0.2608,0.2877,0.0235,40.8505,0.009,90
4,68637164,296.191406,-17.89609,22.323758,-19.597744,0,0.3221,0.3435,0.008,41.2962,0.1,90


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,2300,59770.3662,2,3.291617,0.627422,1
1,2300,59770.374,1,-1.090546,0.609956,0
2,2300,59770.3817,3,18.823622,1.074484,1
3,2300,59770.3928,4,18.912529,1.824963,1
4,2300,59770.4039,5,14.538424,4.636243,0


In [1]:
import os
import numpy as np
import pandas as pd

# ============================================================================
# CONFIGURATION - INCREASED DATASET SIZE
# ============================================================================
# Self-contained regeneration of the sample files: loads the metadata, draws a
# class-balanced SNIa/SNII sample, extracts the matching light curves and
# writes both tables to DATA_DIR. It writes to the same two file names as the
# 300-per-class cells above, so running it replaces those files.

N_PER_CLASS = 1000  # raised from the 300 used in the first pass above
SEED = 42           # single seed shared by every random draw in this cell

# Focus on SNIa (90) vs SNII (42)
SELECTED_CLASSES = [90, 42]

# Status glyphs, written as escapes so they cannot be corrupted by a bad
# encoding round trip of the notebook file.
CHECK_MARK = "\u2713"       # check mark
CHART_ICON = "\U0001F4CA"   # bar-chart emoji

DATA_DIR = os.path.join("..", "data", "plasticc")
META_PATH = os.path.join(DATA_DIR, "training_set_metadata.csv")
LC_PATH = os.path.join(DATA_DIR, "training_set.csv")

print(
    f"Target: {N_PER_CLASS} samples per class = "
    f"{N_PER_CLASS * len(SELECTED_CLASSES)} total"
)

# ============================================================================
# LOAD FULL METADATA
# ============================================================================

metadata = pd.read_csv(META_PATH)
print(f"Total objects in metadata: {len(metadata)}")

# .copy() detaches the subset from the full metadata table.
binary_metadata = metadata[metadata["target"].isin(SELECTED_CLASSES)].copy()

print("\nAvailable samples:")
print(f"  SNIa (90): {(binary_metadata['target'] == 90).sum()}")
print(f"  SNII (42): {(binary_metadata['target'] == 42).sum()}")

# ============================================================================
# SAMPLE N_PER_CLASS OF EACH CLASS
# ============================================================================

# The draws below are pinned by random_state; the global seed is still set so
# any later code relying on NumPy's global RNG sees the same state as before.
np.random.seed(SEED)

snia_all = binary_metadata[binary_metadata["target"] == 90]
snii_all = binary_metadata[binary_metadata["target"] == 42]

# Take N_PER_CLASS of each (or every available row if a class has fewer)
n_snia = min(N_PER_CLASS, len(snia_all))
n_snii = min(N_PER_CLASS, len(snii_all))

snia_sample = snia_all.sample(n_snia, random_state=SEED)
snii_sample = snii_all.sample(n_snii, random_state=SEED)

sample_metadata = pd.concat([snia_sample, snii_sample], ignore_index=True)

print(f"\n{CHECK_MARK} Sampled dataset:")
print(f"  Total: {len(sample_metadata)}")
print(f"  SNIa: {(sample_metadata['target'] == 90).sum()}")
print(f"  SNII: {(sample_metadata['target'] == 42).sum()}")

# ============================================================================
# SAVE SAMPLE METADATA
# ============================================================================

SAMPLE_META_PATH = os.path.join(DATA_DIR, "sample_metadata.csv")
sample_metadata.to_csv(SAMPLE_META_PATH, index=False)
print(f"\n{CHECK_MARK} Saved: {SAMPLE_META_PATH}")

# ============================================================================
# EXTRACT LIGHTCURVES FOR SAMPLE
# ============================================================================

sample_ids = sample_metadata["object_id"].values
sample_ids_set = set(sample_ids)

print(f"\n{CHART_ICON} Extracting lightcurves for {len(sample_ids_set)} objects...")

# Stream the light-curve CSV chunk by chunk so the whole file never has to be
# held in memory at once.
chunksize = 100000
collected_chunks = []
rows_read = 0  # running total of rows actually parsed so far

for i, chunk in enumerate(pd.read_csv(LC_PATH, chunksize=chunksize)):
    rows_read += len(chunk)

    # Keep only the observations that belong to sampled objects.
    mask = chunk["object_id"].isin(sample_ids_set)
    sub = chunk[mask]
    if not sub.empty:
        collected_chunks.append(sub)

    # Report every 10th chunk using the running total (the previous
    # `i * chunksize` was always one chunk behind).
    if i % 10 == 0:
        print(f"  Processed {rows_read:,} rows...")

if collected_chunks:
    sample_lightcurves = pd.concat(collected_chunks, ignore_index=True)
else:
    # Nothing matched: keep the source columns so the saved CSV still has a
    # header row and can be read back (a column-less frame writes an empty
    # file that pd.read_csv rejects).
    sample_lightcurves = pd.read_csv(LC_PATH, nrows=0)

print(f"\n{CHECK_MARK} Total lightcurve rows: {len(sample_lightcurves):,}")

# Every extracted row belongs to a sampled id, so the shortfall in distinct
# ids is the number of sampled objects with no observations at all.
n_without_lightcurve = len(sample_ids_set) - sample_lightcurves["object_id"].nunique()
if n_without_lightcurve:
    print(f"  WARNING: {n_without_lightcurve} sampled objects have no lightcurve rows")

# ============================================================================
# SAVE SAMPLE LIGHTCURVES
# ============================================================================

SAMPLE_LC_PATH = os.path.join(DATA_DIR, "sample_lightcurves.csv")
sample_lightcurves.to_csv(SAMPLE_LC_PATH, index=False)
print(f"{CHECK_MARK} Saved: {SAMPLE_LC_PATH}")

print("\n" + "=" * 70)
print("DATASET GENERATION COMPLETE!")
print("=" * 70)
print("Next step: Run feature extraction notebook (02_)")

Target: 1000 samples per class = 2000 total
Total objects in metadata: 7848

Available samples:
  SNIa (90): 2313
  SNII (42): 1193

âœ“ Sampled dataset:
  Total: 2000
  SNIa: 1000
  SNII: 1000

âœ“ Saved: ../data/plasticc/sample_metadata.csv

ðŸ“Š Extracting lightcurves for 2000 objects...
  Processed 0 rows...
  Processed 1,000,000 rows...

âœ“ Total lightcurve rows: 381,810
âœ“ Saved: ../data/plasticc/sample_lightcurves.csv

DATASET GENERATION COMPLETE!
Next step: Run feature extraction notebook (02_)
