## 0. Imports & Setup

This section sets up the environment:

- Imports all necessary packages (standard libraries, pandas, NumPy, etc.).
- Ensures reproducibility by seeding `numpy`.

In [1]:
# =========================
# 0a. Imports & Setup
# =========================
import os
import numpy as np
import pandas as pd

# One seed for the whole notebook; it is also passed as random_state to the split step.
RNG = 42
np.random.seed(seed=RNG)  # seeds NumPy's global (legacy) random generator

In [2]:
# =========================
# 🔧 0b. Configuration
# =========================

# File paths
input_file_path = "Raw File/dataset0.csv"
output_parquet_path = "Dataset/before_embedding.parquet"

# Create the output folder from the output path itself so the two cannot
# drift apart when the path is edited. dirname is '' for a bare file name,
# and os.makedirs('') raises, hence the guard.
output_dir = os.path.dirname(output_parquet_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Raw CSV column name -> column name used in the rest of the notebook
rename_columns = {
    'ID': 'transcript_id',
    'POS': 'transcript_position',
    'SEQ': '7mer'
}

# Target fraction of rows for each split
split_ratios = {
    'Train': 0.8,
    'Val': 0.1,
    'Test': 0.1
}
# Fail fast on a mistyped ratio instead of silently producing skewed splits.
assert abs(sum(split_ratios.values()) - 1.0) < 1e-9, "split_ratios must sum to 1"

## 1. Load and Prepare Dataset

- Loads the raw dataset from a CSV file.
- Drops unnecessary or redundant columns.
- Renames specific columns for clarity and consistency:
  - `'ID'` → `'transcript_id'`
  - `'POS'` → `'transcript_position'`
  - `'SEQ'` → `'7mer'`
- Computes the number of reads per `(transcript_id, transcript_position)` pair.


In [3]:
# =========================
# 1. Load and prepare the dataset
# =========================
print("Starting: Load and Prepare Dataset")

# Read the raw CSV, discard the leftover index column when present
# (errors='ignore' turns the drop into a no-op otherwise), then apply
# the notebook-wide column names.
columns_to_drop = ['Unnamed: 0']
reads_df = (
    pd.read_csv(input_file_path)
    .drop(columns=columns_to_drop, errors='ignore')
    .rename(columns=rename_columns)
)

# n_reads = number of rows sharing the same (transcript_id, transcript_position)
reads_df['n_reads'] = (
    reads_df
    .groupby(['transcript_id', 'transcript_position'])['7mer']
    .transform('size')
)

print("âœ… Dataset loaded and prepared.")

Starting: Load and Prepare Dataset
âœ… Dataset loaded and prepared.


## 2. Preprocessing: Extract 5-mers from 7-mer

This step processes each 7-mer sequence into three overlapping 5-mers:

- **Pre-5mer**: bases 0–4  
- **In-5mer**: bases 1–5  
- **Post-5mer**: bases 2–6

These 5-mers will be used later for feature embedding or modeling.


In [4]:

# =========================
# 2. Preprocessing: split each 7-mer into 5-mers
# =========================
print("Starting: 5-mer extraction")

def extract_5mers(seq):
    """
    Return the three overlapping 5-base windows of a 7-mer as a tuple.

    The input is sliced at offsets 0, 1 and 2, i.e. the result is
    (seq[0:5], seq[1:6], seq[2:7]) = (pre, in, post).
    No length check is performed, so a shorter input yields shorter windows.
    """
    window = 5
    return tuple(seq[start:start + window] for start in range(3))

# Apply extraction (vectorized).
# The previous apply(lambda x: pd.Series(...)) built one pandas Series per row,
# which is extremely slow on a frame with millions of reads. extract_5mers only
# slices its argument, and the .str accessor supports the same slice syntax on
# the whole column at once, so passing the accessor yields three full columns.
# Note: missing 7-mers now propagate as NaN instead of raising a TypeError.
reads_df['Pre_5mer'], reads_df['In_5mer'], reads_df['Post_5mer'] = extract_5mers(
    reads_df['7mer'].str
)

print("âœ… 5-mer columns added.")

Starting: 5-mer extraction
âœ… 5-mer columns added.


## 3. Assign Train / Val / Test Splits (By Gene)

- Splits the dataset into **Train**, **Validation**, and **Test** sets **by gene**, not randomly by rows.
- Ensures all rows from the same gene appear in only one of the three sets.
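- Fills the splits with a greedy bin-filling strategy that balances **row counts** only: genes are shuffled and each one goes to the split furthest below its row target. Label balance is not enforced, so check the printed label distribution after each run.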
- Target split ratios:
  - Train: 80%
  - Validation: 10%
  - Test: 10%
- Prints the number of rows per set and label distribution (% of label 0 and 1).
- Saves the result to a `.parquet` file for later use.


In [5]:
# =========================
# 3. Assign split bins
# =========================
print("Starting: 3. Assign split bins")

def assign_set_type_by_gene(reads_df, split_ratios={'Train': 0.8, 'Val': 0.1, 'Test': 0.1}, random_state=42):
    """
    Tag every row of reads_df with a 'set_type' so that each gene lands in exactly one split.

    Genes are shuffled (seeded by random_state) and handed out one at a time to
    whichever split is currently furthest below its target row count
    (total rows * ratio). Ties go to the split listed first in split_ratios.

    Labels are used only to count rows per gene (rows labelled 0 or 1). They do
    not steer the assignment, so label balance across splits is whatever the
    shuffle happens to produce; it is not enforced.

    Parameters
    ----------
    reads_df : pandas.DataFrame
        Must contain 'gene_id' and 'label' columns. Modified in place: a
        'set_type' column is added (or overwritten).
    split_ratios : dict
        Split name -> target fraction of rows. One bin is created per key,
        so any set of split names is accepted. The default dict is never mutated.
    random_state : int
        Seed for the gene shuffle.

    Returns
    -------
    pandas.DataFrame
        The same reads_df object with 'set_type' filled in. Rows whose gene_id
        is missing stay NaN, because groupby drops NaN keys by default.

    Raises
    ------
    KeyError
        If 'gene_id' or 'label' is missing from reads_df.
    """
    missing = [col for col in ('gene_id', 'label') if col not in reads_df.columns]
    if missing:
        raise KeyError(f"reads_df is missing required column(s): {missing}")

    # Step 1: per-gene row counts for each label
    gene_stats = (
        reads_df
        .groupby('gene_id')['label']
        .value_counts()
        .unstack(fill_value=0)
        .rename(columns={0: 'label_0', 1: 'label_1'})
        .reset_index()
    )
    # A dataset containing a single class has no column for the other label;
    # add it as zeros so the total below does not raise KeyError.
    for label_col in ('label_0', 'label_1'):
        if label_col not in gene_stats.columns:
            gene_stats[label_col] = 0
    gene_stats['total'] = gene_stats['label_0'] + gene_stats['label_1']

    # Shuffle genes so the greedy fill below does not depend on gene_id order
    gene_stats = gene_stats.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Step 2: target row count per split
    total_rows = gene_stats['total'].sum()
    target_rows = {name: total_rows * ratio for name, ratio in split_ratios.items()}

    # Step 3: greedy fill. Only the running row total per split matters for the choice.
    assigned_rows = {name: 0 for name in split_ratios}
    gene_to_set = {}
    for gene in gene_stats[['gene_id', 'total']].itertuples(index=False):
        # Split with the largest remaining gap to its target; max() returns the
        # first maximum, so ties resolve in split_ratios order.
        chosen = max(assigned_rows, key=lambda name: target_rows[name] - assigned_rows[name])
        gene_to_set[gene.gene_id] = chosen
        assigned_rows[chosen] += gene.total

    # Step 4: map gene_id -> set_type on the caller's frame
    reads_df['set_type'] = reads_df['gene_id'].map(gene_to_set)

    return reads_df


# Tag every row with its split; all rows of a gene share the same split.
reads_df = assign_set_type_by_gene(reads_df, split_ratios=split_ratios, random_state=RNG)

# Row totals per split, in value_counts order (largest first)
set_counts = reads_df['set_type'].value_counts()
print("ðŸ“Š Number of rows in each set:")
for split_name, n_rows in set_counts.items():
    print(f"  - {split_name}: {n_rows} rows")

# Share of each label within every split (rows = split, columns = label value)
label_dist = (
    reads_df
    .groupby('set_type')['label']
    .value_counts(normalize=True)
    .unstack()
)
print("\nðŸ“ˆ Label distribution:")
for split_name, shares in label_dist.iterrows():
    pct_0 = shares.get(0, 0) * 100
    pct_1 = shares.get(1, 0) * 100
    print(f"  - {split_name}: 0 = {pct_0:.2f}%, 1 = {pct_1:.2f}%")

print("Ending: 3. Assign split bins")

Starting: 3. Assign split bins
ðŸ“Š Number of rows in each set:
  - Train: 8820055 rows
  - Val: 1105069 rows
  - Test: 1101982 rows

ðŸ“ˆ Label distribution:
  - Test: 0 = 94.08%, 1 = 5.92%
  - Train: 0 = 95.59%, 1 = 4.41%
  - Val: 0 = 95.90%, 1 = 4.10%
Ending: 3. Assign split bins


In [6]:
# =========================
# 4. Save Preprocessed Dataset
# =========================

# Write the prepared frame to parquet; index=False leaves the row index out of the file.
reads_df.to_parquet(path=output_parquet_path, index=False)
print(f"âœ… Saved processed data to: {output_parquet_path}")

âœ… Saved processed data to: Dataset/before_embedding.parquet
