In [7]:
import pandas as pd
import numpy as np
from scipy import stats

In [8]:
# Load the pre-embedding table. The file path/file name here should be the
# output from Data processing Part 1.
df = pd.read_parquet("../data/before_embedding.parquet")

# Remove the 5-mer and read-count columns; rebinding `df` (instead of an
# in-place drop) yields the same frame.
df = df.drop(columns=["Pre_5mer", "In_5mer", "Post_5mer", "n_reads"])

In [9]:
# Preview the first five rows of the frame after the column drop.
df.head(n=5)

Unnamed: 0,transcript_id,transcript_position,7mer,PreTime,PreSD,PreMean,InTime,InSD,InMean,PostTime,PostSD,PostMean,gene_id,label,set_type
0,ENST00000000233,244,AAGACCA,0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1,ENSG00000004059,0,Train
1,ENST00000000233,244,AAGACCA,0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9,ENSG00000004059,0,Train
2,ENST00000000233,244,AAGACCA,0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6,ENSG00000004059,0,Train
3,ENST00000000233,244,AAGACCA,0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4,ENSG00000004059,0,Train
4,ENST00000000233,244,AAGACCA,0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2,ENSG00000004059,0,Train


## Keep the code above as it is, other than changing the file path/file name. Use the output of Data Processing Part 1 to ensure that the split used by all models is the same.

---

Add skew, kurtosis
3mer, 5mer combinations from mirabelle

In [10]:
# Numeric feature columns to summarise per group. These are the float-valued
# Time/SD/Mean columns for the Pre, In and Post positions (they are not
# one-hot encoded). Order: PreTime, PreSD, PreMean, InTime, ..., PostMean.
numeric_cols = [
    f"{position}{measure}"
    for position in ("Pre", "In", "Post")
    for measure in ("Time", "SD", "Mean")
]

# Custom mode aggregator: pandas' Series.mode returns a Series of all modes,
# so it is reduced to a single scalar here.
def mode_func(x):
    """Return the most frequent non-NaN value in ``x``.

    NaN values are ignored, matching the NaN handling of the skew/kurtosis
    helpers (``nan_policy='omit'``) and of pandas' built-in aggregations.
    When several values tie for most frequent, the smallest one is returned
    (``Series.mode`` returns its modes sorted ascending).

    Parameters
    ----------
    x : array-like or pandas.Series
        Values of one group.

    Returns
    -------
    scalar
        The mode, or ``np.nan`` if ``x`` is empty or contains only NaN.
    """
    modes = pd.Series(x).mode(dropna=True)
    # An empty result means there was no non-NaN value to count.
    return modes.iloc[0] if len(modes) > 0 else np.nan
# Rename so the aggregated column is labelled "<col>_mode".
mode_func.__name__ = "mode"

# Quantile helpers; each gets a short __name__ so the flattened column names
# read "<col>_p25" / "<col>_p75".
def q25(x):
    """Return the 0.25 quantile of ``x`` using its ``quantile`` method."""
    lower_quartile = x.quantile(0.25)
    return lower_quartile
q25.__name__ = "p25"

def q75(x):
    """Return the 0.75 quantile of ``x`` using its ``quantile`` method."""
    upper_quartile = x.quantile(0.75)
    return upper_quartile
q75.__name__ = "p75"

# Skewness / kurtosis helpers, also renamed for tidy output column names.
def skew_func(x):
    """Return the sample skewness of ``x`` with NaN values omitted.

    Delegates to ``scipy.stats.skew`` with its other arguments left at their
    defaults.
    """
    skewness = stats.skew(x, nan_policy="omit")
    return skewness
skew_func.__name__ = "skew"

def kurtosis_func(x):
    """Return the kurtosis of ``x`` with NaN values omitted.

    Delegates to ``scipy.stats.kurtosis`` with its other arguments left at
    their defaults.
    """
    kurt = stats.kurtosis(x, nan_policy="omit")
    return kurt
kurtosis_func.__name__ = "kurtosis"

# Collapse the rows of each (transcript, position, 7-mer, label, split) group
# into one row of summary statistics per numeric column.
# NOTE(review): groupby drops rows whose key is NaN by default — confirm that
# none of the key columns (e.g. "label") contain missing values.
group_keys = ["transcript_id", "transcript_position", "7mer" , "label", "set_type"]

# Built-in aggregations are given by name; the custom helpers are passed as
# callables and labelled by their overridden __name__.
summary_stats = ['mean', 'median', mode_func, 'std', 'min', 'max', q25, q75, skew_func, kurtosis_func]

# Every numeric column receives the same list of statistics.
agg_spec = {feature: list(summary_stats) for feature in numeric_cols}

df_agg = df.groupby(group_keys, as_index=False).agg(agg_spec)

# Flatten the two-level column labels produced by the aggregation:
# grouping keys keep their plain name, every other column becomes
# "<numeric column>_<stat name>".
new_columns = [
    label[0]
    if label[0] in ("transcript_id", "transcript_position", "7mer", "label", "set_type")
    else f"{label[0]}_{label[1]}"
    for label in df_agg.columns
]

df_agg.columns = new_columns

In [11]:
# Preview the first five aggregated rows.
df_agg.head(n=5)

Unnamed: 0,transcript_id,transcript_position,7mer,label,set_type,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,...,PostMean_mean,PostMean_median,PostMean_mode,PostMean_std,PostMean_min,PostMean_max,PostMean_p25,PostMean_p75,PostMean_skew,PostMean_kurtosis
0,ENST00000000233,244,AAGACCA,0,Train,0.008264,0.00697,0.00398,0.005399,0.00199,...,80.57027,80.5,78.6,2.529013,73.1,88.3,79.0,82.0,0.120934,0.663203
1,ENST00000000233,261,CAAACTG,0,Train,0.006609,0.00564,0.00498,0.003599,0.00199,...,94.290698,94.1,93.0,2.499807,88.6,103.0,92.8,95.725,0.753166,1.428778
2,ENST00000000233,316,GAAACAG,0,Train,0.00757,0.00631,0.00498,0.004456,0.00232,...,89.364324,89.2,89.2,1.488369,84.4,96.2,88.4,90.1,0.749744,2.467259
3,ENST00000000233,332,AGAACAT,0,Train,0.01062,0.00902,0.0102,0.006136,0.00232,...,89.154,89.9,90.3,3.503707,81.4,95.7,86.975,91.225,-0.479554,-0.500195
4,ENST00000000233,368,AGGACAA,0,Train,0.010701,0.00896,0.00398,0.007169,0.00199,...,85.178788,85.4,84.9,2.385075,77.6,90.5,83.625,86.9,-0.533942,0.342322


In [12]:
# Write the aggregated features to CSV without the row index.
df_agg.to_csv(
    path_or_buf='../data/before_embedding_continued.csv',
    index=False,
)