In [16]:
import pandas as pd
import numpy as np

In [17]:
df = pd.read_csv('../data/output_w_label.csv')
df = df.drop(columns='Unnamed: 0')

In [18]:
df.head()

Unnamed: 0,ID,POS,SEQ,PreTime,PreSD,PreMean,InTime,InSD,InMean,PostTime,PostSD,PostMean,gene_id,label
0,ENST00000000233,244,AAGACCA,0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1,ENSG00000004059,0
1,ENST00000000233,244,AAGACCA,0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9,ENSG00000004059,0
2,ENST00000000233,244,AAGACCA,0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6,ENSG00000004059,0
3,ENST00000000233,244,AAGACCA,0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4,ENSG00000004059,0
4,ENST00000000233,244,AAGACCA,0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2,ENSG00000004059,0


In [19]:
# SEQ one hot encoding

base_map = {
    "A": [1, 0, 0, 0],
    "C": [0, 1, 0, 0],
    "G": [0, 0, 1, 0],
    "T": [0, 0, 0, 1],
    "U": [0, 0, 0, 1]   # treat U same as T
}

# Expand into one-hot encoded row
def one_hot_flat(seq):
    return sum([base_map[b] for b in seq], [])  # flatten list of lists

In [20]:
one_hot_vectors = df["SEQ"].apply(one_hot_flat)

In [21]:
# Create DataFrame with 28 columns
one_hot_df = pd.DataFrame(
    one_hot_vectors.tolist(),
    columns=[f"pos{i+1}_{b}" for i in range(len(df["SEQ"][0])) for b in ["A","C","G","T"]]
)

# Merge with original
df_expanded = pd.concat([df, one_hot_df], axis=1)

In [None]:
# Logic:
# pos0: A, C, G, T (all)
# pos1: A, G, T
# pos2: A, G
# pos3: A
# pos4: C
# pos5: A, C, T
# pos6: A, C, G, T (all)

df_expanded = df_expanded.drop(columns=['pos1_C', 'pos2_C', 'pos2_T', 'pos3_C', 'pos3_G', 'pos3_T','pos4_A', 'pos4_G', 'pos4_T','pos5_G'], axis=1)

In [25]:
df_expanded.head()

Unnamed: 0,ID,POS,SEQ,PreTime,PreSD,PreMean,InTime,InSD,InMean,PostTime,...,pos5_C,pos5_T,pos6_A,pos6_C,pos6_G,pos6_T,pos7_A,pos7_C,pos7_G,pos7_T
0,ENST00000000233,244,AAGACCA,0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,...,1,0,0,1,0,0,1,0,0,0
1,ENST00000000233,244,AAGACCA,0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,...,1,0,0,1,0,0,1,0,0,0
2,ENST00000000233,244,AAGACCA,0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,...,1,0,0,1,0,0,1,0,0,0
3,ENST00000000233,244,AAGACCA,0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,...,1,0,0,1,0,0,1,0,0,0
4,ENST00000000233,244,AAGACCA,0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,...,1,0,0,1,0,0,1,0,0,0


In [26]:
df_agg = (
    df_expanded
    .groupby(["gene_id", "ID", "POS"], as_index=False)
    .mean(numeric_only=True)
)

In [27]:
df_agg.head()

Unnamed: 0,gene_id,ID,POS,PreTime,PreSD,PreMean,InTime,InSD,InMean,PostTime,...,pos5_C,pos5_T,pos6_A,pos6_C,pos6_G,pos6_T,pos7_A,pos7_C,pos7_G,pos7_T
0,ENSG00000000003,ENST00000373020,512,0.007247,2.3595,86.795,0.011177,2.4965,97.965,0.0085,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,ENSG00000000003,ENST00000373020,689,0.009868,1.972857,102.495238,0.007332,2.551905,97.928571,0.006917,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,ENSG00000000003,ENST00000373020,823,0.007456,2.078095,86.904762,0.006193,1.912857,93.680952,0.007296,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,ENSG00000000003,ENST00000373020,830,0.007765,2.834,86.765,0.007522,3.1925,93.075,0.005923,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,ENSG00000000003,ENST00000373020,849,0.006785,2.926667,86.92381,0.007727,2.782381,92.504762,0.008596,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [33]:
df_agg.columns

Index(['gene_id', 'ID', 'POS', 'PreTime', 'PreSD', 'PreMean', 'InTime', 'InSD',
       'InMean', 'PostTime', 'PostSD', 'PostMean', 'label', 'pos1_A', 'pos1_G',
       'pos1_T', 'pos2_A', 'pos2_G', 'pos3_A', 'pos4_C', 'pos5_A', 'pos5_C',
       'pos5_T', 'pos6_A', 'pos6_C', 'pos6_G', 'pos6_T', 'pos7_A', 'pos7_C',
       'pos7_G', 'pos7_T'],
      dtype='object')

In [32]:
df_agg.to_csv('../data/output_site_level.csv')