In [204]:
# %pip install prince

In [205]:
import pandas as pd

# For MCA which is not used
# import prince

In [206]:
# Load the input table from a path relative to this notebook's directory.
df = pd.read_csv("../data/before_embedding_continued.csv")

In [207]:
# Discard the 'Unnamed: 0' column when the CSV carries one; errors='ignore'
# turns the call into a no-op when the column is absent.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [208]:
# Preview the first rows of the loaded frame (DataFrame.head defaults to 5 rows).
df.head()

Unnamed: 0,transcript_id,transcript_position,7mer,label,set_type,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,...,PostMean_mean,PostMean_median,PostMean_mode,PostMean_std,PostMean_min,PostMean_max,PostMean_p25,PostMean_p75,PostMean_skew,PostMean_kurtosis
0,ENST00000000233,244,AAGACCA,0,Train,0.008264,0.00697,0.00398,0.005399,0.00199,...,80.57027,80.5,78.6,2.529013,73.1,88.3,79.0,82.0,0.120934,0.663203
1,ENST00000000233,261,CAAACTG,0,Train,0.006609,0.00564,0.00498,0.003599,0.00199,...,94.290698,94.1,93.0,2.499807,88.6,103.0,92.8,95.725,0.753166,1.428778
2,ENST00000000233,316,GAAACAG,0,Train,0.00757,0.00631,0.00498,0.004456,0.00232,...,89.364324,89.2,89.2,1.488369,84.4,96.2,88.4,90.1,0.749744,2.467259
3,ENST00000000233,332,AGAACAT,0,Train,0.01062,0.00902,0.0102,0.006136,0.00232,...,89.154,89.9,90.3,3.503707,81.4,95.7,86.975,91.225,-0.479554,-0.500195
4,ENST00000000233,368,AGGACAA,0,Train,0.010701,0.00896,0.00398,0.007169,0.00199,...,85.178788,85.4,84.9,2.385075,77.6,90.5,83.625,86.9,-0.533942,0.342322


### 1. Perform one-hot encoding on the 7-mer sequence (`7mer` column)

In [209]:
# One-hot code for each nucleotide; the four slots are ordered A, C, G, T.
base_map = {
    "A": [1, 0, 0, 0],
    "C": [0, 1, 0, 0],
    "G": [0, 0, 1, 0],
    "T": [0, 0, 0, 1]
}


def one_hot_flat(seq):
    """Return the flattened one-hot encoding of a nucleotide sequence.

    Each character of ``seq`` is looked up in ``base_map`` and its four
    indicator values are emitted in order, so a sequence of length n yields a
    flat list of 4 * n integers (an empty sequence yields an empty list).

    Raises:
        KeyError: if ``seq`` contains a character that is not a key of
            ``base_map``.
    """
    # A nested comprehension flattens in linear time; ``sum(lists, [])``
    # re-copies the accumulated list on every addition.
    return [bit for base in seq for bit in base_map[base]]

In [210]:
# Encode every value of the "7mer" column; the result is a Series holding one
# flat list per row, indexed like df.
one_hot_vectors = df["7mer"].apply(one_hot_flat)

Drop unnecessary one-hot encoded columns based on the DRACH motif (D = A/G/T, R = A/G, H = A/C/T), which occupies positions 1–5 of each 7-mer.

Logic of what needs to be kept:

pos0: A, C, G, T (all)

pos1: A, G, T

pos2: A, G

pos3: A (can technically be dropped as well, since it is the only option)

pos4: C (can technically be dropped as well, since it is the only option)

pos5: A, C, T

pos6: A, C, G, T (all)

In [211]:
# Create one-hot encoding columns
one_hot_df = pd.DataFrame(
    one_hot_vectors.tolist(),
    columns=[f"pos{i}_{b}" for i in range(len(df["7mer"][0])) for b in ["A","C","G","T"]]
)

one_hot_df = one_hot_df.drop(columns=['pos1_C', 'pos2_C', 'pos2_T', 'pos3_C', 'pos3_G', 'pos3_T','pos4_A', 'pos4_G', 'pos4_T','pos5_G'], axis=1)

# Remove zero columns and consider dropping pos3_A and pos4_C as there can only be one possible value, and all the rows conform to this

one_hot_df = one_hot_df.drop(columns=['pos3_A', 'pos4_C'], axis=1).loc[:, (one_hot_df != 0).any(axis=0)]

# Merge with original
df_expanded = pd.concat([df, one_hot_df], axis=1)

In [212]:
# Preview the merged frame: original columns followed by the kept one-hot columns.
df_expanded.head()

Unnamed: 0,transcript_id,transcript_position,7mer,label,set_type,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,...,pos1_T,pos2_A,pos2_G,pos5_A,pos5_C,pos5_T,pos6_A,pos6_C,pos6_G,pos6_T
0,ENST00000000233,244,AAGACCA,0,Train,0.008264,0.00697,0.00398,0.005399,0.00199,...,0,0,1,0,1,0,1,0,0,0
1,ENST00000000233,261,CAAACTG,0,Train,0.006609,0.00564,0.00498,0.003599,0.00199,...,0,1,0,0,0,1,0,0,1,0
2,ENST00000000233,316,GAAACAG,0,Train,0.00757,0.00631,0.00498,0.004456,0.00232,...,0,1,0,1,0,0,0,0,1,0
3,ENST00000000233,332,AGAACAT,0,Train,0.01062,0.00902,0.0102,0.006136,0.00232,...,0,1,0,1,0,0,0,0,0,1
4,ENST00000000233,368,AGGACAA,0,Train,0.010701,0.00896,0.00398,0.007169,0.00199,...,0,0,1,1,0,0,1,0,0,0


In [213]:
# Print every column name of the merged frame, one per line.
print(*df_expanded.columns, sep="\n")

transcript_id
transcript_position
7mer
label
set_type
PreTime_mean
PreTime_median
PreTime_mode
PreTime_std
PreTime_min
PreTime_max
PreTime_p25
PreTime_p75
PreTime_skew
PreTime_kurtosis
PreSD_mean
PreSD_median
PreSD_mode
PreSD_std
PreSD_min
PreSD_max
PreSD_p25
PreSD_p75
PreSD_skew
PreSD_kurtosis
PreMean_mean
PreMean_median
PreMean_mode
PreMean_std
PreMean_min
PreMean_max
PreMean_p25
PreMean_p75
PreMean_skew
PreMean_kurtosis
InTime_mean
InTime_median
InTime_mode
InTime_std
InTime_min
InTime_max
InTime_p25
InTime_p75
InTime_skew
InTime_kurtosis
InSD_mean
InSD_median
InSD_mode
InSD_std
InSD_min
InSD_max
InSD_p25
InSD_p75
InSD_skew
InSD_kurtosis
InMean_mean
InMean_median
InMean_mode
InMean_std
InMean_min
InMean_max
InMean_p25
InMean_p75
InMean_skew
InMean_kurtosis
PostTime_mean
PostTime_median
PostTime_mode
PostTime_std
PostTime_min
PostTime_max
PostTime_p25
PostTime_p75
PostTime_skew
PostTime_kurtosis
PostSD_mean
PostSD_median
PostSD_mode
PostSD_std
PostSD_min
PostSD_max
PostSD_p25
PostSD_

Add 5-mer DRACH pattern: GGACT

Add positive 3-mer patterns: GGA, AGG, TGG

Add negative 3-mer patterns: TTA, GTA, TAA



In [214]:
# Each feature below is a 0/1 indicator column named "has_<motif>".
# regex=False makes str.contains treat the motif as a literal substring
# rather than compiling it as a regular expression.

# 1. Check if the **middle 5 letters** contain 'GGACT'
# Middle 5 letters of a 7-mer are positions 1 to 5 (0-indexed → [1:6])
df_expanded['has_GGACT'] = (
    df_expanded['7mer'].str[1:6].str.contains('GGACT', regex=False).astype(int)
)

# 2. Check if 'GGA', 'AGG', or 'TGG' are anywhere in the 7mer sequence
# 3. Check if 'TTA', 'GTA', or 'TAA' are anywhere in the 7mer sequence
# One loop replaces six copy-pasted assignments; the tuple order fixes the
# order in which the new columns are appended.
for motif in ('GGA', 'AGG', 'TGG', 'TTA', 'GTA', 'TAA'):
    df_expanded[f'has_{motif}'] = (
        df_expanded['7mer'].str.contains(motif, regex=False).astype(int)
    )

In [203]:
# Preview the frame with the has_* motif indicator columns appended.
# NOTE(review): this cell's saved execution count (203) is lower than the
# feature cell's (214), so the stored output predates that run — re-run
# top to bottom to refresh it.
df_expanded.head()

Unnamed: 0,transcript_id,transcript_position,7mer,label,set_type,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,...,pos6_C,pos6_G,pos6_T,has_GGACT,has_GGA,has_AGG,has_TGG,has_TTA,has_GTA,has_TAA
0,ENST00000000233,244,AAGACCA,0,Train,0.008264,0.00697,0.00398,0.005399,0.00199,...,0,0,0,0,0,0,0,0,0,0
1,ENST00000000233,261,CAAACTG,0,Train,0.006609,0.00564,0.00498,0.003599,0.00199,...,0,1,0,0,0,0,0,0,0,0
2,ENST00000000233,316,GAAACAG,0,Train,0.00757,0.00631,0.00498,0.004456,0.00232,...,0,1,0,0,0,0,0,0,0,0
3,ENST00000000233,332,AGAACAT,0,Train,0.01062,0.00902,0.0102,0.006136,0.00232,...,0,0,1,0,0,0,0,0,0,0
4,ENST00000000233,368,AGGACAA,0,Train,0.010701,0.00896,0.00398,0.007169,0.00199,...,0,0,0,0,1,1,0,0,0,0


In [215]:
# Show the column index of the merged frame, including the has_* features.
df_expanded.columns

Index(['transcript_id', 'transcript_position', '7mer', 'label', 'set_type',
       'PreTime_mean', 'PreTime_median', 'PreTime_mode', 'PreTime_std',
       'PreTime_min',
       ...
       'pos6_C', 'pos6_G', 'pos6_T', 'has_GGACT', 'has_GGA', 'has_AGG',
       'has_TGG', 'has_TTA', 'has_GTA', 'has_TAA'],
      dtype='object', length=118)

In [216]:
# Show which one-hot columns remain after the DRACH-based and all-zero drops.
one_hot_df.columns

Index(['pos0_A', 'pos0_C', 'pos0_G', 'pos0_T', 'pos1_A', 'pos1_G', 'pos1_T',
       'pos2_A', 'pos2_G', 'pos5_A', 'pos5_C', 'pos5_T', 'pos6_A', 'pos6_C',
       'pos6_G', 'pos6_T'],
      dtype='object')

In [None]:
# Export full dataset with all one-hot-encoded columns

# df_expanded.to_csv('../data/dataset0_all_seq.csv')

## Perform MCA on SEQ one-hot encoded columns (not used)

In [None]:
# # Remove columns that are not required due to DRACH and which are zero columns

# one_hot_df_cleaned = one_hot_df.drop(columns=['pos1_C', 'pos2_C', 'pos2_T', 'pos3_C', 'pos3_G', 'pos3_T','pos4_A', 'pos4_G', 'pos4_T','pos5_G'], axis=1).loc[:, (one_hot_df != 0).any(axis=0)]

# # Consider dropping pos3_A and pos4_C as there can only be one possible value, and all the rows conform to this

# one_hot_df_cleaned = one_hot_df_cleaned.drop(columns=['pos3_A', 'pos4_C'], axis=1).loc[:, (one_hot_df != 0).any(axis=0)]

In [None]:
# mca = prince.MCA(
#     n_components=16,
#     n_iter=10, # controls max number of iterations of SVD allowed
#     copy=True,
#     check_input=True,
#     engine='sklearn',
#     random_state=42,
#     one_hot=False
# )

In [None]:
# one_hot_mca = mca.fit(one_hot_df_cleaned)

In [None]:
# one_hot_df_cleaned.columns

Index(['pos0_A', 'pos0_C', 'pos0_G', 'pos0_T', 'pos1_A', 'pos1_G', 'pos1_T',
       'pos2_A', 'pos2_G', 'pos5_A', 'pos5_C', 'pos5_T', 'pos6_A', 'pos6_C',
       'pos6_G', 'pos6_T'],
      dtype='object')

In [None]:
# df_mca = one_hot_mca.transform(one_hot_df_cleaned)

In [None]:
# df_mca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.420941,-0.320798,-0.879412,0.093348,-0.141512,-0.056389,-0.551046,0.498671,-0.290484,-0.279216,-0.416974,0.183995,-0.634409,0.597436,0.366676
1,-0.142215,1.243669,-0.081412,-0.276204,-0.120959,-0.176572,0.474024,0.402844,-0.486998,0.364221,0.149256,0.183995,-0.634409,0.597436,0.366676
2,-0.628382,0.752708,-0.364812,0.160608,0.644947,0.438556,0.156574,-0.39263,0.012817,0.189489,-0.374444,0.183995,-0.634409,0.597436,0.366676
3,-0.686695,-0.337457,-0.174406,0.103896,-0.548998,-0.154801,-0.370468,-0.687121,0.133164,0.323889,0.417547,0.183995,-0.634409,0.597436,0.366676
4,-0.203526,-0.47673,-0.321733,0.678899,0.125176,-0.368318,-0.753457,0.059573,-0.101502,-0.308624,0.47912,0.183995,-0.634409,0.597436,0.366676


In [None]:
# one_hot_mca.eigenvalues_summary

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.259,11.77%,11.77%
1,0.243,11.04%,22.81%
2,0.239,10.84%,33.65%
3,0.216,9.83%,43.49%
4,0.211,9.58%,53.07%
5,0.195,8.86%,61.93%
6,0.194,8.82%,70.75%
7,0.181,8.22%,78.97%
8,0.166,7.56%,86.53%
9,0.152,6.89%,93.43%


From the summary above, only components 0–10 explain any variance; components 11–14 take the same value in every row shown, so they are dropped in the next cell.

In [None]:
# df_mca = df_mca.drop(columns=[11, 12, 13, 14])

# df_mca_final = pd.concat([df.reset_index(drop=True), df_mca.add_prefix('MCA_')], axis=1)

In [None]:
# df_mca_final.head()

Unnamed: 0,ID,POS,SEQ,label,set_type,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,...,MCA_1,MCA_2,MCA_3,MCA_4,MCA_5,MCA_6,MCA_7,MCA_8,MCA_9,MCA_10
0,ENST00000000233,244,AAGACCA,0,Train,0.008264,0.00697,0.00398,0.005399,0.00199,...,-0.320798,-0.879412,0.093348,-0.141512,-0.056389,-0.551046,0.498671,-0.290484,-0.279216,-0.416974
1,ENST00000000233,261,CAAACTG,0,Train,0.006609,0.00564,0.00498,0.003599,0.00199,...,1.243669,-0.081412,-0.276204,-0.120959,-0.176572,0.474024,0.402844,-0.486998,0.364221,0.149256
2,ENST00000000233,316,GAAACAG,0,Train,0.00757,0.00631,0.00498,0.004456,0.00232,...,0.752708,-0.364812,0.160608,0.644947,0.438556,0.156574,-0.39263,0.012817,0.189489,-0.374444
3,ENST00000000233,332,AGAACAT,0,Train,0.01062,0.00902,0.0102,0.006136,0.00232,...,-0.337457,-0.174406,0.103896,-0.548998,-0.154801,-0.370468,-0.687121,0.133164,0.323889,0.417547
4,ENST00000000233,368,AGGACAA,0,Train,0.010701,0.00896,0.00398,0.007169,0.00199,...,-0.47673,-0.321733,0.678899,0.125176,-0.368318,-0.753457,0.059573,-0.101502,-0.308624,0.47912


In [None]:
# df_mca_final.columns

Index(['ID', 'POS', 'SEQ', 'label', 'set_type', 'PreTime_mean',
       'PreTime_median', 'PreTime_mode', 'PreTime_std', 'PreTime_min',
       'PreTime_max', 'PreTime_p25', 'PreTime_p75', 'PreSD_mean',
       'PreSD_median', 'PreSD_mode', 'PreSD_std', 'PreSD_min', 'PreSD_max',
       'PreSD_p25', 'PreSD_p75', 'PreMean_mean', 'PreMean_median',
       'PreMean_mode', 'PreMean_std', 'PreMean_min', 'PreMean_max',
       'PreMean_p25', 'PreMean_p75', 'InTime_mean', 'InTime_median',
       'InTime_mode', 'InTime_std', 'InTime_min', 'InTime_max', 'InTime_p25',
       'InTime_p75', 'InSD_mean', 'InSD_median', 'InSD_mode', 'InSD_std',
       'InSD_min', 'InSD_max', 'InSD_p25', 'InSD_p75', 'InMean_mean',
       'InMean_median', 'InMean_mode', 'InMean_std', 'InMean_min',
       'InMean_max', 'InMean_p25', 'InMean_p75', 'PostTime_mean',
       'PostTime_median', 'PostTime_mode', 'PostTime_std', 'PostTime_min',
       'PostTime_max', 'PostTime_p25', 'PostTime_p75', 'PostSD_mean',
       'PostSD_medi

In [None]:
# Export dataset with MCA columns > 0 explained variance

# df_mca_final.to_csv('../data/dataset0_mca.csv')