In [11]:
import pandas as pd
import numpy as np
from scipy import stats

In [12]:
df = pd.read_csv("../data/dataset2.csv")

In [13]:
# SEQ one hot encoding

base_map = {
    "A": [1, 0, 0, 0],
    "C": [0, 1, 0, 0],
    "G": [0, 0, 1, 0],
    "T": [0, 0, 0, 1],
    "U": [0, 0, 0, 1]   # treat U same as T
}

# Expand into one-hot encoded row
def one_hot_flat(seq):
    return sum([base_map[b] for b in seq], [])  # flatten list of lists

In [14]:
one_hot_vectors = df["SEQ"].apply(one_hot_flat)

In [15]:
# Create DataFrame with 28 columns
one_hot_df = pd.DataFrame(
    one_hot_vectors.tolist(),
    columns=[f"pos{i+1}_{b}" for i in range(len(df["SEQ"][0])) for b in ["A","C","G","T"]]
)

# Merge with original
df_expanded = pd.concat([df, one_hot_df], axis=1)

In [16]:
# df_expanded.head()

In [17]:
# Logic:
# pos0: A, C, G, T (all)
# pos1: A, G, T
# pos2: A, G
# pos3: A
# pos4: C
# pos5: A, C, T
# pos6: A, C, G, T (all)

df_expanded = df_expanded.drop(columns=['pos1_C', 'pos2_C', 'pos2_T', 'pos3_C', 'pos3_G', 'pos3_T','pos4_A', 'pos4_G', 'pos4_T','pos5_G'], axis=1)

In [18]:
df_expanded.head()

Unnamed: 0,ID,POS,SEQ,PreTime,PreSD,PreMean,InTime,InSD,InMean,PostTime,...,pos5_C,pos5_T,pos6_A,pos6_C,pos6_G,pos6_T,pos7_A,pos7_C,pos7_G,pos7_T
0,tx_id_0,0,AAAACCT,0.0122,3.99,106.0,0.00337,4.56,102.0,0.00664,...,1,0,0,1,0,0,0,0,0,1
1,tx_id_0,0,AAAACCT,0.0302,2.32,107.0,0.00443,2.36,102.0,0.00332,...,1,0,0,1,0,0,0,0,0,1
2,tx_id_0,0,AAAACCT,0.00232,5.55,110.0,0.00664,7.04,99.3,0.00232,...,1,0,0,1,0,0,0,0,0,1
3,tx_id_0,0,AAAACCT,0.00465,2.1,104.0,0.00996,3.9,108.0,0.00401,...,1,0,0,1,0,0,0,0,0,1
4,tx_id_0,0,AAAACCT,0.0211,3.49,103.0,0.00531,3.8,101.0,0.00997,...,1,0,0,1,0,0,0,0,0,1


In [19]:
# Define your numeric columns (all one-hot encoded ones)
numeric_cols = ['PreTime', 'PreSD', 'PreMean', 'InTime', 'InSD', 'InMean', 'PostTime', 'PostSD', 'PostMean']

# Custom function for mode (since it returns a Series)
def mode_func(x):
    return stats.mode(x, keepdims=True)[0][0] if len(x) > 0 else np.nan

# Quantile functions with named attributes (for clean column names)
def q25(x):
    return x.quantile(0.25)
q25.__name__ = "p25"

def q75(x):
    return x.quantile(0.75)
q75.__name__ = "p75"

# Perform grouped aggregation
df_agg = (
    df_expanded
    .groupby(["ID", "POS"], as_index=False)
    .agg({col: ['mean', 'median', mode_func, 'std', 'min', 'max', q25, q75] 
          for col in numeric_cols})
)

# Flatten the MultiIndex column names
new_columns = []
for col_tuple in df_agg.columns:
    # Keep base columns (grouping keys) unchanged
    if col_tuple[0] in ["ID", "POS"]:
        new_columns.append(col_tuple[0])
    else:
        # Combine numeric column + stat name
        stat_name = col_tuple[1]
        # Clean up _mode_func → _mode
        if stat_name == "mode_func":
            stat_name = "mode"
        new_columns.append(f"{col_tuple[0]}_{stat_name}")

df_agg.columns = new_columns

In [20]:
df_agg.to_csv('../data/output_site_level_dataset2.csv')

In [21]:
df_agg['ID'].unique()

array(['tx_id_0', 'tx_id_1', 'tx_id_2', 'tx_id_3', 'tx_id_4', 'tx_id_5',
       'tx_id_6'], dtype=object)