## Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import normaltest
from sklearn.preprocessing import StandardScaler
from custom import HMM, KNN
from statsmodels.stats.multitest import multipletests

## Import

In [2]:
df = pd.read_csv("output_w_label.csv").drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,ID,POS,SEQ,PreTime,PreSD,PreMean,InTime,InSD,InMean,PostTime,PostSD,PostMean,gene_id,label
0,ENST00000000233,244,AAGACCA,0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1,ENSG00000004059,0
1,ENST00000000233,244,AAGACCA,0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9,ENSG00000004059,0
2,ENST00000000233,244,AAGACCA,0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6,ENSG00000004059,0
3,ENST00000000233,244,AAGACCA,0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4,ENSG00000004059,0
4,ENST00000000233,244,AAGACCA,0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2,ENSG00000004059,0


## Feature

In [3]:
cols = ['PreTime', 'InTime', 'PostTime']  # columns to transform
df.loc[:, cols] = np.cbrt(df[cols])  # inplace modification

## Segmentation

Yes, the split is not really accurate - but it's close enough

In [4]:
def split(x):

    unique_x =x[['gene_id', 'label']].drop_duplicates()
    X_train, X_non_train, y_train, y_non_train = train_test_split(
        unique_x['gene_id'], unique_x['label'], test_size=0.2, stratify=unique_x['label'], random_state=42
    )

    X_test, X_val , y_test, y_val = train_test_split(
        X_non_train, y_non_train, test_size=0.5, stratify=y_non_train, random_state=42
    )

    return X_train, X_val, X_test

gene_train, gene_val, gene_test = split(df) 
    

In [5]:
train = df[df['gene_id'].isin(gene_train)].copy()
val = df[df['gene_id'].isin(gene_val)].copy()
test = df[df['gene_id'].isin(gene_test)].copy()
train.head()

Unnamed: 0,ID,POS,SEQ,PreTime,PreSD,PreMean,InTime,InSD,InMean,PostTime,PostSD,PostMean,gene_id,label
0,ENST00000000233,244,AAGACCA,0.144065,2.06,125.0,0.26061,10.4,122.0,0.210294,10.9,84.1,ENSG00000004059,0
1,ENST00000000233,244,AAGACCA,0.184789,2.53,125.0,0.203601,4.67,126.0,0.217577,6.3,80.9,ENSG00000004059,0
2,ENST00000000233,244,AAGACCA,0.166911,3.92,109.0,0.238697,12.0,124.0,0.170769,2.13,79.6,ENSG00000004059,0
3,ENST00000000233,244,AAGACCA,0.158475,2.06,125.0,0.202469,5.01,130.0,0.170769,3.78,80.4,ENSG00000004059,0
4,ENST00000000233,244,AAGACCA,0.187956,2.92,120.0,0.138557,3.94,129.0,0.235133,7.15,82.2,ENSG00000004059,0


### Check stratification

In [6]:
# Sample dataframe
all_segments = [train, val, test, df]

def get_transcripts(df):
    return df.drop_duplicates(subset=['ID', 'POS']).shape[0]

def get_positive_transcripts(df):
    return df[df['label'] == 1].drop_duplicates(subset=['ID', 'POS']).shape[0]
    


tab = pd.DataFrame({
    'Segment Set': ['Train', 'Validation', 'Testing','Total'],
    'Total Transcripts': map(get_transcripts, all_segments),
    'Pos Transcripts': map(get_positive_transcripts, all_segments),
    'Proportion (Positive/Total)': map(lambda x: x[0]/x[1], list(zip(map(get_positive_transcripts, all_segments),map(get_transcripts, all_segments)))),
    
})

# Style with nice header
styled_tab = (tab.style
    .set_caption('Train Validation Test Split')
    .hide()
    .set_table_styles([
        {'selector': 'thead th', 
         'props': [('background-color', '#405D7A'),
                   ('color', 'white'),
                   ('font-weight', 'bold'),
                   ('text-align', 'center')]},
        {'selector': 'tbody td', 
         'props': [('text-align', 'center')]},
        {'selector': 'tbody tr:nth-child(even)', 
         'props': [('background-color', '#F5F5F5')]}
    ])
    .format({
        'Total Transcripts': '{:,.0f}',  # Thousand separators, no decimals
        'Pos Transcripts': '{:,.0f}',  # Thousand separators, no decimals
        'Proportion (Positive/Total)': '{:.2%}'  # Percentage format with 2 decimals
    })
)

styled_tab

Segment Set,Total Transcripts,Pos Transcripts,Proportion (Positive/Total)
Train,108534,5248,4.84%
Validation,16002,1186,7.41%
Testing,16774,1106,6.59%
Total,121838,5475,4.49%


### Remove some data

1. Remove last character of sequence
2. Replace T with A in the 6th character

To reduce number of models to fit

In [7]:
train_segmented = {
    newseq: g
    for newseq, g in train.assign(
        newSEQ = train['SEQ'].str[:5] + train['SEQ'].str.get(5).replace({'C': 'A'})
    ).groupby('newSEQ')
}

val_segmented = {
    newseq: g
    for newseq, g in test.assign(
        newSEQ = val['SEQ'].str[:5] + val['SEQ'].str.get(5).replace({'C': 'A'})
    ).groupby('newSEQ')
}

test_segmented = {
    newseq: g
    for newseq, g in test.assign(
        newSEQ = test['SEQ'].str[:5] + test['SEQ'].str.get(5).replace({'C': 'A'})
    ).groupby('newSEQ')
}

In [8]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# --- Example setup ---
# df = your DataFrame with columns: ['SEQ', 'x1', 'x2', ..., 'x12']
def get_closest_neighbours(df, SEQ, n_neighbors = 5):
    # Separate identifier and features
    seqs = df[SEQ]
    X = df.drop(columns=[SEQ])
    
    # --- Step 1: Standardize ---
    X_scaled = StandardScaler().fit_transform(X)
    
    # --- Step 2: PCA to 3 components ---
    pca = PCA(n_components=3)
    X_pca = pca.fit_transform(X_scaled)
    
    # --- Step 3: Nearest Neighbors (find 5 closest per point) ---
    # n_neighbors=6 includes the point itself, so we'll exclude it later
    nn = NearestNeighbors(n_neighbors=n_neighbors+1, metric='euclidean')
    nn.fit(X_pca)
    
    distances, indices = nn.kneighbors(X_pca)
    
    # --- Step 4: Build a mapping of each SEQ to its nearest 4–5 neighbors ---
    neighbors_dict = {}
    for i, seq in enumerate(seqs):
        neighbor_indices = indices[i][1:]  # skip the first one (itself)
        neighbors_dict[seq] = seqs.iloc[neighbor_indices].tolist()
    
    return neighbors_dict

def summarize_group(group):
    numeric_cols = group.select_dtypes(include=[np.number]).columns
    numeric_cols = numeric_cols.difference(['POS', 'label'])
    
    summary = {}
    for col in numeric_cols:
        values = group[col].dropna()
        mean = values.mean()
        sd = values.std(ddof=1)

        summary[f"{col}_mean"] = mean
        summary[f"{col}_sd"] = sd

    return pd.Series(summary)

summaries = []
for newseq, g in train_segmented.items():
    s = summarize_group(g)
    s.name = newseq
    summaries.append(s)

# --- Step 4: Combine into final summary DataFrame ---
summary_df = pd.DataFrame(summaries).reset_index().rename(columns={'index': 'newSEQ'})

closest_neighbours = get_closest_neighbours(summary_df, 'newSEQ', n_neighbors = 1)

In [140]:
def dataframe_to_HMM_input(df):
    scaler = StandardScaler()
    
    x_var = scaler.fit_transform(df[['PreTime', 'PreSD', 'PreMean', 
               'InTime', 'InSD', 'InMean', 
               'PostTime', 'PostSD', 'PostMean']])
    
    x_var = x_var.reshape(-1,3,3)
    '''
    y_var = np.column_stack([
        np.zeros(len(df), dtype=int),
        df['label'].to_numpy() + 1,
        np.full(len(df), 3, dtype=int)
    ])'''
    y_var = df[['label']]

    return x_var, y_var

def dataframe_to_HMM_input_scaled(df, scale):

    x_var = scaler.transform(df[['PreTime', 'PreSD', 'PreMean', 
               'InTime', 'InSD', 'InMean', 
               'PostTime', 'PostSD', 'PostMean']])
    
    x_var = x_var.reshape(-1,3,3)
    '''
    y_var = np.column_stack([
        np.zeros(len(df), dtype=int),
        df['label'].to_numpy() + 1,
        np.full(len(df), 3, dtype=int)
    ])'''
    y_var = df[['label']]

    return x_var, y_var

def dataframe_to_KNN_input(df):
    x_var = df[['PreTime', 'PreSD', 'PreMean', 
               'InTime', 'InSD', 'InMean', 
               'PostTime', 'PostSD', 'PostMean']]

    y_var = df[['label']]

    return x_var, y_var

def attach_positive(old_df, new_df):

    return pd.concat([old_df, new_df[new_df['label']==1]])
    


In [9]:
# Check attach_positive valid

print(train_segmented[list(train_segmented.keys())[0]].shape)
print(train_segmented[list(train_segmented.keys())[1]].shape)
print((lambda x: x[x['label']==1].shape) (train_segmented[list(train_segmented.keys())[1]]) )
print(attach_positive(train_segmented[list(train_segmented.keys())[0]],
                     train_segmented[list(train_segmented.keys())[1]]).shape)



(466451, 15)
(248595, 15)
(8280, 15)
(474731, 15)


### Actual fitting

Time to run: 30 minutes

In [142]:
y_actual = np.array([])
y_actual_seg = []
y_pred = np.array([])
y_pred_seg = []
i= 1

for SEQ, df_spec in train_segmented.items():
    print(i, "out of", len(train_segmented))

    '''
    if sum(df_spec['label'])/len(df_spec['label']) < 0.0001:
        for newSEQ in closest_neighbours[SEQ]:
            df_spec = attach_positive(df_spec, train_segmented[newSEQ])
    
    if sum(df_spec['label']) >0:
        df_spec = oversample_with_noise(df_spec, label_col='label',
                                        minority_label=1,
                                        target=0.2,
                                        target_type='ratio',
                                        noise_scale=0.02,
                                        random_state=0,
                                        use_smote=False)  # disable SMOTE if you prefer pure noise approach
    '''
    X_train, y_train = dataframe_to_HMM_input(df_spec)
    
    X_test, y_test = dataframe_to_HMM_input(val_segmented[SEQ])
    i = i + 1
    try:
        model, post_middle = HMM(X_train, X_test, y_train)
    
        y_pred = np.append(y_pred, post_middle)
        y_actual = np.append(y_actual, y_test)
    
        y_pred_seg.append(post_middle.copy())
        y_actual_seg.append(y_test.copy())
        
    except:
        print("Using KNN")
        X_train, y_train, scale = dataframe_to_KNN_input(df_spec)
        X_test, y_test = dataframe_to_KNN_input_scaled(val_segmented[SEQ], scale)
        model, post_middle = KNN(X_train, y_train, X_test)
    
        y_pred = np.append(y_pred, post_middle)
        y_actual = np.append(y_actual, y_test)
    
        y_pred_seg.append(post_middle.copy())
        y_actual_seg.append(y_test.copy())

        

1 out of 48
Best matching state: 2, flipped: False
2 out of 48
Best matching state: 2, flipped: True
3 out of 48
Best matching state: 2, flipped: False
4 out of 48
Best matching state: 2, flipped: True
5 out of 48
Best matching state: 2, flipped: False
6 out of 48
Best matching state: 2, flipped: True
7 out of 48
Best matching state: 2, flipped: False
8 out of 48
Best matching state: 2, flipped: True
9 out of 48
Best matching state: 2, flipped: False
10 out of 48
Best matching state: 2, flipped: False
11 out of 48
Best matching state: 2, flipped: False
12 out of 48
Best matching state: 2, flipped: False
13 out of 48
Best matching state: 2, flipped: True
14 out of 48
Best matching state: 2, flipped: False
15 out of 48
Best matching state: 2, flipped: True
16 out of 48
Best matching state: 2, flipped: True
17 out of 48
Best matching state: 2, flipped: False
18 out of 48
Best matching state: 2, flipped: True
19 out of 48
Best matching state: 2, flipped: True
20 out of 48
Best matching sta

In [131]:
def HMM(X_train, X_test, y_train, n_repeats=1):

    def build_model(p_to_node2):
        """Helper to construct and fit a DenseHMM with a given transition prob."""
        dists = [Normal() for _ in range(4)]
        edges = np.zeros((4, 4), dtype=float)

        # Transition probabilities
        edges[0, 1] = p_to_node2
        edges[0, 2] = 1.0 - p_to_node2
        edges[1, 3] = 1.0
        edges[2, 3] = 1.0
        edges[3, :] = 0.0

        starts = [1.0, 0.0, 0.0, 0.0]
        ends   = [0.0, 0.0, 0.0, 1.0]

        models = []
        for _ in range(n_repeats):
            model = DenseHMM(dists, edges=edges.tolist(),
                             starts=starts, ends=ends,
                             verbose=False, max_iter=50)
            model._initialize(X_train)
            model.fit(X_train)
            models.append(model)
        return models

    def get_posteriors(models, X):
        """Predict posterior probabilities, filtering out any NaN results."""
        posterior_list = []
        for i, m in enumerate(models):
            try:
                probs = m.predict_proba(X)
                if not np.isnan(probs).any():
                    posterior_list.append(probs)
                else:
                    print(f"[Model {i}] skipped due to NaN values.")
            except Exception as e:
                print(f"[Model {i}] failed: {e}")
        return posterior_list

    # --- First attempt ---
    p_to_node2 = 0.3
    models = build_model(p_to_node2)
    posterior_list = get_posteriors(models, X_test)

    # --- Retry if failed ---
    if len(posterior_list) == 0:
        print("Retrying with modified transition probability...")
        p_to_node2 = 0.6
        models = build_model(p_to_node2)
        posterior_list = get_posteriors(models, X_test)

    # --- If still failed, raise custom error ---
    if len(posterior_list) == 0:
        raise RuntimeError("Both attempts failed (NaNs in predictions).")

    # --- Determine correct state assignment using training data ---
    ref_model = models[0]
    train_probs = ref_model.predict_proba(X_train)  # (n_train, seq_len, n_states)
    train_middle = train_probs[:, 1, :]             # take middle timestep probabilities

    # Compare each state's probability vs y_train and (1 - y_train)
    best_state = 2
    best_distance = float("inf")

    p_state = train_middle[:, 2]

    d1 = np.sum((p_state.detach().numpy().flatten() - y_train.to_numpy().flatten() )**2)
    d2 = np.sum(((1 - p_state.detach().numpy().flatten()) - y_train.to_numpy().flatten() )**2)

    flipped = d2 < d1  # whether to flip probabilities

    print(f"Best matching state: {best_state}, flipped: {flipped}")

    # --- Aggregate results ---
    posterior_array = np.stack(posterior_list, axis=0)
    posterior_mean = np.nanmean(posterior_array, axis=0)
    post_middle = posterior_mean[:, 1, :]

    # Ensure output probabilities correspond to correct positive state
    prob_pos = post_middle[:, best_state]
    if flipped:
        prob_pos = 1 - prob_pos

    return models, prob_pos

In [67]:
len(y_pred_seg[0])

3092

In [146]:
for i in range(len(y_actual_seg)):
    seq_list = list(train_segmented.keys())
    print(seq_list[i], 
          np.round(np.mean((y_actual_seg[i].to_numpy()-y_pred_seg[i])**2),decimals=3 ),
          np.round(np.sum(y_actual_seg[i].to_numpy())/y_actual_seg[i].shape[0],decimals=3),
          np.round(np.sum(train_segmented[seq_list[i]]['label'].to_numpy())/train_segmented[seq_list[i]]['label'].to_numpy().shape[0],decimals=3))

AAAACA 0.327 0.041 0.004
AAAACT 0.347 0.0 0.033
AAGACA 0.271 0.0 0.014
AAGACT 0.307 0.133 0.114
AGAACA 0.113 0.01 0.024
AGAACT 0.258 0.2 0.135
AGGACA 0.257 0.172 0.074
AGGACT 0.278 0.118 0.247
ATAACA 0.016 0.0 0.0
ATAACT 0.097 0.0 0.02
ATGACA 0.159 0.0 0.007
ATGACT 0.319 0.0 0.052
CAAACA 0.336 0.0 0.003
CAAACT 0.166 0.0 0.031
CAGACA 0.272 0.036 0.011
CAGACT 0.37 0.072 0.079
CGAACA 0.059 0.0 0.021
CGAACT 0.075 0.0 0.087
CGGACA 0.372 0.348 0.056
CGGACT 0.372 0.403 0.325
CTAACA 0.134 0.0 0.003
CTAACT 0.049 0.0 0.008
CTGACA 0.307 0.103 0.019
CTGACT 0.341 0.123 0.081
GAAACA 0.286 0.0 0.01
GAAACT 0.305 0.163 0.032
GAGACA 0.156 0.0 0.016
GAGACT 0.281 0.198 0.125
GGAACA 0.073 0.0 0.024
GGAACT 0.274 0.228 0.122
GGGACA 0.229 0.04 0.083
GGGACT 0.297 0.0 0.243
GTAACA 0.048 0.0 0.002
GTAACT 0.097 0.0 0.018
GTGACA 0.242 0.047 0.031
GTGACT 0.289 0.188 0.084
TAAACA 0.062 0.0 0.003
TAAACT 0.109 0.0 0.031
TAGACA 0.169 0.0 0.023
TAGACT 0.237 0.0 0.056
TGAACA 0.116 0.049 0.032
TGAACT 0.223 0.157 0.118
TGG

In [106]:
y_actual.shape

(87350,)

In [143]:
mask = ~np.isnan(y_pred)
filter_y_actual = y_actual[mask]
filter_y_pred = y_pred[mask]

In [27]:
import numpy as np
import pandas as pd
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE

def oversample_with_noise(
    df,
    label_col,
    minority_label=1,
    target=None,
    target_type='ratio',   # 'ratio' or 'absolute'
    noise_scale=0.02,      # proportion of feature std for noise
    random_state=42,
    use_smote=True,
    smote_k_neighbors=5
):
    """
    Oversample df so the minority class reaches the requested target.

    - df: pandas DataFrame (X and y together)
    - label_col: name of label column in df
    - minority_label: value representing minority class (e.g. 1)
    - target: if target_type=='ratio', target is fraction (minority / majority)
              if target_type=='absolute', target is absolute minority count
              Example: target=0.5 means minority will be 50% of majority.
    - noise_scale: fraction of per-feature std used as Gaussian noise
    - use_smote: try SMOTE when possible, otherwise fallback to RandomOverSampler+noise
    - smote_k_neighbors: k used by SMOTE if invoked
    Returns: resampled DataFrame (index reset)
    """
    if target is None:
        raise ValueError("Provide target (ratio or absolute count).")

    df = df.copy()
    if label_col not in df.columns:
        raise KeyError(f"label_col {label_col} not in DataFrame")

    y_counts = Counter(df[label_col])
    if minority_label not in y_counts:
        raise ValueError(f"Minority label {minority_label} not found in label column. Can't oversample from nothing.")

    majority_label = max(y_counts, key=lambda k: y_counts[k])
    minority_count = y_counts[minority_label]
    majority_count = y_counts[majority_label]

    # compute desired minority count
    if target_type == 'ratio':
        if not (0 < target):
            raise ValueError("For ratio, target must be > 0 (e.g. 0.5 means minority = 50% of majority).")
        desired_minority = int(np.round(target * majority_count))
    elif target_type == 'absolute':
        if int(target) <= 0:
            raise ValueError("Absolute target must be positive.")
        desired_minority = int(target)
    else:
        raise ValueError("target_type must be 'ratio' or 'absolute'.")

    # do nothing if already enough
    if minority_count >= desired_minority:
        return df.reset_index(drop=True)

    # prepare X, y
    X = df.drop(columns=[label_col])
    y = df[label_col]

    samples_needed = desired_minority - minority_count

    # Try SMOTE when allowed and feasible
    if use_smote and minority_count >= (smote_k_neighbors + 1):
        sampling_strategy = {minority_label: desired_minority}
        sm = SMOTE(sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=smote_k_neighbors)
        X_res, y_res = sm.fit_resample(X, y)
        df_res = pd.concat([X_res, y_res], axis=1)
        return df_res.reset_index(drop=True)

    # Otherwise use RandomOverSampler to reach the desired minority count, then add Gaussian noise
    sampling_strategy = {minority_label: desired_minority}
    ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    X_res, y_res = ros.fit_resample(X, y)

    # Add noise only to numeric columns
    numeric_cols = X_res.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) == 0:
        # nothing numeric to perturb; return as-is
        df_res = pd.concat([X_res, y_res], axis=1)
        return df_res.reset_index(drop=True)

    # Need to add noise only to the newly created rows for the minority class.
    # RandomOverSampler appends new samples but ordering is not strictly guaranteed across implementations,
    # so we'll locate rows where label == minority and that were not in original indices.
    # Simpler and safe approach: compute how many minority rows were originally present, then perturb that
    # many of the minority rows *beyond the first minority_count* occurrences.
    minority_indices = np.flatnonzero((y_res.values == minority_label))
    # the first minority_count occurrences may correspond to original minority examples, so perturb the tail.
    # But to be safe (when order unknown), we'll pick `samples_needed` indices from the minority_indices,
    # preferring those with duplicated rows compared to original set.
    # We'll find rows in X_res that exactly match any original minority row — mark them as originals.
    original_minority_df = X[y == minority_label].reset_index(drop=True)
    # Build a boolean mask whether a row in X_res matches any original minority row (fast-ish via tuple matching)
    orig_tuples = set(map(tuple, original_minority_df[numeric_cols].fillna(0).values))
    rows_to_perturb = []
    for i, row in enumerate(X_res[numeric_cols].fillna(0).values):
        if y_res.iloc[i] != minority_label:
            continue
        tup = tuple(row)
        if tup not in orig_tuples:
            rows_to_perturb.append(i)
    # if we didn't find enough "new" rows by exact matching (rare), pick from minority_indices tail
    if len(rows_to_perturb) < samples_needed:
        # choose from minority_indices the last samples_needed indexes that are not in rows_to_perturb
        extras = [int(i) for i in minority_indices if int(i) not in rows_to_perturb]
        # take from the end (these are likely the duplicated ones)
        take = extras[-samples_needed:]
        rows_to_perturb = (rows_to_perturb + take)[-samples_needed:]

    rows_to_perturb = rows_to_perturb[:samples_needed]

    # compute stds on original data (avoid influence from synthetic data)
    per_feature_stds = X[numeric_cols].std().replace(0, 1.0)  # avoid zero std
    rng = np.random.RandomState(random_state)
    rows_to_perturb = np.asarray(rows_to_perturb, dtype=int)
    
    # Generate noise for *all* rows at once
    noise_matrix = rng.normal(
        loc=0.0,
        scale=(per_feature_stds * noise_scale).values,
        size=(len(rows_to_perturb), len(numeric_cols))
    )
    
    # Add noise in one assignment (no copy, no SettingWithCopyWarning)
    X_res.loc[X_res.index[rows_to_perturb], numeric_cols] += noise_matrix

    df_res = pd.concat([X_res, y_res], axis=1)
    return df_res.reset_index(drop=True)


oversample_with_noise(
    test_segmented['AAAACT'],
    'label',
    minority_label=1,
    target=0.1,
    target_type='ratio',   # 'ratio' or 'absolute'
    noise_scale=0.02,      # proportion of feature std for noise
    random_state=42,
    use_smote=True,
    smote_k_neighbors=5
)

ValueError: could not convert string to float: 'ENST00000000412'

In [82]:
len(y_actual_seg)

44

In [144]:
print("Total:"+str(sum(map(len,val_segmented.values()))))
print("Pred:"+str(len(filter_y_pred)))

Total:87350
Pred:87350


### Evaluation

In [145]:
from sklearn.metrics import roc_auc_score, average_precision_score



# ROC AUC
roc_auc = roc_auc_score(filter_y_actual, filter_y_pred)

# PR AUC (Average Precision)
pr_auc = average_precision_score(filter_y_actual, filter_y_pred)

print("ROC AUC:", roc_auc)
print("PR AUC:", pr_auc)

ROC AUC: 0.5631527387114106
PR AUC: 0.084635562249819


In [85]:
sum(y_pred>0.5)

24274

In [78]:
y_h = np.array([])
y_h = np.append(y_h, post_middle[:,2])
y_h = np.append(y_h, post_middle[:,2])

len(y_h)

17596

In [27]:
summary = (
    train.groupby("SEQ", as_index=False)
      .agg(
          positive_label=("label", "sum"),
          total=("label", "count")
      )
)

summary["prop"] = summary["positive_label"] / summary["total"]

overall_prop = summary["positive_label"].sum() / summary["total"].sum()

from scipy.stats import binomtest

summary["p_value"] = summary.apply(
    lambda row: binomtest(
        row["positive_label"], 
        row["total"], 
        overall_prop, 
        alternative='two-sided'
    ).pvalue,
    axis=1
)


# Step 4: Multiple testing correction
summary["p_fdr"] = multipletests(summary["p_value"], method="fdr_bh")[1]
summary["p_bonferroni"] = multipletests(summary["p_value"], method="bonferroni")[1]

summary['prop_relative'] = summary.apply(

    lambda row: row['prop']/overall_prop,
    axis = 1
    
)
print("Overall mean proportion =", overall_prop)
summary

Overall mean proportion = 0.04886430397945919


Unnamed: 0,SEQ,positive_label,total,prop,p_value,p_fdr,p_bonferroni,prop_relative
0,AAAACAA,62,115107,0.000539,0.000000e+00,0.000000e+00,0.000000e+00,0.011023
1,AAAACAC,842,44283,0.019014,6.029589e-240,1.039833e-239,1.736522e-237,0.389120
2,AAAACAG,487,54262,0.008975,0.000000e+00,0.000000e+00,0.000000e+00,0.183671
3,AAAACAT,142,72284,0.001964,0.000000e+00,0.000000e+00,0.000000e+00,0.040203
4,AAAACCA,93,64406,0.001444,0.000000e+00,0.000000e+00,0.000000e+00,0.029551
...,...,...,...,...,...,...,...,...
283,TTGACCT,1011,50758,0.019918,1.037660e-255,1.915680e-255,2.988460e-253,0.407619
284,TTGACTA,57,19329,0.002949,0.000000e+00,0.000000e+00,0.000000e+00,0.060350
285,TTGACTC,1676,25420,0.065932,2.687902e-33,3.071888e-33,7.741158e-31,1.349295
286,TTGACTG,4000,33458,0.119553,0.000000e+00,0.000000e+00,0.000000e+00,2.446630


### Filtering away insufficient data

In [89]:
top10 = summary[summary["prop"] >= summary["prop"].quantile(0.95)]
top10['SEQ']

22     AAGACTG
34     AGAACTG
45     AGGACTC
46     AGGACTG
47     AGGACTT
116    CGGACTA
117    CGGACTC
118    CGGACTG
119    CGGACTT
189    GGGACTC
190    GGGACTG
191    GGGACTT
261    TGGACTC
262    TGGACTG
263    TGGACTT
Name: SEQ, dtype: object

In [41]:
summary

Unnamed: 0,SEQ,positive_label,total,prop,p_value,p_fdr,p_bonferroni,prop_relative
0,AAAACAA,62,115107,0.000539,0.000000e+00,0.000000e+00,0.000000e+00,0.011023
1,AAAACAC,842,44283,0.019014,6.029589e-240,1.039833e-239,1.736522e-237,0.389120
2,AAAACAG,487,54262,0.008975,0.000000e+00,0.000000e+00,0.000000e+00,0.183671
3,AAAACAT,142,72284,0.001964,0.000000e+00,0.000000e+00,0.000000e+00,0.040203
4,AAAACCA,93,64406,0.001444,0.000000e+00,0.000000e+00,0.000000e+00,0.029551
...,...,...,...,...,...,...,...,...
283,TTGACCT,1011,50758,0.019918,1.037660e-255,1.915680e-255,2.988460e-253,0.407619
284,TTGACTA,57,19329,0.002949,0.000000e+00,0.000000e+00,0.000000e+00,0.060350
285,TTGACTC,1676,25420,0.065932,2.687902e-33,3.071888e-33,7.741158e-31,1.349295
286,TTGACTG,4000,33458,0.119553,0.000000e+00,0.000000e+00,0.000000e+00,2.446630


In [5]:
X_train = train[['PreTime', 'PreSD', 'PreMean', 
               'InTime', 'InSD', 'InMean', 
               'PostTime', 'PostSD', 'PostMean']].to_numpy().reshape(-1,3,3)

y_train = np.column_stack([
    np.zeros(len(train), dtype=int),
    train['label'].to_numpy() + 1,
    np.full(len(train), 3, dtype=int)
])

print(X_train)
print(y_train)

[[[2.99e-03 2.06e+00 1.25e+02]
  [1.77e-02 1.04e+01 1.22e+02]
  [9.30e-03 1.09e+01 8.41e+01]]

 [[6.31e-03 2.53e+00 1.25e+02]
  [8.44e-03 4.67e+00 1.26e+02]
  [1.03e-02 6.30e+00 8.09e+01]]

 [[4.65e-03 3.92e+00 1.09e+02]
  [1.36e-02 1.20e+01 1.24e+02]
  [4.98e-03 2.13e+00 7.96e+01]]

 ...

 [[7.21e-03 4.58e+00 1.05e+02]
  [3.98e-03 6.58e+00 1.13e+02]
  [3.16e-03 2.28e+00 8.53e+01]]

 [[2.66e-03 2.33e+00 1.09e+02]
  [9.13e-03 1.04e+01 1.08e+02]
  [6.64e-03 4.44e+00 7.68e+01]]

 [[5.64e-03 3.13e+00 1.10e+02]
  [3.03e-03 9.98e+00 1.18e+02]
  [1.93e-02 1.79e+00 7.62e+01]]]
[[0 1 3]
 [0 1 3]
 [0 1 3]
 ...
 [0 1 3]
 [0 1 3]
 [0 1 3]]
