
# Import packages

In [None]:
import sys
import pandas as pd
import numpy as np
import glob
import os
import scipy.stats as st
import random
import matplotlib.pyplot as plt

sys.path.append('../../')
from utils import dataframe_utils

In [None]:
satellite_ttherm_ids = list((pd.read_csv('./selected-annotation-data.tsv', delimiter='\t'))['TTHERM_ID'].values)

In [None]:
len(satellite_ttherm_ids)

# Import the data for each timepoint

In [None]:
# file_paths = sorted(glob.glob('./rna_seq_count_data/kallisto_quant_2024/kallisto_quant_p_trimmed_*'))
# file_paths = sorted(glob.glob('./rna_seq_count_data/kallisto_quant_2024_exon/kallisto_quant_p_trimmed_*'))
file_paths = sorted(glob.glob('./rna_seq_count_data/kallisto_quant_2024_jun28_cds/kallisto_quant_p_trimmed_*'))
file_paths

In [None]:
# Load every sample's kallisto abundance table, keyed by the last two
# underscore-separated tokens of its directory name.
dict_tpms = {}

for fp in file_paths:
    dir_stem, _ext = os.path.splitext(os.path.basename(fp))
    split = dir_stem.split('_')
    key = '_'.join(split[-2:])
    print(key)
    abundance_path = os.path.join(fp, 'abundance.tsv')
    dict_tpms[key] = pd.read_csv(abundance_path, delimiter='\t')

# Distinct values found among the table shapes (quick consistency check).
table_shapes = [df.shape for df in dict_tpms.values()]
print(np.unique(table_shapes))

In [None]:
dict_tpms['000min_A'].head()

In [None]:
def computeRPK(row):
    """Return ``est_counts / eff_length`` for one abundance row.

    When ``eff_length`` is not greater than zero, the row is printed and the
    sentinel string ``'ZERO sequence length'`` is returned instead of a
    number, so callers can filter such rows out afterwards.
    """
    effective_length = row['eff_length']
    if effective_length > 0:
        return row['est_counts'] / effective_length

    # Not a usable length: report the offending row and flag it.
    print('ZERO sequence length:', row['target_id'])
    print(row)
    return 'ZERO sequence length'

def computeTPM(row, normalization_factor):
    """Divide the row's ``rpk`` value by the given normalization factor."""
    rpk = row['rpk']
    return rpk / normalization_factor

def compute_tpm_columns(df_dict):
    """Add manually computed ``rpk`` and ``tpm_manual`` columns to each table.

    For every ``(key, DataFrame)`` pair in ``df_dict``:

    1. ``rpk`` is computed row by row with ``computeRPK``.
    2. Rows that ``computeRPK`` flagged with its ``'ZERO sequence length'``
       sentinel are dropped.
    3. The normalization factor is ``sum(rpk) / 1_000_000``.
    4. ``tpm_manual`` is computed row by row with ``computeTPM``.

    The input tables are left untouched; each result is an independent copy.

    Parameters
    ----------
    df_dict : dict
        Maps a sample key to a DataFrame holding at least the columns read by
        ``computeRPK`` (``eff_length``, ``est_counts``, ``target_id``).

    Returns
    -------
    tuple of (dict, dict)
        ``(tables, normalization_factors)``, both keyed like ``df_dict``.
    """
    df_tpm = {}
    normalization_factor_dict = {}
    for key, df in df_dict.items():
        # Copy first so the caller's table does not silently gain an 'rpk' column.
        tpm_df = df.copy()
        tpm_df['rpk'] = tpm_df.apply(computeRPK, axis=1)

        # Copy after filtering so the column assignment below writes to an
        # independent frame rather than to a slice of the unfiltered one.
        tpm_df = tpm_df.loc[tpm_df['rpk'] != 'ZERO sequence length'].copy()

        normalization_factor = tpm_df['rpk'].sum() / 1_000_000

        tpm_df['tpm_manual'] = tpm_df.apply(computeTPM, axis=1, args=(normalization_factor,))

        df_tpm[key] = tpm_df
        normalization_factor_dict[key] = normalization_factor

    return df_tpm, normalization_factor_dict

def compute_cpm_columns(df_dict):
    """Return copies of the given tables with a ``cpm_manual`` column added.

    ``cpm_manual`` is ``est_counts`` divided by the table's total
    ``est_counts`` and multiplied by 1,000,000.

    The input tables are not modified; each returned DataFrame is an
    independent copy, so the result does not share objects with ``df_dict``.

    Parameters
    ----------
    df_dict : dict
        Maps a sample key to a DataFrame with an ``est_counts`` column.

    Returns
    -------
    dict
        Same keys as ``df_dict``; each value is the copied table including
        ``cpm_manual``.
    """
    df_cpm = {}
    for key, df in df_dict.items():
        # Copy so adding the column does not mutate the caller's table.
        cpm_df = df.copy()

        sum_count = cpm_df['est_counts'].sum()
        cpm_df['cpm_manual'] = (cpm_df['est_counts'] / sum_count) * 1_000_000

        df_cpm[key] = cpm_df

    return df_cpm

In [None]:
dict_tpms, norm_factors_dict = compute_tpm_columns(dict_tpms)

In [None]:
dict_tpms['150min_A'].head()

In [None]:
dict_cpms = compute_cpm_columns(dict_tpms)

In [None]:
dict_cpms['150min_A'].head()

In [None]:
norm_factors_dict

In [None]:
ttherm_id = 'YF00013476.t1'

dict_tpms['000min_A'].loc[dict_tpms['000min_A']['target_id'] == ttherm_id]

In [None]:
dict_tpms['000min_A'].shape

## Check out a single timepoint

In [None]:
# Summary statistics of the 'tpm' column for sample 000min_A, computed once
# over every row and once over the rows whose TPM is non-zero.
tpm_000min_a = dict_tpms['000min_A']['tpm']
is_expressed = tpm_000min_a != 0.0

num_nonzero = int(is_expressed.sum())

mean = tpm_000min_a.mean()
std = tpm_000min_a.std()

str_summary_wzeros = f'''WITH ZEROS...
MEAN: {mean}
MEDIAN: {tpm_000min_a.median()}
STDEV: {std}
CV: {(std/mean)*100}
MAX: {tpm_000min_a.max()}
MIN: {tpm_000min_a.min()}
# GENES EXPRESSED: {num_nonzero}
'''

df_tpm_wozeros = dict_tpms['000min_A'].loc[is_expressed]
tpm_wozeros = df_tpm_wozeros['tpm']

mean_wozeros = tpm_wozeros.mean()
std_wozeros = tpm_wozeros.std()

str_summary_wozeros = f'''WITHOUT ZEROS...
MEAN: {mean_wozeros}
MEDIAN: {tpm_wozeros.median()}
STDEV: {std_wozeros}
CV: {(std_wozeros/mean_wozeros)*100}
MAX: {tpm_wozeros.max()}
MIN: {tpm_wozeros.min()}
# GENES EXPRESSED: {num_nonzero}
'''

print(str_summary_wzeros, '\n', str_summary_wozeros, sep='')


# Create a dataframe containing all data at every timepoint (sorted by TTHERM_ID)

In [None]:
column_prefixes = sorted(list(dict_tpms.keys()))
column_prefixes

In [None]:
# Join every sample's table on target_id into one wide table. Each sample's
# columns are prefixed with the sample key so they stay distinguishable.
complete_data_df = None

for col_prefix in column_prefixes:
    sample_df = dict_tpms[col_prefix].sort_values(by='target_id')
    prefixed_names = {
        col: f'{col_prefix}_{col}'
        for col in sample_df.columns
        if col != 'target_id'
    }
    curr_df = sample_df.rename(columns=prefixed_names)

    if complete_data_df is None:
        # The first sample seeds the accumulated table.
        complete_data_df = curr_df
    else:
        complete_data_df = dataframe_utils.sql_query_df(
            {'complete_data_df': complete_data_df, 'curr_df': curr_df},
            '''SELECT * FROM complete_data_df
                                                       INNER JOIN curr_df USING(target_id)''',
        )

complete_data_df = complete_data_df.rename(columns={'target_id': 'TTHERM_ID'})
complete_data_df


# SATELLITE GENE COUNTS

In [None]:
(complete_data_df[['TTHERM_ID'] + [col for col in complete_data_df if 'counts' in col]].loc[complete_data_df['TTHERM_ID'].isin(satellite_ttherm_ids)])

In [None]:
# Highest estimated read count of each gene across all samples (basis for the
# 10-read cutoff). Uses the vectorised row-wise max instead of calling
# Python's max() once per row.
count_cols = [col for col in complete_data_df if 'counts' in col]
complete_data_df['max_count'] = complete_data_df[count_cols].max(axis=1)

In [None]:
complete_data_df_filtered = complete_data_df.loc[complete_data_df['max_count'] > 10]
complete_data_df_filtered

# SATELLITE GENE COUNTS AFTER FILTERING

In [None]:
(complete_data_df_filtered[['TTHERM_ID'] + [col for col in complete_data_df_filtered if 'counts' in col]].loc[complete_data_df_filtered['TTHERM_ID'].isin(satellite_ttherm_ids)])

In [None]:
# Box plot of the estimated read counts of all genes, one box per
# '*counts*' column of complete_data_df.
df_data = complete_data_df[[col for col in complete_data_df if 'counts' in col]]

plt.figure(figsize=(12, 8))
df_data.boxplot(rot=90)
plt.title('Read Count distribution of all genes for each phase')
plt.xlabel('Phases')
plt.ylabel('Read Count')
plt.tight_layout()
# Clip the y-axis so the bulk of the distribution is visible.
plt.ylim(bottom=-2.5, top=1000)
plt.show()

# Create a dataframe containing all TPM data at every timepoint (sorted by TTHERM_ID)

In [None]:
columns = sorted(list(dict_tpms.keys()))
columns

In [None]:
# Build one wide table of kallisto TPM values: one row per target_id, one
# column per sample. Samples are aligned on target_id (inner join) instead of
# by row position, so a sample whose table lacks or reorders rows cannot
# silently shift values onto the wrong gene.
dict_df_all_tpm = {}

for col in columns:
    curr_df = dict_tpms[col].sort_values(by='target_id')
    print(curr_df.head(3))
    dict_df_all_tpm[col] = curr_df.set_index('target_id')['tpm']

df_all_tpm = (
    pd.concat(dict_df_all_tpm, axis=1, join='inner')
    .sort_index()
    .rename_axis('TTHERM_ID')
    .reset_index()
)

print(df_all_tpm.shape)

df_all_tpm.head()

In [None]:
all_cols = list(df_all_tpm.columns)
all_cols

In [None]:
tpm_cols = [col for col in df_all_tpm.columns[1:] if 'diff' not in col]
tpm_cols

# Create a dataframe containing all CPM data at every timepoint (sorted by TTHERM_ID)

In [None]:
columns = sorted(list(dict_tpms.keys()))
columns

In [None]:
# Build one wide table of the manually computed CPM values: one row per
# target_id, one column per sample. Samples are aligned on target_id (inner
# join) instead of by row position, so a sample whose table lacks or reorders
# rows cannot silently shift values onto the wrong gene.
dict_df_all_cpm = {}

for col in columns:
    curr_df = dict_cpms[col].sort_values(by='target_id')
    print(curr_df.head(3))
    dict_df_all_cpm[col] = curr_df.set_index('target_id')['cpm_manual']

df_all_cpm = (
    pd.concat(dict_df_all_cpm, axis=1, join='inner')
    .sort_index()
    .rename_axis('TTHERM_ID')
    .reset_index()
)

print(df_all_cpm.shape)

df_all_cpm.head()

# Jaccard filtering: https://academic.oup.com/bioinformatics/article/29/17/2146/240530#SEC2.2

In [None]:
cols_a = [col for col in df_all_cpm if 'A' in col]
cols_a

In [None]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity ``|set1 & set2| / |set1 | set2|``.

    Parameters
    ----------
    set1 : set
        First collection of items.
    set2 : set or iterable
        Second collection of items.

    Returns
    -------
    float
        A value in ``[0, 1]``. When both inputs are empty the ratio is
        undefined; ``0.0`` is returned instead of raising
        ``ZeroDivisionError``, so a cutoff at which nothing passes does not
        abort a scan or score as perfect agreement.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0.0

    intersection_size = len(set1.intersection(set2))
    return intersection_size / union_size

In [None]:
np.percentile(df_all_cpm.loc[:, tpm_cols].to_numpy().flatten(), 14)

In [None]:
# Scan candidate cutoffs, taken as percentiles of all values in the CPM table.
# For each cutoff, score how well replicates A and B agree on which genes
# exceed it, as the mean Jaccard similarity over the replicate pairs.
# (The tpm_* names are kept for the later cells; the values are CPM.)
percentiles = np.arange(14, 31, 0.1)

# These do not depend on the percentile, so build them once rather than on
# every iteration of the scan.
all_cpm_values = df_all_cpm.loc[:, tpm_cols].to_numpy().flatten()
replicate_pairs = [(col_a, col_a.replace('A', 'B')) for col_a in cols_a]
gene_ids = df_all_cpm['TTHERM_ID']

tpm_cutoffs = []

jaccard_similarity_means = []

for _percentile in percentiles:

    tpm_cutoff = np.percentile(all_cpm_values, _percentile)

    tpm_cutoffs.append(tpm_cutoff)

    jaccard_similarities = [
        jaccard_similarity(set(gene_ids[df_all_cpm[col_a] > tpm_cutoff].values),
                           set(gene_ids[df_all_cpm[col_b] > tpm_cutoff].values))
        for col_a, col_b in replicate_pairs
    ]

    jaccard_similarity_means.append(np.mean(jaccard_similarities))


In [None]:
plt.scatter(x=tpm_cutoffs, y=jaccard_similarity_means)
plt.xlabel('Maximum CPM Threshold')
plt.ylabel('Jaccard Similarity Mean')
plt.show()

In [None]:
# Position, score and cutoff value of the best-scoring threshold.
best_idx = np.argmax(jaccard_similarity_means)
print(best_idx)
print(jaccard_similarity_means[best_idx])
print(tpm_cutoffs[best_idx])

In [None]:
CPM_THRESHOLD = tpm_cutoffs[np.argmax(jaccard_similarity_means)]

In [None]:
# Highest CPM of each gene across all samples. Uses the vectorised row-wise
# max instead of calling Python's max() once per row.
df_all_cpm['max_cpm'] = df_all_cpm[tpm_cols].max(axis=1)

In [None]:
df_all_cpm_filtered = df_all_cpm.loc[df_all_cpm['max_cpm'] > CPM_THRESHOLD]
df_all_cpm_filtered.shape, df_all_cpm.shape

In [None]:
((df_all_cpm_filtered[['TTHERM_ID'] + tpm_cols].loc[df_all_cpm_filtered['TTHERM_ID'].isin(satellite_ttherm_ids)])).shape

# Translate to TTHERM_ID

In [None]:
df_y_to_ttherm = pd.read_csv('../tgd2024/yf_ttherm_mapping_feb2024.csv')
dict_y_to_ttherm = {yf: ttherm for yf, ttherm in zip(df_y_to_ttherm['yf2024'].values, df_y_to_ttherm['ttherm2021'].values)}

In [None]:
dict_y_to_ttherm

In [None]:
# df_all_tpm['TTHERM_ID'] = [dict_y_to_ttherm[id] if id in dict_y_to_ttherm else id for id in df_all_tpm['TTHERM_ID'].values]
df_all_tpm.sample(10)

In [None]:
target_ttherms = [
'YF00038249.t1',
'YF00038144.t1',
'YF00037875.t1',
'YF00036137.t1',
'YF00035125.t1',
'YF00032971.t1',
'YF00031769.t1',
'YF00030211.t1',
'YF00029957.t1',
'YF00029882.t1',
'YF00029765.t1',
'YF00028141.t1',
'YF00027752.t1',
'YF00027453.t1',
'YF00027225.t1',
'YF00026417.t1',
'YF00025835.t1',
'YF00022485.t1',
'YF00021713.t1',
'YF00019938.t1',
'YF00016416.t1',
'YF00014101.t1',
'YF00012770.t1',
'YF00011364.t1',
'YF00011107.t1',
'YF00010540.t1',
'YF00008765.t1',
'YF00008689.t1',
'YF00008085.t1',
'YF00008072.t1',
'YF00007477.t1',
'YF00005570.t1',
'YF00004316.t1',
'YF00003349.t1',
'YF00002595.t1',
'YF00000081.t1',
]

df_all_tpm.loc[df_all_tpm['TTHERM_ID'].isin(target_ttherms)]

## Check out genes with all zeros

In [None]:
rows_with_all_zeros = df_all_tpm[(df_all_tpm.iloc[:, 1:] == 0.0).all(axis=1)]

print(rows_with_all_zeros.shape)

rows_with_all_zeros.head()

## Check out rows with one nonzero

In [None]:
# Genes that are non-zero in exactly one sample, i.e. zero in all but one of
# the value columns. The expected zero count is derived from the table width
# rather than hard-coded, so it stays correct if the sample count changes.
sample_is_zero = df_all_tpm.iloc[:, 1:] == 0.0
rows_with_one_nonzero = df_all_tpm[sample_is_zero.sum(axis=1) == sample_is_zero.shape[1] - 1]

print(rows_with_one_nonzero.shape)

rows_with_one_nonzero.head()

## Check out the distribution of the number of zeros

In [None]:
# Tally how many genes have exactly k zero-valued samples, for every possible
# k from 0 up to the number of value columns. The per-gene zero count is
# computed once and the range follows the table width.
zeros_per_gene = (df_all_tpm.iloc[:, 1:] == 0.0).sum(axis=1)
zero_count_frequencies = zeros_per_gene.value_counts()
num_value_cols = df_all_tpm.shape[1] - 1

num_zeros_counts = {
    i: int(zero_count_frequencies.get(i, 0))
    for i in range(num_value_cols + 1)
}

for num_zeros, count in num_zeros_counts.items():
    print(f"Number of rows with {num_zeros} zeros: {count}")

## Filter out genes with all zeros

In [None]:
# Keep only the genes that are not all-zero, removing the all-zero rows by
# their index labels. The existing row labels are kept (no index reset).
df_no_all_zeros = df_all_tpm.drop(index=rows_with_all_zeros.index)
print(df_all_tpm.shape[0]-df_no_all_zeros.shape[0])
df_no_all_zeros.head()

In [None]:
# The all-zero genes themselves, as an independent copy with their existing
# row labels. The printed number is the count of genes that are NOT all-zero.
df_all_zeros = rows_with_all_zeros.copy()
print(df_all_tpm.shape[0]-df_all_zeros.shape[0])
df_all_zeros.head()

In [None]:
df_no_all_zeros.shape

## Check out the genes which display the highest expression

In [None]:
df_no_all_zeros.loc[:, df_no_all_zeros.columns[1:]].max(axis=None)

In [None]:
# For every value column, report the maximum and the first gene attaining it.
max_tpm_ttherms = []

for col in df_no_all_zeros.columns[1:]:
    column_values = df_no_all_zeros[col]
    max_tpm = column_values.max()
    attains_max = column_values == max_tpm
    max_tpm_ttherm = df_no_all_zeros.loc[attains_max, 'TTHERM_ID'].values[0]
    print(col, 'MAX:', max_tpm, '|', max_tpm_ttherm)
    max_tpm_ttherms.append(max_tpm_ttherm)
    

In [None]:
np.unique(max_tpm_ttherms)

Identifier	Gene Name	Alias Name(s)	Description
TTHERM_00105330	RPS28	7.m00482, PreTt23033	RPS28 40S ribosomal protein S28e putative; Homolog of yeast RPS28- human RPS28;

Identifier	Gene Name	Alias Name(s)	Description
TTHERM_00682940		103.m00122, 3734.m00034, PreTt16131	60S ribosomal protein L29

In [None]:
df_no_all_zeros_cols = df_no_all_zeros.columns[1:]
df_no_all_zeros_cols

In [None]:
df_no_all_zeros_cols_a = [col for col in df_no_all_zeros_cols if 'A' in col]
df_no_all_zeros_cols_a

## Compute the TPM difference fractions between duplicates

In [None]:
def compute_diff_fraction(row):
    """Return ``min / max`` of the row's values, or 1 when every value is 0."""
    values = row.values
    if not any(value != 0 for value in values):
        # All zeros: the values agree, and min / max would be 0 / 0.
        return 1
    smallest = min(values)
    largest = max(values)
    return smallest / largest

# Add one '<prefix>_diff' column per A/B column pair, holding the min/max
# ratio of the two values in each row.
for col_a in df_no_all_zeros_cols_a:
    col_b = col_a.replace('A', 'B')
    diff_col = f'{col_a.replace("_A", "")}_diff'
    replicate_pair = df_no_all_zeros[[col_a, col_b]]
    df_no_all_zeros[diff_col] = replicate_pair.apply(compute_diff_fraction, axis=1)

In [None]:
df_no_all_zeros.sample(10)

### Sanity check

In [None]:
[(str(num)).zfill(3) for num in np.arange(0, 241, 30)]

In [None]:
# Spot-check the stored *_diff values for ten randomly drawn genes at one
# randomly drawn time point by recomputing min/max from the A and B columns.
random_ttherms = [random.choice(df_no_all_zeros['TTHERM_ID'].values) for _ in range(10)]
time_labels = [f'{((str(num)).zfill(3))}min' for num in np.arange(0, 241, 30)]
random_time = random.choice(time_labels)

print('TIME:', random_time)

for ttherm in random_ttherms:
    is_gene = df_no_all_zeros['TTHERM_ID'] == ttherm
    val_a = df_no_all_zeros.loc[is_gene, f'{random_time}_A'].values[0]
    val_b = df_no_all_zeros.loc[is_gene, f'{random_time}_B'].values[0]
    val_diff = df_no_all_zeros.loc[is_gene, f'{random_time}_diff'].values[0]

    low = min((val_a, val_b))
    high = max((val_a, val_b))
    manual_val_diff = 1 if all(value == 0 for value in (val_a, val_b)) else low/high

    print(f'{ttherm} | {low}/{high} = {val_diff} | {manual_val_diff} | {val_diff == (manual_val_diff)}')


## Compute the differences between TPM and ∆TPM for the two repeated cycles

In [None]:
df_no_all_zeros.head()

In [None]:
# For each listed column pair, add a 'percent_diff_<a>_<b>' column holding
# 100 * |a - b| / ((a + b) / 2), with NaN results replaced by 0.
percent_diff_col_pairs = [['000min_A', '210min_A'], ['000min_B', '210min_B'], ['000min_diff', '210min_diff'], ['030min_A', '240min_A'], ['030min_B', '240min_B'], ['030min_diff', '240min_diff']]
print_cols = []

for col_1, col_2 in percent_diff_col_pairs:
    new_col_name = f'percent_diff_{col_1}_{col_2}'
    first = df_no_all_zeros[col_1]
    second = df_no_all_zeros[col_2]
    absolute_gap = np.absolute(first - second)
    pair_mean = (first + second) / 2
    df_no_all_zeros[new_col_name] = np.nan_to_num(100 * (absolute_gap/pair_mean), nan=0)
    print_cols.append(new_col_name)

df_no_all_zeros.head()

In [None]:
# Mean of every percent-difference column, labels right-aligned.
print_cols_padding = max(len(col) for col in print_cols)

for col in print_cols:
    print(f'{col.rjust(print_cols_padding)}:', df_no_all_zeros[col].mean())

In [None]:
# Median of every percent-difference column, labels right-aligned.
print_cols_padding = max(len(col) for col in print_cols)

for col in print_cols:
    print(f'{col.rjust(print_cols_padding)}:', df_no_all_zeros[col].median())

In [None]:
# Standard deviation of every percent-difference column, labels right-aligned.
print_cols_padding = max(len(col) for col in print_cols)

for col in print_cols:
    print(f'{col.rjust(print_cols_padding)}:', df_no_all_zeros[col].std())

## Take a look at the distribution of differences

In [None]:
diff_cols = [col for col in list(df_no_all_zeros.columns) if 'diff' in col and 'percent_diff' not in col]
diff_cols

In [None]:
diffs_flattened = df_no_all_zeros.loc[:, diff_cols].to_numpy().flatten()
diffs_flattened.shape, diffs_flattened

In [None]:
st.percentileofscore(diffs_flattened, 0, kind='weak')

In [None]:
sorted(diffs_flattened, reverse=True)

In [None]:
# plt.hist(diffs_flattened, bins=10000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='line at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('∆TPM Between Duplicates')
# plt.ylabel('# of ∆TPMs')
# plt.title(f'Distribution of ∆TPM Between Duplicates')

# plt.show()

# plt.hist(diffs_flattened, bins=10000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='line at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('∆TPM Between Duplicates')
# plt.ylabel('# of ∆TPMs')
# plt.title(f'Distribution of ∆TPM Between Duplicates')

# plt.ylim(top=200)

# plt.show()

In [None]:
tpms_flattened = df_no_all_zeros[tpm_cols].to_numpy().flatten()
tpms_flattened.shape, tpms_flattened

## Take a look at the distribution of TPMs

In [None]:
# plt.hist(tpms_flattened, bins=100000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='experimental at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('TPM')
# plt.ylabel('# of TPMs')
# plt.title(f'Distribution of TPMs')

# plt.show()

# plt.hist(tpms_flattened, bins=100000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='experimental at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('TPM')
# plt.ylabel('# of TPMs')
# plt.title(f'Distribution of TPMs')

# plt.ylim(top=50)

# plt.show()

In [None]:
sorted(tpms_flattened)

# Take the geometric mean of each set of timepoint duplicates

In [None]:
def geo_mean_df_of_duplicates(df: pd.DataFrame):
    """Collapse consecutive column pairs into their geometric mean.

    The first column of ``df`` is carried over unchanged. The remaining
    columns are read as non-overlapping consecutive pairs (columns 1-2, 3-4,
    ...); each pair becomes one output column holding ``sqrt(a * b)``. An
    unpaired trailing column is ignored.

    The output column is named after the first column of the pair with its
    last ``_``-separated token removed (e.g. ``'000min_A'`` -> ``'000min'``).

    Parameters
    ----------
    df : pd.DataFrame
        Identifier column followed by the paired value columns.

    Returns
    -------
    pd.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # Copy so the new columns are written to an independent frame, not a slice.
    return_df = df.iloc[:, 0:1].copy()
    value_cols = list(df.columns[1:])

    # Step through the columns two at a time. Advancing by one would pair the
    # second member of one pair with the first member of the next pair.
    for col_a, col_b in zip(value_cols[0::2], value_cols[1::2]):
        col_name = '_'.join(col_a.split('_')[:-1])
        return_df[col_name] = np.sqrt(df[col_a] * df[col_b])

    return return_df

def ari_mean_df_of_duplicates(df: pd.DataFrame):
    """Collapse consecutive column pairs into their arithmetic mean.

    The first column of ``df`` is carried over unchanged. The remaining
    columns are read as non-overlapping consecutive pairs (columns 1-2, 3-4,
    ...); each pair becomes one output column holding ``(a + b) / 2``. An
    unpaired trailing column is ignored.

    The output column is named after the first column of the pair with its
    last ``_``-separated token removed (e.g. ``'000min_A'`` -> ``'000min'``).

    Parameters
    ----------
    df : pd.DataFrame
        Identifier column followed by the paired value columns.

    Returns
    -------
    pd.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # Copy so the new columns are written to an independent frame, not a slice.
    return_df = df.iloc[:, 0:1].copy()
    value_cols = list(df.columns[1:])

    # Step through the columns two at a time. Advancing by one would pair the
    # second member of one pair with the first member of the next pair.
    for col_a, col_b in zip(value_cols[0::2], value_cols[1::2]):
        col_name = '_'.join(col_a.split('_')[:-1])
        # Parenthesised sum: without it only the second value would be halved.
        return_df[col_name] = (df[col_a] + df[col_b]) / 2

    return return_df

In [None]:
df_all_tpms_averaged_geo = geo_mean_df_of_duplicates(df_no_all_zeros.loc[:, all_cols])
df_all_tpms_averaged_geo

In [None]:
df_all_tpms_averaged_geo_flattened = df_all_tpms_averaged_geo.iloc[:, 1:].to_numpy().flatten()

In [None]:
# plt.hist(df_all_tpms_averaged_geo_flattened, bins=100000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='experimental at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('TPM')
# plt.ylabel('# of TPMs')
# plt.title(f'Distribution of TPMs')

# plt.show()

# plt.hist(df_all_tpms_averaged_geo.iloc[:, 1:].to_numpy().flatten(), bins=100000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='experimental at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('TPM')
# plt.ylabel('# of TPMs')
# plt.title(f'Distribution of TPMs')

# plt.ylim(top=50)

# plt.show()

In [None]:
df_all_tpms_averaged_ari = ari_mean_df_of_duplicates(df_no_all_zeros.loc[:, all_cols])
df_all_tpms_averaged_ari

In [None]:
df_all_tpms_averaged_ari_flattened = df_all_tpms_averaged_ari.iloc[:, 1:].to_numpy().flatten()

In [None]:
# plt.hist(df_all_tpms_averaged_ari_flattened, bins=100000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='experimental at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('TPM')
# plt.ylabel('# of TPMs')
# plt.title(f'Distribution of TPMs')

# plt.show()

# plt.hist(df_all_tpms_averaged_ari.iloc[:, 1:].to_numpy().flatten(), bins=100000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='experimental at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('TPM')
# plt.ylabel('# of TPMs')
# plt.title(f'Distribution of TPMs')

# plt.ylim(top=50)

# plt.show()

# Check out the TPM values at different percentiles

In [None]:
print('% ALL TPMs <= 0: ', st.percentileofscore(tpms_flattened, 0, kind='weak'))
print('% ALL TPMs == 0: ', (1 - (np.count_nonzero(tpms_flattened)/len(tpms_flattened))) * 100)
print('TPM AT 10th PERCENTILE OF ALL TPMs: ', np.percentile(tpms_flattened, 10))
print('TPM AT 10th PERCENTILE OF ALL TPMs W/O ZEROS: ', np.percentile(tpms_flattened[tpms_flattened != 0], 10))

In [None]:
print('% ALL TPMs <= 0: ', st.percentileofscore(df_all_tpms_averaged_geo_flattened, 0, kind='weak'))
print('% ALL TPMs == 0: ', (1 - (np.count_nonzero(df_all_tpms_averaged_geo_flattened)/len(df_all_tpms_averaged_geo_flattened))) * 100)
print('TPM AT 10th PERCENTILE OF ALL TPMs: ', np.percentile(df_all_tpms_averaged_geo_flattened, 10))
print('TPM AT 10th PERCENTILE OF ALL TPMs W/O ZEROS: ', np.percentile(df_all_tpms_averaged_geo_flattened[df_all_tpms_averaged_geo_flattened != 0], 10))

In [None]:
print('% ALL TPMs <= 0: ', st.percentileofscore(df_all_tpms_averaged_ari_flattened, 0, kind='weak'))
print('% ALL TPMs == 0: ', (1 - (np.count_nonzero(df_all_tpms_averaged_ari_flattened)/len(df_all_tpms_averaged_ari_flattened))) * 100)
print('TPM AT 10th PERCENTILE OF ALL TPMs: ', np.percentile(df_all_tpms_averaged_ari_flattened, 10))
print('TPM AT 10th PERCENTILE OF ALL TPMs W/O ZEROS: ', np.percentile(df_all_tpms_averaged_ari_flattened[df_all_tpms_averaged_ari_flattened != 0], 10))

# Graphing exploration for filtering

In [None]:
satellite_ttherm_ids[: 20]

In [None]:
df_tpm_data = df_all_tpm[tpm_cols]

In [None]:
# Box plot of the values in df_tpm_data, one box per column.
plt.figure(figsize=(12, 8))
df_tpm_data.boxplot(rot=90)
plt.title('Expression distribution of all genes for each phase')
plt.xlabel('Phases')
plt.ylabel('Expression values')
plt.tight_layout()
# Clip the y-axis so the bulk of the distribution is visible.
plt.ylim(bottom=-2.5, top=10)
plt.show()


In [None]:
# Two box plots built from the summary statistics that DataFrame.describe()
# reports for each column of df_tpm_data.
# NOTE(review): the first figure plots describe().T and the second plots
# describe() followed by .T, so both figures draw the same table under
# different titles. Confirm which view was intended for each.
gene_expression_distribution = df_tpm_data.describe().T
phase_expression_distribution = df_tpm_data.describe()

# Figure 1
plt.figure(figsize=(10, 6))
gene_expression_distribution.boxplot()
plt.title('Expression distribution of each gene across phases')
plt.xlabel('Phases')
plt.ylabel('Expression values')
plt.show()

# Figure 2
plt.figure(figsize=(10, 6))
phase_expression_distribution.T.boxplot()
plt.title('Expression distribution of all genes for each phase')
plt.xlabel('Phases')
plt.ylabel('Expression values')
plt.show()

In [None]:
# # Identify genes with consistently low or high expression values across phases
# gene_outliers = []
# for gene in data.index:
#     if (data.loc[gene] < threshold).all() or (data.loc[gene] > threshold).all():
#         gene_outliers.append(gene)

# # Filter out genes with consistent outlier behavior
# filtered_data = data.drop(gene_outliers, axis=0)

# Filter out genes

In [None]:
tpm_cols

In [None]:
# Highest TPM of each gene across all samples. Uses the vectorised row-wise
# max instead of calling Python's max() once per row.
df_no_all_zeros['max_tpm'] = df_no_all_zeros[tpm_cols].max(axis=1)
df_no_all_zeros.head()

In [None]:
TPM_THRESHOLD_PERCENTILE = 0

In [None]:
np.percentile(tpms_flattened[tpms_flattened != 0], TPM_THRESHOLD_PERCENTILE)

In [None]:
st.percentileofscore(tpms_flattened[tpms_flattened != 0], np.percentile(tpms_flattened[tpms_flattened != 0], TPM_THRESHOLD_PERCENTILE), kind='weak')

In [None]:
df_tpm_filtered = df_no_all_zeros.loc[df_no_all_zeros['max_tpm'] > np.percentile(tpms_flattened[tpms_flattened != 0], TPM_THRESHOLD_PERCENTILE)]
df_no_all_zeros.shape, df_tpm_filtered.shape

In [None]:
df_tpm_filtered.loc[df_tpm_filtered['TTHERM_ID'].isin(target_ttherms)]

In [None]:
diff_cols

In [None]:
def _smallest_nonzero(values):
    """Return the smallest non-zero entry of ``values``, or 0 if there is none."""
    nonzero = values[values != 0]
    return np.min(nonzero) if len(nonzero) > 0 else 0


# Per gene: the smallest non-zero value among the *_diff columns.
df_no_all_zeros['min_diff'] = df_no_all_zeros[diff_cols].apply(_smallest_nonzero, axis=1)
df_no_all_zeros.head()

In [None]:
df_no_all_zeros_zero_min_diffs = df_no_all_zeros.loc[df_no_all_zeros['min_diff'] == 0].loc[:, ['TTHERM_ID']+diff_cols]
df_no_all_zeros_zero_min_diffs.shape

In [None]:
df_no_all_zeros.loc[df_no_all_zeros['TTHERM_ID']=='YF00000043.t1'].loc[:, ['TTHERM_ID'] + diff_cols]

In [None]:
DIFF_THRESHOLD_PERCENTILE = 0

In [None]:
np.percentile(diffs_flattened[diffs_flattened != 0], DIFF_THRESHOLD_PERCENTILE)

In [None]:
st.percentileofscore(diffs_flattened[diffs_flattened != 0], np.percentile(diffs_flattened[diffs_flattened != 0], DIFF_THRESHOLD_PERCENTILE), kind='weak')

In [None]:
df_diff_filtered = df_no_all_zeros.loc[df_no_all_zeros['min_diff'] > np.percentile(diffs_flattened[diffs_flattened != 0], DIFF_THRESHOLD_PERCENTILE)]
df_no_all_zeros.shape, df_diff_filtered.shape

In [None]:
list(np.unique(max_tpm_ttherms))

In [None]:
df_diff_filtered.loc[df_diff_filtered['TTHERM_ID'].isin(list(np.unique(max_tpm_ttherms)))]

In [None]:
# Genes that pass both filters. Both filtered tables are subsets of
# df_no_all_zeros, so selecting rows by TTHERM_ID membership keeps a single
# set of columns; merging the two would duplicate every shared column with
# _x/_y suffixes.
passes_tpm_filter = df_diff_filtered['TTHERM_ID'].isin(df_tpm_filtered['TTHERM_ID'])
df_filtered = df_diff_filtered.loc[passes_tpm_filter].copy()
print('# genes in df_tpm_filtered:', df_tpm_filtered.shape[0])
print('# genes in df_diff_filtered:', df_diff_filtered.shape[0])
print('# genes in df_filtered:', df_filtered.shape[0])

In [None]:
df_no_all_zeros.loc[df_no_all_zeros['TTHERM_ID'].isin(list(np.unique(max_tpm_ttherms)))]

In [None]:
df_filtered_diffs_flattened = df_diff_filtered.loc[:, diff_cols].to_numpy().flatten()

In [None]:
# plt.hist(df_filtered_diffs_flattened, bins=10000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='line at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('∆TPM Between Duplicates')
# plt.ylabel('# of ∆TPMs')
# plt.title(f'Distribution of ∆TPM Between Duplicates')

# plt.show()

# plt.hist(df_filtered_diffs_flattened, bins=10000, edgecolor='black')

# # line_num = 200
# # plt.axvline(x=line_num, color='red', linestyle='dashed', linewidth=0.5, label='line at x={}'.format(line_num))
# # plt.legend()

# plt.xlabel('∆TPM Between Duplicates')
# plt.ylabel('# of ∆TPMs')
# plt.title(f'Distribution of ∆TPM Between Duplicates')

# plt.ylim(top=200)

# plt.show()

# Export the filtered TPM data to CSV

# FILTER BY RAW COUNT SELECTION

In [None]:
df_no_all_zeros.shape, df_all_tpm.shape

In [None]:
# df_filtered = df_no_all_zeros # FIXME
# df_filtered = df_no_all_zeros.loc[df_no_all_zeros['TTHERM_ID'].isin(complete_data_df_filtered['TTHERM_ID'].values)]

df_filtered = df_all_tpm.loc[df_all_tpm['TTHERM_ID'].isin(df_all_cpm_filtered['TTHERM_ID'].values)]

In [None]:
df_filtered.shape

In [None]:
df_filtered.loc[df_filtered['TTHERM_ID'].isin(df_all_zeros['TTHERM_ID'].values)]

In [None]:
df_filtered.head()

In [None]:
df_filtered[tpm_cols].max(axis=None)

In [None]:
df_filtered.loc[:, all_cols].to_csv('./tpm_kallisto.csv', index=False)

# Normalize expression per gene with z-score (currently disabled: the filtered data is passed through unchanged)

In [None]:
# normalized_tpm_df = normalize_expression_per_gene(df_filtered)
normalized_tpm_df = df_filtered

Sanity check

In [None]:
# Mean and standard deviation of one row's values (all columns but the first).
# The row is chosen by position: normalized_tpm_df keeps the row labels of the
# table it was filtered from, so a fixed label such as 2 may no longer exist.
row_to_check = normalized_tpm_df.iloc[2, 1:].astype(float)

row_mean = np.mean(row_to_check)
row_std = np.std(row_to_check)

print(row_mean)
print(row_std)

In [None]:
normalized_tpm_df.shape

In [None]:
normalized_tpm_df

# MEAN TYPE SELECTION

In [None]:
# df_filtered_averaged = ari_mean_df_of_duplicates(normalized_tpm_df.loc[:, all_cols])
# df_filtered_averaged = geo_mean_df_of_duplicates(df_filtered.loc[:, all_cols])
# df_filtered_averaged
df_filtered_averaged = df_filtered

In [None]:
df_filtered.loc[:, all_cols]

# Filter out genes with all zeros

In [None]:
df_filtered_averaged_with_all_zeros = df_filtered_averaged[(df_filtered_averaged.iloc[:, 1:] == 0.0).all(axis=1)]

print(df_filtered_averaged_with_all_zeros.shape)

df_filtered_averaged_with_all_zeros.head()

In [None]:
# Remove the all-zero genes by their index labels. The existing row labels
# are kept (no index reset).
df_filtered_averaged_no_all_zeros = df_filtered_averaged.drop(index=df_filtered_averaged_with_all_zeros.index)
print(df_filtered_averaged.shape[0])
print(df_filtered_averaged_no_all_zeros.shape[0])
print(df_filtered_averaged.shape[0]-df_filtered_averaged_no_all_zeros.shape[0])
df_filtered_averaged_no_all_zeros.head()

In [None]:
df_filtered_averaged_no_all_zeros[df_filtered_averaged_no_all_zeros['TTHERM_ID'].isin(satellite_ttherm_ids)]

# SATELLITE GENE TPMS

In [None]:
(df_no_all_zeros[df_no_all_zeros['TTHERM_ID'].isin(satellite_ttherm_ids)])[all_cols]

In [None]:
df_filtered_averaged_no_all_zeros

In [None]:
df_filtered_averaged_no_all_zeros[df_filtered_averaged_no_all_zeros.isna().any(axis=1)]

In [None]:
df_filtered_averaged_no_all_zeros.sample(10)

In [None]:
df_filtered_averaged_no_all_zeros.to_csv('./kallisto.csv', index=False)