### Alternative Splicing Analysis

This section covers the alternative splicing analyses conducted in the manuscript.


In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from functools import reduce
# import scib
import scanpy as sc
import anndata
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

### Generate PSI adata

In [None]:
# Input/output settings for building the PSI AnnData object.
# Root folder of the outrigger run; its "psi" and "index" subfolders are read
# by later cells.
outrigger_path = "./outrigger_output/"
study_name = "STUDY"
# Base name (without extension) of the PSI .h5ad file written further down.
output_name = study_name + "_PSI_N10_exon"
# Path to the DOLPHIN latent .h5ad file. This is kept as a path (not a loaded
# AnnData object) because a later cell passes it to the anndata reader.
DOLPHIN_latent = "DOLPHIN_Z.h5ad"

main_folder = "Path_to_save_output_data"
output_directory = os.path.join(main_folder, "final_data")

In [None]:
# Load the DOLPHIN latent AnnData. `DOLPHIN_latent` may hold either a path to
# an .h5ad file or an AnnData object that has already been read; passing an
# AnnData object to the file reader would fail, so both cases are handled.
if isinstance(DOLPHIN_latent, anndata.AnnData):
    adata_lat = DOLPHIN_latent
else:
    # read_h5ad is the explicit reader; the bare `anndata.read` alias is deprecated.
    adata_lat = anndata.read_h5ad(DOLPHIN_latent)
adata_lat

In [None]:
# Cell names, in the order of the latent AnnData's obs index.
all_sample = adata_lat.obs.index.tolist()
len(all_sample)

In [None]:
def _before_first_dot(sample_id):
    """Return the part of *sample_id* that precedes its first '.'."""
    return sample_id.split(".")[0]


# PSI summary table read from <outrigger_path>/psi/outrigger_summary.csv.
pd_psi_single = pd.read_csv(os.path.join(outrigger_path, "psi", "outrigger_summary.csv"))
# Trim each sample id at its first dot; a later cell compares sample_id with
# the cell names taken from the latent AnnData.
pd_psi_single["sample_id"] = pd_psi_single["sample_id"].map(_before_first_dot)

In [None]:
# Build one two-column table (event_id, <sample name>) per cell and store it
# in `d`, keyed by the sample name as a string.
# The PSI table is split by sample_id once up front instead of being scanned
# with a boolean mask for every sample.
_psi_by_sample = {
    sample_id: rows
    for sample_id, rows in pd_psi_single.groupby("sample_id", sort=False)
}
# Samples with no PSI rows still get an (empty) table, as a boolean mask would give.
_no_rows = pd_psi_single.iloc[0:0]

d = {}
for _srr in tqdm(all_sample):
    _temp_df = _psi_by_sample.get(_srr, _no_rows)
    # The psi column is renamed to the sample name so that the per-sample
    # tables can later be joined side by side on event_id.
    _temp_df = _temp_df.rename(columns={"psi": _srr})
    d[str(_srr)] = _temp_df[["event_id", _srr]]

In [None]:
# Per-sample tables in the insertion order of `d`.
df_merge_list = list(d.values())

In [None]:
def _outer_join_on_event(left, right):
    """Outer-join two tables on their shared event_id column."""
    return pd.merge(left, right, on = "event_id", how='outer')


# Fold all per-sample tables into one event_id x sample table.
df_merged = reduce(_outer_join_on_event, df_merge_list).set_index("event_id")
# Flip to sample x event_id so rows line up with the obs index.
df_recon = df_merged.T
# Attach the obs annotations of the latent AnnData (index-on-index join).
df_obs_org = pd.DataFrame(adata_lat.obs).merge(df_recon, left_index=True, right_index=True)

In [None]:
# Read the MXE and SE event tables from the outrigger index folder; they are
# used to look up each AS event and its corresponding gene name.
_index_dir = os.path.join(outrigger_path, "index")
pd_mxe_event = pd.read_csv(os.path.join(_index_dir, "mxe/events.csv"))
pd_se_event = pd.read_csv(os.path.join(_index_dir, "se/events.csv"))

In [None]:
def _most_frequent_gene(events, name_col, count_col):
    """Return one row per event_id holding its most frequent gene label.

    events: table with an "event_id" column and *name_col*.
    name_col: column holding the gene label to count.
    count_col: name given to the occurrence-count column of the result.
    """
    freq = (
        events[["event_id", name_col]]
        # dropna=False keeps events whose gene label is missing.
        .groupby(["event_id", name_col], dropna=False)
        .size()
        .to_frame(count_col)
        .reset_index()
    )
    # The descending sort puts the highest count first within each event_id.
    return (
        freq.sort_values(["event_id", count_col], ascending=False)
        .groupby("event_id")
        .head(1)
    )


def _unique_genes(names):
    """Drop repeated entries from a comma-separated gene list.

    First-seen order is kept so the result is identical from run to run
    (a set would order the names arbitrarily).
    """
    return ",".join(dict.fromkeys(names.split(",")))


# Combined event table, tagged with the event type of its source table.
pd_mxe_event["AS_event_type"] = "MXE"
pd_se_event["AS_event_type"] = "SE"
pd_event = pd.concat([pd_mxe_event, pd_se_event], ignore_index=True)

# Use the gene id wherever the gene name is missing. The filled column is
# assigned back explicitly: an in-place fillna on an attribute-accessed column
# is a chained operation that does not update the frame under Copy-on-Write.
pd_event["isoform1_gene_name_mod"] = pd_event["isoform1_gene_name"].fillna(pd_event["isoform1_gene_id"])
pd_event["isoform2_gene_name_mod"] = pd_event["isoform2_gene_name"].fillna(pd_event["isoform2_gene_id"])

# Most frequent gene label per event, separately for each isoform.
pd_event_isoform1 = pd_event[["event_id", "isoform1_gene_name_mod"]]
pd_event_isoform1_freq = _most_frequent_gene(pd_event_isoform1, "isoform1_gene_name_mod", "count1")

pd_event_isoform2 = pd_event[["event_id", "isoform2_gene_name_mod"]]
pd_event_isoform2_freq = _most_frequent_gene(pd_event_isoform2, "isoform2_gene_name_mod", "count2")

# Merge the two isoform tables to get the final genes per event.
pd_event_gene = pd.merge(pd_event_isoform1_freq, pd_event_isoform2_freq, on="event_id")

# Pick the gene label of each event:
#   both isoforms agree      -> that label
#   only one isoform labeled -> that isoform's label
#   isoforms disagree        -> "label1,label2"
#   neither labeled          -> "Empty"
_name1 = pd_event_gene["isoform1_gene_name_mod"]
_name2 = pd_event_gene["isoform2_gene_name_mod"]
_has1 = _name1.notna()
_has2 = _name2.notna()
pd_event_gene["gene_name"] = np.select(
    [
        _has1 & (_name1 == _name2),
        _has1 & ~_has2,
        _has2 & ~_has1,
        _has1 & _has2 & (_name1 != _name2),
        ~_has1 & ~_has2,
    ],
    [
        _name1,
        _name1,
        _name2,
        _name1 + "," + _name2,
        "Empty",
    ],
    # The conditions above cover every case; a string default only keeps the
    # result free of np.select's integer 0 fallback.
    default="Empty",
)

# Remove duplicate gene names when an event lists more than one gene.
pd_event_gene["gene_name"] = pd_event_gene["gene_name"].apply(_unique_genes)
pd_event_gene = pd_event_gene[["event_id", "gene_name"]]

# Lookup table: event_id -> gene name(s).
dict_event_gene = dict(zip(pd_event_gene.event_id, pd_event_gene.gene_name))

In [None]:
# Save the original PSI table as an AnnData file.
# df_obs_org holds the obs annotation columns first, followed by one column
# per splicing event.
_n_obs_cols = adata_lat.obs.shape[1]

## dataframe for annotating the observations = sample name
obs = df_obs_org.iloc[:, :_n_obs_cols]

## dataframe for annotating the variables = event id
# Event ids are used as the index; the gene name is stored as a column.
# Column labels are read directly instead of transposing the whole table.
var_names = df_obs_org.columns[_n_obs_cols:].values
var = pd.DataFrame(index=var_names)
# Events without an entry in dict_event_gene keep their event id as gene_name.
# A plain dict lookup is used because DataFrame.replace with a very large
# mapping is slow.
var["gene_name"] = [dict_event_gene.get(event_id, event_id) for event_id in var_names]

## the data matrix
# Cast to float32 here rather than through the deprecated `dtype` argument of AnnData.
X = df_obs_org.iloc[:, _n_obs_cols:].to_numpy(dtype=np.float32)
adata = anndata.AnnData(X, obs=obs, var=var)

# Make sure the target folder exists before writing.
os.makedirs(output_directory, exist_ok=True)
adata.write(os.path.join(output_directory, output_name+".h5ad")) # original note: 2842*61860
adata

### Differential Alternative Splicing Events

In [None]:
# Path of the PSI file saved under the current output_name. This must be
# computed before output_name is switched to the GO output name below.
anndata_psi = os.path.join(output_directory, f"{output_name}.h5ad")
output_name = f"{study_name}_PSI_N10_GO"

In [None]:
# Reload the PSI AnnData from disk and expose its data matrix as a DataFrame
# (rows = observations, columns = variables).
adata = anndata.read_h5ad(anndata_psi)
df_psi_raw = adata.to_df()
# Display the table as the cell output.
df_psi_raw

In [None]:
# Number of cells per event: after transposing, each row is one event, and
# "count" is how many of its values are >= 0.0 (NaN never satisfies the
# comparison, so missing values are not counted).
df_psi_raw_t = df_psi_raw.transpose()
df_psi_raw_t["count"] = (df_psi_raw_t >= 0.0).sum(axis=1)

In [None]:
# Histogram of the number of cells per splicing event.
# NOTE: the bin edges are fixed at 1, 26, ..., 776, so counts outside 1-776
# are not drawn.
freq, bins, patches = plt.hist(list(df_psi_raw_t["count"]), edgecolor='white', label='d', bins=range(1,800,25))

# x coordinate for labels: the center of each bin
bin_centers = np.diff(bins)*0.5 + bins[:-1]

# Write the number of events above every bar.
for bar_count, x in zip(freq, bin_centers):
    height = int(bar_count)
    plt.annotate("{}".format(height),
                 xy = (x, height),             # top center of the histogram bar
                 xytext = (0,0.2),             # offsetting label position above its bar
                 textcoords = "offset points", # Offset (in points) from the *xy* value
                 ha = 'center', va = 'bottom',
                 fontsize = 6
                 )

plt.title("Number of cells per Splicing Event")
plt.xlabel("Number of Cells")
plt.ylabel("Number of Events")
# plt.show must be called; the bare attribute only echoes the function object.
plt.show()

In [None]:
# Only keep events that are present in at least this many cells (inclusive).
min_cells_per_event = 10
# The helper "count" column is dropped on the filtered result directly; an
# in-place drop on a filtered frame raises SettingWithCopyWarning.
df_psi_raw_t_filter = df_psi_raw_t.loc[df_psi_raw_t["count"] >= min_cells_per_event].drop(columns=["count"])
# Back to cells as rows and events as columns.
df_psi_raw_filter = df_psi_raw_t_filter.T

#### Average PSI Value Per Cluster to Replace NaN Values

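In this step, `NaN` PSI values are replaced with a cluster-level average. Some cells have missing values (`NaN`) for certain events. For each cell we first compute its mean PSI across the retained events; these per-cell means are then averaged within each cluster (`celltype`), and the resulting value fills every `NaN` entry of the cells in that cluster. Filling the missing values this way keeps the matrix complete for downstream analysis.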


In [None]:
# Mean of every row (one row per cell) across all columns, ignoring NaN.
_per_cell_mean = df_psi_raw_filter.mean(axis=1, skipna=True)
df_psi_raw_filter['sample_mean'] = _per_cell_mean

In [None]:
# Append the obs annotations (ground truth / cluster labels) to each cell's
# row via an index-on-index join.
df_psi_raw_filter = df_psi_raw_filter.merge(pd.DataFrame(adata.obs), left_index=True, right_index=True)

In [None]:
# Average the per-cell means within every celltype.
df_value_mean = (
    df_psi_raw_filter
    .groupby(['celltype'], as_index=False)["sample_mean"]
    .mean()
)
# Round to 3 decimals and store as text; the values are turned back into
# numbers in a later cell.
df_value_mean["sample_mean"] = df_value_mean["sample_mean"].round(3).astype(str)
# celltype -> rounded mean (as string)
dict_mean_cluster = dict(zip(df_value_mean["celltype"], df_value_mean["sample_mean"]))
dict_mean_cluster

In [None]:
# Swap each celltype label for its cluster mean, then rename the column to
# reflect its new content.
df_psi_raw_filter = (
    df_psi_raw_filter
    .replace({"celltype": dict_mean_cluster})
    .rename(columns={"celltype": "psi_mean_cluster"})
)
df_psi_raw_filter

In [None]:
# Fill the NaN entries of every column with the cluster mean of the same row.
# The fill values are strings (the original author notes that non-string
# values caused an iteration error).
_cluster_fill = df_psi_raw_filter["psi_mean_cluster"]


def _fill_from_cluster(column):
    """Replace the NaN entries of *column* with the row-aligned cluster mean."""
    return column.fillna(value=_cluster_fill)


df_psi_mod_mean_cluster = df_psi_raw_filter.apply(_fill_from_cluster)
df_psi_mod_mean_cluster

In [None]:
def _coerce_numeric(column):
    """Convert *column* to numbers; entries that cannot be parsed become NaN."""
    return pd.to_numeric(column, errors='coerce')


# Turn every column back into numeric values after the string-based fill.
cols = df_psi_mod_mean_cluster.columns
df_psi_mod_mean_cluster[cols] = df_psi_mod_mean_cluster[cols].apply(_coerce_numeric)

In [None]:
# Drop the two helper columns, then put the obs annotations (label and leiden)
# in front of the filled PSI values via an index-on-index join.
_helper_columns = ["sample_mean","psi_mean_cluster"]
df_psi_mod_mean_cluster = df_psi_mod_mean_cluster.drop(columns=_helper_columns)
df_obs_go_mean_cluster = pd.DataFrame(adata.obs).merge(df_psi_mod_mean_cluster, left_index=True, right_index=True)
# df_obs_go_mean_cluster = df_obs_go_mean_cluster.rename(columns = {"leiden":"leiden_celltype1"})
df_obs_go_mean_cluster

In [None]:
# Copy the row index into a regular "CB" column.
_row_labels = df_obs_go_mean_cluster.index
df_obs_go_mean_cluster["CB"] = _row_labels

In [None]:
# Observation annotations: the "CB" and "celltype" columns only.
_obs_columns = ["CB", "celltype"]
obs = df_obs_go_mean_cluster[_obs_columns]

In [None]:
# Convert the NaN-filled PSI table to an .h5ad file.
## dataframe for annotating the observations = sample name
obs = df_obs_go_mean_cluster[["CB", "celltype"]]

## dataframe for annotating the variables = event id
# The PSI columns are identified by name (the variables of the PSI AnnData
# loaded earlier) rather than by a fixed positional slice, so the selection
# stays correct whatever the number of obs annotation columns is.
_is_event_column = df_obs_go_mean_cluster.columns.isin(adata.var_names)
var_names = df_obs_go_mean_cluster.columns[_is_event_column].values
var = pd.DataFrame(index=var_names)
# Events without an entry in dict_event_gene keep their event id as gene_name.
# A plain dict lookup is used because DataFrame.replace with a very large
# mapping is slow.
var["gene_name"] = [dict_event_gene.get(event_id, event_id) for event_id in var_names]

## the data matrix
# Cast to float32 here rather than through the deprecated `dtype` argument of AnnData.
X = df_obs_go_mean_cluster.loc[:, _is_event_column].to_numpy(dtype=np.float32)
adata_leiden = anndata.AnnData(X, obs=obs, var=var)

# Make sure the target folder exists before writing.
os.makedirs(output_directory, exist_ok=True)
adata_leiden.write(os.path.join(output_directory, output_name+".h5ad"))