In [24]:
import os
import pandas as pd
import ast

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

## get the count data

In [25]:
# Folder with the count tables produced by the previous step, and the folder
# this notebook writes its results to.
input_dir = "6_counts"
output_dir = "7_log2fc"

# Create the output directory; exist_ok=True makes this safe to re-run and
# avoids the race between a separate "exists" check and the creation.
os.makedirs(output_dir, exist_ok=True)

In [34]:
# Read the library design table; its first column becomes the index that is
# compared against the count table index below.
design_df = pd.read_csv(
    "2_references/library2_sequences.csv",
    index_col=0,
)

In [35]:
# Load the count table. At this point samples are columns and designs are
# rows; it is transposed later to the layout PyDESeq2 expects
# (samples as rows, genes as columns).
counts_df = pd.read_csv(os.path.join(input_dir, "all_counts.csv"), index_col=0)

# Keep only the columns whose name contains "3UTR".
utr_columns = [name for name in counts_df.columns if "3UTR" in name]
# sorted() is stable: columns containing "DNA" (key False) move to the front,
# all other columns keep their order from the file.
dna_first_columns = sorted(utr_columns, key=lambda name: "DNA" not in name)
counts_3utr_df = counts_df[dna_first_columns]

In [None]:
# Compare the IDs of the design file with the IDs of the count table.
design_ids = set(design_df.index)
counted_ids = set(counts_df.index)

# IDs that are designed but have no row in the count table
print("Indices in design_df but not in counts_df:")
print(design_ids.difference(counted_ids))
# IDs that were counted but do not appear in the design file
print("Indices in counts_df but not in design_df:")
print(counted_ids.difference(design_ids))

In [None]:
# A design is kept only if the two DNA replicates together have more reads
# than this threshold.
min_dna_reads = 100
dna_read_total = counts_3utr_df["DNA_3UTR_r1"] + counts_3utr_df["DNA_3UTR_r2"]
threeUTRfilter = dna_read_total > min_dna_reads

# number of designs before filtering
print(len(counts_3utr_df))

# Apply the filter. The explicit .copy() makes the result an independent
# frame, so that rows can be assigned to it later via .loc without pandas
# treating it as a slice of the unfiltered table.
counts_3utr_df = counts_3utr_df[threeUTRfilter].copy()

# designs that were discarded (boolean negation instead of "== False")
print(threeUTRfilter[~threeUTRfilter].index)

# number of designs after filtering
print(len(counts_3utr_df))

In [None]:
# Show the IDs of the designs that did not pass the DNA read filter
# (the last expression is rendered as the cell output).
rejected_mask = ~threeUTRfilter
threeUTRfilter[rejected_mask].index

In [113]:
# Export every remaining design that has fewer than 10 reads in at least one
# sample, so that low-coverage designs can be inspected by hand.
has_low_count = (counts_3utr_df < 10).any(axis=1)
low_count_designs = counts_3utr_df[has_low_count]
low_count_designs.to_excel(f'{output_dir}/low_counts.xlsx')

## restore identical designs

In [114]:
# Load the table of duplicated sequences; its 'dup' column is parsed into
# Python lists in the next cell.
duplicated = pd.read_excel(
    "2_references/library2_duplicated_sequences.xlsx",
)

In [115]:
# Turn the 'dup' column into real Python lists. Only string cells are parsed,
# so re-running this cell after the column has already been converted is a
# no-op instead of raising inside ast.literal_eval.
duplicated['dup'] = duplicated['dup'].apply(
    lambda entry: ast.literal_eval(entry) if isinstance(entry, str) else entry
)

In [116]:
# For every pair of duplicated designs, copy the counts of the member that is
# present in the count table onto the other member.
for dup_ids in duplicated['dup']:
    # only the first two entries of each 'dup' list are used
    first_id, second_id = dup_ids[0], dup_ids[1]

    if first_id in counts_3utr_df.index:
        source_id, target_id = first_id, second_id
    elif second_id in counts_3utr_df.index:
        source_id, target_id = second_id, first_id
    else:
        # neither member is in the count table: nothing to copy
        print(f"Neither {first_id} nor {second_id} are in the counts")
        continue

    # adds the target row if it is missing, otherwise overwrites it
    counts_3utr_df.loc[target_id, :] = counts_3utr_df.loc[source_id, :]

## continue processing: build the metadata and run PyDESeq2

In [118]:
# Flip the count table so that samples are rows and genes (designs) are
# columns. Note that running this cell twice flips it back.
counts_3utr_df = counts_3utr_df.T

In [119]:
# Sample metadata for PyDESeq2: one row per sample, a single "condition"
# column.
# Conditions "A" to "M" are assigned in sample order, two consecutive samples
# per condition (26 labels in total). The first group ("A") is the reference
# group.
condition_letters = "ABCDEFGHIJKLM"
condition_labels = [letter for letter in condition_letters for _ in range(2)]

metadata_3utr_df = pd.DataFrame(
    {"condition": condition_labels},
    index=counts_3utr_df.index,
)

In [127]:
# Names used by the DESeq2 cells below; these are references to the 3'UTR
# tables, not copies.
current_count_df, current_metadata_df = counts_3utr_df, metadata_3utr_df

In [128]:
%%capture output

# Inference backend shared by the model fit and the statistical tests below.
inference = DefaultInference(n_cpus=4)

# Build the DESeq2 data set from the count table and the sample metadata,
# using "condition" as the only design factor.
dds_settings = dict(
    counts=current_count_df,
    metadata=current_metadata_df,
    design_factors="condition",
    refit_cooks=True,
    inference=inference,
)
dds = DeseqDataSet(**dds_settings)

# Run the DESeq2 fit on the data set.
dds.deseq2()

In [129]:
%%capture output

# The first condition in the metadata is the reference group; every other
# condition is contrasted against it. Taking both from the same array keeps
# the skipped entry and the reference level of the contrast in sync.
conditions = current_metadata_df["condition"].unique()
reference_condition = conditions[0]

# One results table per non-reference condition, in the order of conditions[1:].
result_list = []
for condition in conditions[1:]:
    stat_res = DeseqStats(
        dds,
        inference=inference,
        contrast=["condition", condition, reference_condition],
    )
    # summary() is called before results_df is read
    stat_res.summary()
    result_list.append(stat_res.results_df)

In [130]:
# Build a lookup table from condition label to cell line name.
cell_line_df = current_metadata_df.copy()

# The cell line name is made of the first two "_"-separated fields of the
# sample name. Splitting outside of the f-string avoids reusing double quotes
# inside a double-quoted f-string, which is a SyntaxError before Python 3.12.
# Assumes every sample name has at least two "_"-separated fields.
sample_name_parts = [sample.split("_") for sample in cell_line_df.index]
cell_line_df["cell_line"] = [f"{parts[0]}_{parts[1]}" for parts in sample_name_parts]

# Index by condition and keep only the first sample of each condition.
cell_line_df = cell_line_df.set_index("condition")
cell_line_df = cell_line_df[~cell_line_df.index.duplicated(keep="first")]

In [131]:
# Prefix the columns of every results table with the cell line name of the
# condition it belongs to. result_list is in the order of the non-reference
# conditions, so the two sequences can be walked in parallel.
tested_conditions = current_metadata_df["condition"].unique()[1:]
for condition, results_df in zip(tested_conditions, result_list):
    cell_line = cell_line_df.loc[condition, 'cell_line']
    # the table is renamed in place, so the entry in result_list is updated too
    results_df.columns = [f"{cell_line}_{col}" for col in results_df.columns]

In [132]:
# Join the per-condition result tables side by side into one wide table.
results_df_3UTR = pd.concat(result_list, axis="columns")

## concatenate counts and inferred stabilities

In [135]:
# Flip the count table back so that designs are rows and samples are columns.
counts_3utr_df = counts_3utr_df.T
# Mark the raw count columns with a "count_" prefix.
prefixed_names = [f"count_{sample}" for sample in counts_3utr_df.columns]
counts_3utr_df.columns = prefixed_names
# Put the counts and the DESeq2 results side by side (aligned on the index).
output_df = pd.concat([counts_3utr_df, results_df_3UTR], axis=1)
# Write the combined table to the output directory.
output_path = os.path.join(output_dir, "library2_log2fc_with_UMIs.csv")
output_df.to_csv(output_path)