In [1]:
import os

import numpy as np
import numpy.typing as npt
import pandas as pd

from Bio.Seq import Seq

In [None]:
def load_k50_data(
    count_filedir: str, qpcr_raw_filepath: str
) -> tuple[dict[str, npt.NDArray], dict[str, int]]:
    """Load the raw NGS count tables and the raw qPCR table.

    Args:
        count_filedir: Directory holding ``NGS_count_lib1.csv`` through
            ``NGS_count_lib4.csv``. A trailing path separator is optional.
        qpcr_raw_filepath: Path to the raw qPCR CSV file.

    Returns:
        A ``(libdata, seqids)`` tuple.

        ``libdata`` maps names to numpy arrays. For each library ``n`` in 1-4:

        * ``v{n}_counts_c0``: counts from the first column of each replicate,
          indexed ``(protease, replicate, sequence)``.
        * ``v{n}_counts_cg0``: counts from the remaining 11 columns of each
          replicate, indexed ``(protease, replicate, column, sequence)``.
        * ``v{n}_seqids``: the sequence ID of every row of that library.

        The protease axis is ordered ``("C", "T")``. ``libdata`` also holds
        ``log_expected_protease_conc`` (shape ``(2, 11)``),
        ``qpcr_log_protease_conc`` and ``qpcr_log2_survival``.

        ``seqids`` maps each amino-acid sequence to an integer ID that is
        shared across all four libraries.

    Raises:
        AssertionError: If the two proteases in the qPCR table were not
            recorded at identical concentrations.
    """

    def load_raw() -> tuple[list[pd.DataFrame], dict[str, int]]:
        """Read the four count tables and assign cross-library sequence IDs."""
        seqids: dict[str, int] = {}
        dfs = []
        for lib_ind in range(1, 5):

            # Load the table, drop the columns that are not needed, and merge
            # rows sharing an amino-acid sequence by summing their counts.
            # os.path.join makes the trailing separator on count_filedir optional.
            df = (
                pd.read_csv(
                    os.path.join(count_filedir, f"NGS_count_lib{lib_ind}.csv")
                )
                .drop(columns=["name", "dna_seq"])
                .groupby("aa_seq", as_index=False)
                .sum()
            )

            # A sequence seen for the first time receives the next free ID,
            # which equals the number of sequences registered so far.
            df["seq_id"] = [
                seqids.setdefault(seq, len(seqids)) for seq in df["aa_seq"]
            ]

            dfs.append(df)

        return dfs, seqids

    def gather_counts(dfs: list[pd.DataFrame]) -> dict[str, npt.NDArray]:
        """Convert each library's count table into stacked numpy arrays."""
        # Column suffixes of the two replicates: 01-12 and 13-24
        replicate_suffixes = (
            [f"{j:02d}" for j in range(1, 13)],
            [f"{j:02d}" for j in range(13, 25)],
        )

        libdata = {}
        for lib_ind, df in enumerate(dfs, 1):
            key = f"v{lib_ind}"

            # Resulting axes: (protease, replicate, column, sequence)
            protease_counts = np.stack(
                [
                    np.stack(
                        [
                            df[[f"{key}_{protease}{suffix}" for suffix in suffixes]]
                            .to_numpy(dtype=int)
                            .T
                            for suffixes in replicate_suffixes
                        ]
                    )
                    for protease in ("C", "T")
                ]
            )

            # The first column of every replicate is stored separately from the rest
            libdata[f"{key}_counts_cg0"] = protease_counts[:, :, 1:]
            libdata[f"{key}_counts_c0"] = protease_counts[:, :, 0]
            libdata[f"{key}_seqids"] = df["seq_id"].to_numpy()

        return libdata

    def load_qpcr() -> dict[str, npt.NDArray]:
        """Load the raw qPCR table and split it by protease."""
        qpcr = pd.read_csv(qpcr_raw_filepath)
        tryp = qpcr[qpcr["protease"] == "trypsin"]
        chymo = qpcr[qpcr["protease"] == "chymotrypsin"]

        # Both proteases must share one concentration series. array_equal also
        # rejects a length mismatch instead of broadcasting or raising.
        protease_conc = chymo["protease_con"].to_numpy()
        assert np.array_equal(
            protease_conc, tryp["protease_con"].to_numpy()
        ), "trypsin and chymotrypsin concentrations differ"

        # Survival values are selected by position (columns 1-8). The first
        # row of each protease is skipped, matching protease_conc[1:] below.
        tryp_survival = tryp.iloc[1:, 1:9].to_numpy()
        chymo_survival = chymo.iloc[1:, 1:9].to_numpy()

        return {
            "qpcr_log_protease_conc": np.log(protease_conc[1:]),
            "qpcr_log2_survival": np.stack([chymo_survival, tryp_survival]),
        }

    # Get count data
    dfs, seqids = load_raw()
    libdata = gather_counts(dfs)

    # Expected log concentrations: an 11-point three-fold series ending at
    # log(25), plus a second series shifted up by half a log(3) step.
    conc1 = np.log(25) - np.arange(10, -1, -1) * np.log(3)
    conc2 = 0.5 * np.log(3) + conc1
    libdata["log_expected_protease_conc"] = np.stack([conc1, conc2])

    # Add qpcr data
    libdata.update(load_qpcr())

    return libdata, seqids

In [3]:
# load_k50_data returns a (libdata, seqids) tuple, so k50_data[0] is the dict of
# arrays and k50_data[1] is the sequence -> ID mapping.
k50_data = load_k50_data(
    count_filedir="flip3_data/raw_data/tsuboyama/Raw_NGS_count_tables/",
    qpcr_raw_filepath="flip3_data/raw_data/tsuboyama/Pipeline_qPCR_data/Raw_qPCR_data_FigS1.csv"
)

  "qpcr_log_protease_conc": np.log(protease_conc),


In [6]:
# Library-1 counts from the first column of each replicate.
# Axes: (protease, replicate, sequence).
k50_data[0]["v1_counts_c0"].shape

(2, 2, 297699)

In [5]:
# NOTE(review): load_qpcr returns np.log(protease_conc[1:]), i.e. one entry fewer
# than the raw concentration column. If the recorded output still shows the full
# length, it predates that change — re-run to confirm.
k50_data[0]["qpcr_log_protease_conc"].shape

(12,)

In [7]:
# Axes: (protease [chymotrypsin, trypsin], concentration row, survival column).
k50_data[0]["qpcr_log2_survival"].shape

(2, 11, 8)

In [None]:
# NOTE(review): absolute, machine-specific path. Presumably this is the same file
# that is passed to load_k50_data as qpcr_raw_filepath — confirm, then point this
# at a configurable data directory.
qpcr_filepath = "/home/bwittmann/GitRepos/cdna-display-proteolysis-pipeline/Pipeline_qPCR_data/Raw_qPCR_data_FigS1.csv"

# Read the raw qPCR table and split its rows by protease
qpcr = pd.read_csv(qpcr_filepath)
tryp = qpcr.loc[qpcr["protease"] == "trypsin"]
chymo = qpcr.loc[qpcr["protease"] == "chymotrypsin"]

# The two proteases are expected to share one concentration series
protease_conc = chymo["protease_con"].to_numpy()
assert np.all(protease_conc == tryp["protease_con"].to_numpy())

# Survival values are picked by position (columns 1-8); every row is kept here
chymo_survival = chymo.iloc[:, 1:9].to_numpy()
tryp_survival = tryp.iloc[:, 1:9].to_numpy()


In [None]:
# (rows, 8): all trypsin rows by the 8 positionally selected columns. Unlike
# load_qpcr, the first row is not dropped here.
tryp_survival.shape

(12, 8)

In [None]:
# Display the trypsin rows of the raw qPCR table
tryp

Unnamed: 0,protease_con,ec5.1,ec50_4.9,ec50_4.4,ec50_4.2,ec50_3,ec50_2,ec50_0,4cuts,protease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,trypsin
1,0.000423,0.627874,0.205186,0.146026,0.35118,0.156886,0.179236,-0.143042,0.291431,trypsin
2,0.00127,0.682212,0.13765,0.21422,0.22638,0.081019,0.026974,-0.233269,0.16725,trypsin
3,0.00381,0.368126,0.387063,0.303972,0.252836,0.130429,-0.045532,-0.590464,-0.081227,trypsin
4,0.011431,0.431367,0.353191,0.351618,0.511762,0.173152,0.065722,-0.843568,-0.579334,trypsin
5,0.034294,0.194988,0.258538,0.233,0.334137,-0.183751,-0.553425,-2.443772,-2.645948,trypsin
6,0.102881,-0.210084,-0.006398,-0.017008,-0.340219,-0.929618,-2.180667,-5.35879,-5.919798,trypsin
7,0.308642,-0.403575,-0.589131,-0.448072,-0.649178,-2.839089,-5.350209,-6.821456,-6.887197,trypsin
8,0.925926,-0.692816,-0.428798,-0.55909,-0.896552,-5.08056,-6.661476,-7.148223,-6.922888,trypsin
9,2.777778,-0.256332,-0.700245,-1.451881,-3.020199,-6.689945,-7.098805,-7.490998,-7.088666,trypsin


In [None]:
# Sanity check: chymotrypsin and trypsin rows list identical concentrations
# (the same condition is asserted when the frames are split by protease).
(chymo.protease_con.values == tryp.protease_con.values).all()

np.True_

In [None]:

""