In [None]:
import numpy as np
import pandas as pd    

In [15]:
# Read the gzip-compressed CSV file of each of the five partitions into a DataFrame.
# The 'annotation_source' column is explicitly read as str in every partition.
def read_partition(number):
    """
    Read one data partition from disk.

    Args:
        number (int): Partition number used in the file name (data_partition_<number>.csv.gz).

    Returns:
        pd.DataFrame: Contents of the partition file.
    """
    return pd.read_csv(
        f"../../data/data_model/datasets/data_partition_{number}.csv.gz",
        dtype={'annotation_source': 'str'},
        compression='gzip',
    )


# Keep the original variable names so that later cells continue to work unchanged.
(
    data_partition_1,
    data_partition_2,
    data_partition_3,
    data_partition_4,
    data_partition_5,
) = (read_partition(number) for number in range(1, 6))

In [None]:
# Names of all species handled by this notebook. The order matters: it is the order in
# which the species are iterated when the partitions are processed.
species_list = [
    "Saccharomyces cerevisiae",
    "Ustilago maydis",
    "Schizosaccharomyces pombe",
    "Aspergillus nidulans",
    "Cryptococcus neoformans",
    "Neurospora crassa",
    "Coprinopsis cinerea",
    "Rhizophagus irregularis",
    "Schizophyllum commune",
    "Plasmodium falciparum",
    "Entamoeba histolytica",
    "Dictyostelium discoideum",
    "Giardia intestinalis",
    "Trypanosoma brucei",
    "Leishmania donovani",
    "Toxoplasma gondii",
    "Eimeria maxima",
    "Oryza sativa",
    "Arabidopsis thaliana",
    "Selaginella moellendorffii",
    "Brachypodium distachyon",
    "Setaria viridis",
    "Zea mays",
    "Hordeum vulgare",
    "Triticum aestivum",
    "Phoenix dactylifera",
    "Lotus japonicus",
    "Medicago truncatula",
    "Nicotiana tabacum",
    "Glycine max",
    "Solanum lycopersicum",
    "Trichoplax adhaerens",
    "Tribolium castaneum",
    "Manduca sexta",
    "Apis mellifera",
    "Strongylocentrotus purpuratus",
    "Daphnia carinata",
    "Drosophila melanogaster",
    "Anopheles gambiae",
    "Caenorhabditis elegans",
    "Gallus gallus",
    "Alligator mississippiensis",
    "Xenopus laevis",
    "Oreochromis niloticus",
    "Homo sapiens",
    "Bos taurus",
    "Mus musculus",
    "Ovis aries",
    "Canis lupus familiaris",
    "Equus caballus",
    "Gorilla gorilla",
    "Pan troglodytes",
    "Rattus norvegicus",
    "Oryctolagus cuniculus",
    "Sus scrofa",
    "Danio rerio",
    "Oryzias latipes",
    "Taeniopygia guttata",
    "Columba livia",
    "Anolis carolinensis",
]


In [None]:
def extend_sequence(row, reference_df, downstream_length=100):
    """
    Extend the sequence of a row with the nucleotides that follow it in the reference data.

    The reference entry is selected with ``reference_df['Seq_number'] == row['seq_number']``;
    if several reference rows match, the first one is used. The row's sequence is then
    located inside the reference sequence in two steps:

    1. As an exact substring (first occurrence).
    2. If that fails and the sequence starts with "N" (for example because its upstream
       part was replaced by "N" before this function is called), the part after the
       leading run of "N" is searched instead (first occurrence). Without this step a
       masked sequence can never be found in an unmasked reference and would silently
       stay unextended.

    Args:
        row (pd.Series): Row providing 'seq_number' and 'Sequence' (the sequence to extend).
        reference_df (pd.DataFrame): Reference data providing 'Seq_number' and 'Sequence'
            (the longer sequences from which the downstream nucleotides are taken).
        downstream_length (int): Maximum number of downstream nucleotides to append.
            Defaults to 100.

    Returns:
        str: The original sequence followed by up to ``downstream_length`` nucleotides
        (fewer if the reference sequence ends earlier). The original sequence is returned
        unchanged when no reference row matches, when the reference sequence is not a
        string, or when the sequence cannot be located in the reference sequence.

    Raises:
        AssertionError: If ``row['Sequence']`` is not a string.
    """
    sequence = row['Sequence']
    assert isinstance(sequence, str), "Original sequence is not a string."

    # Find the matching row in reference_df using the seq_number/Seq_number mapping
    match = reference_df[reference_df['Seq_number'] == row['seq_number']]
    if match.empty:
        return sequence

    reference_sequence = match.iloc[0]['Sequence']
    if not isinstance(reference_sequence, str):
        # E.g. NaN for a missing value: nothing to extend from.
        return sequence

    overlap_pos = reference_sequence.find(sequence)
    if overlap_pos != -1:
        downstream_start = overlap_pos + len(sequence)
    else:
        # Fallback for sequences whose upstream part consists of "N": anchor on the
        # remaining (unmasked) part instead of the full sequence.
        unmasked_part = sequence.lstrip("N")
        if unmasked_part == sequence or not unmasked_part:
            # No leading "N" to ignore, or nothing left to anchor on.
            return sequence
        anchor_pos = reference_sequence.find(unmasked_part)
        if anchor_pos == -1:
            return sequence
        downstream_start = anchor_pos + len(unmasked_part)

    # Slicing past the end of the reference simply yields a shorter (possibly empty) tail.
    downstream_seq = reference_sequence[downstream_start:downstream_start + downstream_length]
    return sequence + downstream_seq

In [None]:
def mask_non_TIS_samples(data_partition, number, seed=42):
    """
    Mask the upstream part of non-TIS sequences so that they mirror the "N" content of
    the TIS sequences, extend all sequences, and save the result as a gzip-compressed CSV.

    For every species in the module-level ``species_list`` that occurs in the partition:

    1. The fraction of TIS sequences (``TIS == 1``) containing "N" and the distribution
       of their "N" counts are determined.
    2. The same fraction of non-TIS sequences (``TIS == 0``) is sampled, and in each
       sampled sequence the first k characters are replaced by "N", with k drawn from
       the TIS "N"-count distribution.
    3. The species' original TIS / non-TIS source files are read and every sequence of
       the species is passed through ``extend_sequence`` with them as reference.

    The rows of all species are concatenated (TIS rows first, then non-TIS rows, species
    in ``species_list`` order) and written to
    ``../../data/data_model/datasets/data_partition_<number>_masked_extended.csv.gz``.

    Args:
        data_partition (pd.DataFrame): Partition to process. Must provide the columns
            'Species', 'TIS' and 'Sequence' (plus whatever ``extend_sequence`` reads).
            The index is assumed to be unique, as produced by ``pd.read_csv``.
        number (int): The partition number, used in the output file name.
        seed (int): Seed of the random generator that draws the "N" counts, so that
            repeated runs give the same masking. Defaults to 42.

    Returns:
        None. The masked and extended partition is written to disk.

    Raises:
        AssertionError: If rows get lost, e.g. because the partition contains species
            missing from ``species_list`` or 'TIS' values other than 0 and 1.
    """
    rng = np.random.default_rng(seed)

    # Per-species results are collected and concatenated once at the end.
    species_frames = []

    masked_count = 0
    total_count = 0

    for species in species_list:
        data_species = data_partition[data_partition["Species"] == species]
        if data_species.empty:
            # Species not present in this partition: nothing to mask, extend or read.
            continue

        # TIS data: fraction of sequences containing "N" and distribution of "N" counts
        TIS_data = data_species[data_species["TIS"] == 1]
        if TIS_data.empty:
            # Avoid a division by zero; without TIS sequences there is nothing to mirror.
            N_counts = pd.Series(dtype="int64")
            TIS_masked_seqs_frac = 0.0
        else:
            N_counts = TIS_data["Sequence"].str.count("N")
            N_counts = N_counts[N_counts > 0]
            TIS_masked_seqs_frac = N_counts.shape[0] / TIS_data.shape[0]
        N_distribution = N_counts.value_counts().sort_index()

        total_count += TIS_data.shape[0]
        masked_count += N_counts.shape[0]

        # Non-TIS data; explicit copy because the sequences are modified below
        non_TIS_data = data_species[data_species["TIS"] == 0].copy()

        assert data_species.shape[0] == non_TIS_data.shape[0] + TIS_data.shape[0], \
            f"Rows of {species} with a TIS value other than 0 or 1 found."

        non_TIS_samples_to_mask = int(len(non_TIS_data) * TIS_masked_seqs_frac)
        if non_TIS_samples_to_mask > 0:
            sampled_index = non_TIS_data.sample(n=non_TIS_samples_to_mask, random_state=42).index

            # Draw the number of "N"s for each sampled row based on N_distribution
            distribution_weights = N_distribution.to_numpy()
            drawn_N_counts = rng.choice(
                N_distribution.index.to_numpy(),  # Possible counts of "N"
                size=len(sampled_index),  # Number of samples to draw
                p=distribution_weights / distribution_weights.sum(),  # Probabilities
            )

            # Replace the first n_count characters of each sampled sequence with "N"
            sampled_sequences = non_TIS_data.loc[sampled_index, "Sequence"]
            non_TIS_data.loc[sampled_index, "Sequence"] = [
                "N" * int(n_count) + sequence[int(n_count):]
                for sequence, n_count in zip(sampled_sequences, drawn_N_counts)
            ]

        data_species_masked = pd.concat([TIS_data, non_TIS_data], axis=0, ignore_index=True)

        assert data_species_masked.shape[0] == data_species.shape[0]

        # Read original datasets of the species; they serve as reference for the extension
        species_formatted = species.lower().replace(" ", "_")
        TIS_seqs_df = pd.read_csv(f"../../data/data_model_preparation/datasets/TIS/mRNA_positive_{species_formatted}.csv.gz", compression='gzip')
        non_TIS_seqs_df1 = pd.read_csv(f"../../data/data_model_preparation/datasets/non_TIS/mRNA/mRNA_negative_{species_formatted}.csv.gz", compression='gzip')
        non_TIS_seqs_df2 = pd.read_csv(f"../../data/data_model_preparation/datasets/non_TIS/intergenic/intergenic_data_{species_formatted}.csv.gz", compression='gzip')
        non_TIS_seqs_df3 = pd.read_csv(f"../../data/data_model_preparation/datasets/non_TIS/introns/introns_{species_formatted}.csv.gz", compression='gzip')

        all_seq_data = pd.concat([TIS_seqs_df, non_TIS_seqs_df1, non_TIS_seqs_df2, non_TIS_seqs_df3], axis=0, ignore_index=True)

        # Extend every sequence of the species using the reference sequences
        data_species_masked['Sequence'] = data_species_masked.apply(lambda row: extend_sequence(row, all_seq_data), axis=1)

        species_frames.append(data_species_masked)

    if species_frames:
        data_partition_masked_full = pd.concat(species_frames, axis=0, ignore_index=True)
    else:
        # No species of species_list present: keep the columns, but no rows.
        data_partition_masked_full = data_partition.iloc[0:0].copy()

    assert data_partition.shape[0] == data_partition_masked_full.shape[0], \
        "Row count changed; does the partition contain species missing from species_list?"

    masked_fraction = masked_count / total_count if total_count else 0.0
    print("Percentage of non-TIS sequences to be padded: ", round(masked_fraction, 4))

    # Save the complete result (all species, not just the last one processed) as a
    # CSV file compressed with gzip
    data_partition_masked_full.to_csv(f'../../data/data_model/datasets/data_partition_{str(number)}_masked_extended.csv.gz', compression='gzip', index=False)

In [None]:
# Mask, extend and save each of the five partitions, in partition order.
all_partitions = (
    data_partition_1,
    data_partition_2,
    data_partition_3,
    data_partition_4,
    data_partition_5,
)
for partition_number, partition in enumerate(all_partitions, start=1):
    mask_non_TIS_samples(partition, number=partition_number)

58428 232466
0.2513
58261 232465
0.2506
58553 232387
0.252
58339 232416
0.251
58638 232460
0.2522
