# Create negative set for promoter mining

Last updated: January 30, 2019

Created by: Miguel A. Alcantar

This notebook creates a negative set for promoter sequences. The negative set consists largely of scrambled promoter sequences and will eventually include random segments of genomic sequence.


In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import random
random.seed(777)

In [2]:
# report the library versions this notebook was run with

print(f"Pandas: v{pd.__version__}")
print(f"Numpy: v{np.__version__}")

Pandas: v0.25.1
Numpy: v1.17.2


# Helper functions


In [3]:
def trim_promter_seq(promoter_df, new_range, dataset_to_trim):

    """
    reduce size of the promoter sequences according to newly specified range

    inputs
    promoter_df: dataframe containing promoter sequences; the columns
        'database/source', 'DNA sequence' and 'range (with respect to TSS)' are read
        (range strings are parsed as '<lower> to <upper>', e.g. '-499 to +100')
    new_range: list containing new range for promoter sequences (e.g., [-249, 50])
    dataset_to_trim: string indicating which dataset to trim
        valid inputs are: 'EPDnew', 'RegulonDB', 'DBTBS', 'bacterial inducible promoters',
        'fungal inducible promoters' (= every row whose source is none of the other four)

    outputs
    promoter_df: new dataframe with updated promoter sequences; rows of the trimmed
        dataset come first, followed by all other rows, and the index is renumbered from 0.
        Sequences whose original range does not fully contain new_range are left unchanged
        (a message is printed for each of them). The input dataframe is not modified.

    raises
    ValueError: if dataset_to_trim is not one of the valid inputs
    """

    # value of 'database/source' that identifies each explicitly named dataset
    named_sources = {
        'EPDnew': 'EPDnew',
        'RegulonDB': 'RegulonDB',
        'DBTBS': 'DBTBS',
        'bacterial inducible promoters': 'Meyer et al 2019 (PMID: 30478458)',
    }
    source_column = promoter_df['database/source']

    # split dataframe based on which dataset we need to trim
    if dataset_to_trim in named_sources:
        trim_mask = source_column == named_sources[dataset_to_trim]
    elif dataset_to_trim == 'fungal inducible promoters':
        # everything that does not come from one of the named sources
        trim_mask = ~source_column.isin(list(named_sources.values()))
    else:
        valid_inputs = list(named_sources) + ['fungal inducible promoters']
        raise ValueError('Unknown dataset_to_trim: ' + repr(dataset_to_trim) +
                         '. Valid inputs are: ' + ', '.join(valid_inputs))

    # filter first, then copy, so the column assignments below write to an independent frame
    promoter_df_to_trim = promoter_df[trim_mask].copy()
    promoter_df_not_to_trim = promoter_df[~trim_mask].copy()

    # label for trimmed rows; the upper bound always carries an explicit sign
    # ('+50', '-5') so that the label can be parsed again by a later call
    new_range_label = str(new_range[0]) + ' to ' + '{:+d}'.format(int(new_range[1]))

    new_sequence_list = []
    new_range_list = []

    # change sequence length
    # this is done by setting the lower index at: (new lower bound) - (old lower bound)
    # and upper index at: (sequence length) -( (old upper bound) - (new upper bound) )
    for seq, seq_range in zip(promoter_df_to_trim['DNA sequence'],
                              promoter_df_to_trim['range (with respect to TSS)']):
        range_orig_splt = seq_range.split(' to ')
        lower_range_orig = int(range_orig_splt[0])
        upper_range_orig = int(range_orig_splt[1])

        if new_range[0] < lower_range_orig or new_range[1] > upper_range_orig:
            # cannot trim to a window that extends past the available sequence
            print('Requested new range is larger than original sequence length.')
            new_sequence_list.append(seq)
            new_range_list.append(seq_range)
        else:
            bases_cut_from_start = int(new_range[0]) - lower_range_orig
            bases_cut_from_end = upper_range_orig - int(new_range[1])
            new_sequence_list.append(seq[bases_cut_from_start:len(seq) - bases_cut_from_end])
            new_range_list.append(new_range_label)

    promoter_df_to_trim['DNA sequence'] = new_sequence_list
    promoter_df_to_trim['range (with respect to TSS)'] = new_range_list

    promoter_df = pd.concat([promoter_df_to_trim,
                             promoter_df_not_to_trim],
                            sort=False).reset_index(drop=True)

    return(promoter_df)


In [4]:
def mutate_sequence(sequence, mutation_rate, seed = 777):
    
    """
    mutate nucleotides in a given sequence
    
    inputs
    sequence: string containing nucleotide sequence -- it is assumed that the sequence only contains 'atcg'
        (any other character that is selected for mutation is replaced by one of 'a', 'c', 't', 'g')
    mutation_rate: float indicating desired mutation rate (should be values between 0 and 1);
        each position is mutated independently with this probability
    seed: seed for the random number generator used by this call; the same
        (sequence, mutation_rate, seed) always gives the same result
    
    outputs
    string of the same length as sequence; every mutated position holds a base
        that differs from the original base
    """
    # private generator: reproducible for a given seed without resetting the
    # state of the module-level `random` functions used by callers
    rng = random.Random(seed)
    bases = 'actg'
    new_sequence = []
    for base in sequence:
        if rng.random() < mutation_rate:
            # str.replace removes the current base wherever it sits in `bases`;
            # str.strip would only remove it from the ends ('a' and 'g')
            alternatives = bases.replace(base, '')
            new_base = alternatives[rng.randint(0, len(alternatives) - 1)]
            new_sequence.append(new_base)
        else:
            new_sequence.append(base)
    return(''.join(new_sequence))

In [5]:
def generate_random_sequence(num_sequences, sequence_length, seed=777):
    
    """
    generates a specified number of unique random sequences
    
    inputs
    num_sequences: integer indicating the number of random sequences to be generated
    sequence_length: integer indicating length of sequences to be generated
    seed: seed for the random number generator used by this call; the same
        arguments always give the same sequences in the same order
    
    outputs
    random_sequences_df: dataframe with exactly num_sequences distinct sequences in
        'DNA sequence' and 'random' in 'database/source', passed through
        reorganize_promoter_df_columns before being returned
    
    raises
    ValueError: if more unique sequences are requested than exist for sequence_length
    """
    bases = ['a', 'c', 't', 'g']

    # guard against an endless loop: only 4**sequence_length distinct sequences exist
    if num_sequences > len(bases) ** sequence_length:
        raise ValueError('Cannot generate ' + str(num_sequences) + ' unique sequences of length ' +
                         str(sequence_length) + '.')

    # private generator: reproducible without resetting the global `random` state
    rng = random.Random(seed)
    unique_sequences = []
    seen_sequences = set()

    # keep drawing until enough distinct sequences were collected; a list keeps the
    # draw order (deterministic), the set is only used for the duplicate check
    while len(unique_sequences) < num_sequences:
        candidate = ''.join(rng.choice(bases) for _ in range(sequence_length))
        if candidate not in seen_sequences:
            seen_sequences.add(candidate)
            unique_sequences.append(candidate)

    random_sequences_df = pd.DataFrame({'DNA sequence': unique_sequences})
    random_sequences_df['database/source'] = 'random'
    # NOTE(review): reorganize_promoter_df_columns is not defined in this notebook --
    # it must already exist in the kernel namespace; confirm where it comes from
    random_sequences_df = reorganize_promoter_df_columns(random_sequences_df)
    return(random_sequences_df)
    
    

In [6]:
def create_permuted_set(promoter_df, number_of_splits, percentage_to_conserve, mutation_rate, seed=777):
    
    """
    create negative promoter set from existing promoter sequences
    future iterations may want to pull genomic sequences
    
    each sequence is cut into number_of_splits consecutive segments; a random subset
    of the segments is shuffled among its own positions and mutated with
    mutate_sequence, while the remaining segments stay where they are
    
    inputs
    promoter_df: dataframe containing promoter sequences (column 'DNA sequence')
    number_of_splits: integer indicating how many fragments should be created from the main sequence;
        if the sequence length is not a multiple of number_of_splits the last fragment
        takes the remaining bases
    percentage_to_conserve: float between 0 and 1 indicating what fraction of the
        fragments of each sequence should not be permuted
    mutation_rate: float between 0 and 1 passed to mutate_sequence for the permuted fragments
    seed: seed for the random number generator used by this call; the same
        arguments always give the same negative set
    
    outputs
    negative_promoter_set: copy of promoter_df with 'DNA sequence' replaced by the
        permuted/mutated sequences (same lengths as the originals) and 'class' set to 0
    
    """
    # private generator: its state is not affected by anything mutate_sequence does,
    # so every sequence gets its own permutation and every fragment its own mutations
    rng = random.Random(seed)
    segment_indices = list(range(number_of_splits))
    # number of fragments per sequence that get shuffled and mutated
    num_positions_permute = round(number_of_splits*(1-percentage_to_conserve))
    negative_sequences = []
    
    for sequence in promoter_df['DNA sequence']:
        
        # cut into exactly number_of_splits fragments; the last one absorbs the remainder
        segment_length = len(sequence) // number_of_splits
        cut_points = [segment_idx * segment_length for segment_idx in segment_indices] + [len(sequence)]
        sequence_segments = [sequence[start:stop] for start, stop in zip(cut_points[:-1], cut_points[1:])]
        
        # choose which positions are permuted; all other positions are conserved
        permuted_indices = sorted(rng.sample(segment_indices, num_positions_permute))
        permuted_segments = [sequence_segments[permuted_idx] for permuted_idx in permuted_indices]
        
        # permute segments
        rng.shuffle(permuted_segments)
        
        # mutate permuted segments, each with a freshly drawn seed so that the
        # mutated positions differ from fragment to fragment
        permuted_segments = [mutate_sequence(segment, mutation_rate, rng.getrandbits(32))
                             for segment in permuted_segments]
        
        # place segments back into sequence: start from the original layout and
        # overwrite only the permuted positions
        mutated_permuted_sequence = list(sequence_segments)
        for idx_to_insert, permuted_segment in zip(permuted_indices, permuted_segments):
            mutated_permuted_sequence[idx_to_insert] = permuted_segment

        negative_sequences.append(''.join(mutated_permuted_sequence))
        
    # create dataframe with negative promoters
    negative_sequence_df = promoter_df.copy()
    negative_sequence_df['DNA sequence'] = negative_sequences
    negative_sequence_df['class'] = 0
    return(negative_sequence_df)
    

In [7]:
# load the parsed promoter table; missing values are replaced with empty strings
promoter_df = pd.read_csv('../../data/parsed_promoter_data/20191203_promoters.csv',low_memory=False).fillna('')
# trim EPDnew promoters to -249..+50 and RegulonDB promoters to -59..+20 (relative to the TSS);
# each call returns a new frame with the trimmed dataset's rows moved to the top
promoter_df = trim_promter_seq(promoter_df, [-249,50], 'EPDnew')
promoter_df = trim_promter_seq(promoter_df, [-59,20], 'RegulonDB')

In [22]:
# EPDnew promoters -> negative set: 20 fragments per sequence, 40% of them kept in
# place, the rest shuffled and mutated at a rate of 0.2
EPDnew_promoters_df = promoter_df.loc[promoter_df['database/source'] == 'EPDnew'].copy()
EPDnew_negative_promoters_df = create_permuted_set(
    EPDnew_promoters_df,
    number_of_splits=20,
    percentage_to_conserve=0.4,
    mutation_rate=0.2,
    seed=777,
)


In [23]:
# RegulonDB promoters -> negative set: 8 fragments per sequence, 40% of them kept in
# place, the rest shuffled and mutated at a rate of 0.2
RegulonDB_promoters_df = promoter_df.loc[promoter_df['database/source'] == 'RegulonDB'].copy()
RegulonDB_negative_promoters_df = create_permuted_set(
    RegulonDB_promoters_df,
    number_of_splits=8,
    percentage_to_conserve=0.4,
    mutation_rate=0.2,
    seed=777,
)


In [None]:
# stack the original promoters on top of both negative sets and renumber the rows from 0
promoters_all_df = pd.concat(
    [promoter_df, EPDnew_negative_promoters_df, RegulonDB_negative_promoters_df],
    sort=False,
).reset_index(drop=True)

In [None]:
# write the combined promoter + negative-set table to disk
# NOTE(review): to_csv writes the row index as an unnamed first column by default
# (it shows up as 'Unnamed: 0' when read back) -- confirm downstream readers expect it
promoters_all_df.to_csv('../../data/parsed_promoter_data/promoters_all.csv')



In [11]:
# select the rows of promoter_df belonging to each organism in turn
# NOTE(review): `test` is overwritten on every iteration, so only the subset for the
# last organism visited is kept; iteration order of a set is arbitrary, so which
# organism that is may differ between runs -- looks like leftover exploration code
for org in set(promoter_df['organism']):
    test = promoter_df[promoter_df['organism'] == org]

In [12]:
# display the organism subset left in `test` by the loop in the previous cell
test

Unnamed: 0,organism,database/source,DNA sequence,regulated gene,range (with respect to TSS),sigma factor/motif,inducer/repressor,promoter,class
157914,s_cerevisiae,EPDnew,acaacagatcaggtatatacctatatttcccaggttttcctacagc...,SAD1_1,-249 to +50,nonTATA,,,1
157915,s_cerevisiae,EPDnew,atatagaaagaattctgttgttgtaattgtcataactattgagctt...,WAR1_1,-249 to +50,nonTATA,,,1
157916,s_cerevisiae,EPDnew,ccagtggctgggcatttaatgagtagaatcggggcagtttcaatct...,PFD1_1,-249 to +50,nonTATA,,,1
157917,s_cerevisiae,EPDnew,gggttcataacgaaaaaaaaaaaatcgtaagtaatggtaaccatag...,COA2_1,-249 to +50,nonTATA,,,1
157918,s_cerevisiae,EPDnew,acagaaatgaatttgtaaatgtattcactaagtttagccgttttag...,TRM12_1,-249 to +50,nonTATA,,,1
...,...,...,...,...,...,...,...,...,...
163026,s_cerevisiae,EPDnew,ataaggaactaaatcgtcaagttcgaaagcctattcccattcatgc...,APL1_1,-249 to +50,nonTATA,,,1
163027,s_cerevisiae,EPDnew,cagcaacattgcgtgccgttgttcttttgttttttttttttttttt...,RNR3_1,-249 to +50,nonTATA,,,1
163028,s_cerevisiae,EPDnew,tacatggtgagacaacattggcgcgcacgccaatgttgtccttctc...,YPR063C_1,-249 to +50,nonTATA,,,1
163029,s_cerevisiae,EPDnew,cgaagaaagcggaaaaataaaaacgaaaaaataaaaaaaaaaaaaa...,IMD2_1,-249 to +50,nonTATA,,,1
