# Description

Takes the TSS positions from Thomasan.csv and makes a new csv of the promoter + ITR from sequence.fasta. The final sequence is in LaFleur's format (see LaFleur_average_lengths.ipynb) with an upstream length of 52bp, and a downstream length of 21bp (total length 73). If the "other strand" is the coding strand (Strand = '-'), the promoter direction changes and we take the reverse complement respective to the TSS.

In [1]:
import pandas as pd

In [10]:
df = pd.read_excel('../data/Thomasan_2015/Thomasan_unprocessed.xlsx', sheet_name=1, skiprows=2)
df.head()

  warn(msg)


Unnamed: 0,Pos,Strand,detCount,Condition,detected,enriched,stepHeight,stepFactor,enrichmentFactor,classCount,...,Secondary,Internal,Antisense,Automated,Manual,Putative sRNA,Putative asRNA,Comment,Sequence -50 nt upstream + TSS (51nt),Overlap with RegulonDB
0,38,+,1,M63_0.4,0,0,,,,1,...,1,0,0,1,0,0,0,,,X
1,38,+,1,LB_0.4,0,0,,,,1,...,1,0,0,1,0,0,0,,,X
2,38,+,1,LB_2.0,1,1,8.43,>100,9.63,1,...,1,0,0,1,0,0,0,,,X
3,113,+,3,M63_0.4,1,1,1.8,4,25.41,1,...,1,0,0,1,0,0,0,,ATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTT...,
4,113,+,3,LB_0.4,1,1,1.96,2.33,13.27,1,...,1,0,0,1,0,0,0,,ATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTT...,


In [13]:
# Include only necessary columns
df = df[['Pos','Strand','Condition','detected','enriched','stepHeight','stepFactor','enrichmentFactor','Sequence -50 nt upstream + TSS (51nt)']].copy()

# Replace NaN values and clean data
df.fillna({'stepHeight': 0, 'stepFactor': 0, 'enrichmentFactor': 0}, inplace=True)
df.replace({'stepFactor': {'>100': '100'}, 'enrichmentFactor': {'>100': '100'}}, inplace=True)
df['stepHeight'] = df['stepHeight'].astype(float)
df['stepFactor'] = df['stepFactor'].astype(float)
df['enrichmentFactor'] = df['enrichmentFactor'].astype(float)


In [14]:
duplicates = df.duplicated(subset=['Pos', 'Strand', 'Sequence -50 nt upstream + TSS (51nt)', 'Condition'], keep=False)

if duplicates.any():

    df = df.groupby(['Pos', 'Strand', 'Sequence -50 nt upstream + TSS (51nt)', 'Condition'], as_index=False).agg({
        'detected': 'mean',
        'enriched': 'mean',
        'stepHeight': 'mean',
        'stepFactor': 'mean',
        'enrichmentFactor': 'mean'
    })

df = df.pivot(index=['Pos', 'Strand', 'Sequence -50 nt upstream + TSS (51nt)'], 
                    columns='Condition', 
                    values=['detected', 'enriched', 'stepHeight', 'stepFactor', 'enrichmentFactor'])

df.columns = [f"{condition}_{metric}" for metric, condition in df.columns]
df.reset_index(inplace=True)
df.head()


Unnamed: 0,Pos,Strand,Sequence -50 nt upstream + TSS (51nt),LB_0.4_detected,LB_2.0_detected,M63_0.4_detected,LB_0.4_enriched,LB_2.0_enriched,M63_0.4_enriched,LB_0.4_stepHeight,LB_2.0_stepHeight,M63_0.4_stepHeight,LB_0.4_stepFactor,LB_2.0_stepFactor,M63_0.4_stepFactor,LB_0.4_enrichmentFactor,LB_2.0_enrichmentFactor,M63_0.4_enrichmentFactor
0,113,+,ATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTT...,1.0,1.0,1.0,1.0,1.0,1.0,1.96,10.27,1.8,2.33,6.6,4.0,13.27,11.27,25.41
1,122,-,TAATTTTTATCTGTCTGTGCGCTATGCCTATATTGGTTAAAGTATT...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,3.13,1.35,0.0,6.5,3.33,0.0,12.54,2.86
2,148,+,AATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATAT...,1.0,1.0,1.0,1.0,1.0,1.0,64.53,154.97,831.14,10.9,10.96,16.97,2.62,5.86,2.9
3,412,+,CATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCT...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,24.14,0.0,0.0,41.0,0.0,0.0,12.49,0.0
4,1176,-,GCACCAATGAGCGTACCTGGTGCTTGAGGATTTCCGGTATTTTTAA...,1.0,1.0,1.0,1.0,1.0,1.0,3.19,9.78,5.01,100.0,100.0,100.0,2.95,5.27,3.41


In [16]:
# Read the e coli genome

with open('../data/Thomasan_2015/sequence.fasta') as f:
    sequence = f.readlines()
    sequence = [x.strip() for x in sequence]
    sequence = ''.join(sequence[1:])
f.close()

In [18]:
# Function to get the subsequence in the circular genome
def get_circular_sequence(sequence, start, end):
    seq_len = len(sequence)
    if start < 0: 
        return sequence[start % seq_len:] + sequence[:end % seq_len]
    elif end > seq_len:
        return sequence[start % seq_len:] + sequence[:end % seq_len]
    else:
        return sequence[start:end]

def reverse_complement(seq):
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement[base] for base in reversed(seq))

In [19]:
# Find the sequences in the e coli genome based on the Thomasan data
# If the strand is -, we need to return the reverse complement of the sequence and flip the up/downstream values

up = 52
down = 21

for row in df.itertuples():
    if row.Strand == '+':
        start = row.Pos - up
        end = row.Pos + down
        df.at[row.Index, 'Sequence'] = get_circular_sequence(sequence, start, end)
    else:
        start = row.Pos - down
        end = row.Pos + up
        rev_comp_seq = get_circular_sequence(sequence, start, end)
        df.at[row.Index, 'Sequence'] = reverse_complement(rev_comp_seq)

df = df[['Sequence', 'LB_0.4_detected', 'LB_2.0_detected',
         'M63_0.4_detected', 'LB_0.4_enriched', 'LB_2.0_enriched',
         'M63_0.4_enriched', 'LB_0.4_stepHeight', 'LB_2.0_stepHeight',
         'M63_0.4_stepHeight', 'LB_0.4_stepFactor', 'LB_2.0_stepFactor',
         'M63_0.4_stepFactor', 'LB_0.4_enrichmentFactor',
         'LB_2.0_enrichmentFactor', 'M63_0.4_enrichmentFactor', ]]

df.head(10)

Unnamed: 0,Sequence,LB_0.4_detected,LB_2.0_detected,M63_0.4_detected,LB_0.4_enriched,LB_2.0_enriched,M63_0.4_enriched,LB_0.4_stepHeight,LB_2.0_stepHeight,M63_0.4_stepHeight,LB_0.4_stepFactor,LB_2.0_stepFactor,M63_0.4_stepFactor,LB_0.4_enrichmentFactor,LB_2.0_enrichmentFactor,M63_0.4_enrichmentFactor
0,GATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATT...,1.0,1.0,1.0,1.0,1.0,1.0,1.96,10.27,1.8,2.33,6.6,4.0,13.27,11.27,25.41
1,TGTAATTTTTATCTGTCTGTGCGCTATGCCTATATTGGTTAAAGTA...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,3.13,1.35,0.0,6.5,3.33,0.0,12.54,2.86
2,AAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATA...,1.0,1.0,1.0,1.0,1.0,1.0,64.53,154.97,831.14,10.9,10.96,16.97,2.62,5.86,2.9
3,ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTC...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,24.14,0.0,0.0,41.0,0.0,0.0,12.49,0.0
4,TGGCACCAATGAGCGTACCTGGTGCTTGAGGATTTCCGGTATTTTT...,1.0,1.0,1.0,1.0,1.0,1.0,3.19,9.78,5.01,100.0,100.0,100.0,2.95,5.27,3.41
5,GCGTGGTCGCCTCGGAGAAACTCATGCCTTCGTCTAACTTGCCGAA...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,3.88,2.31,0.0,2.5,4.0,0.0,0.83,2.06
6,GGACCCGCGAGATGATCTTTCTGGTATGGATGTGGCGCGTAAACTA...,1.0,1.0,0.0,1.0,1.0,0.0,10.06,11.55,3.08,7.83,5.33,2.23,3.57,9.33,0.38
7,CGCGTGGCGAAGGCCCGTGATGAAGGAAAAGTTTTGCGCTATGTTG...,1.0,1.0,1.0,0.0,1.0,0.0,10.79,240.04,16.24,100.0,100.0,15.56,1.09,2.93,1.11
8,CGCAAAGCTGACCTGCTCGTTGTGATCTTTCAGATTGTAGAGTTTC...,1.0,1.0,1.0,1.0,1.0,1.0,0.98,2.61,1.0,3.0,100.0,3.5,5.69,7.52,2.96
9,CGGCGGGCGCACGAGTACTGGAAAACTAAATGAAACTCTACAATCT...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.71,0.0,0.0,100.0,0.0,0.0,5.11,0.0


In [27]:
print(f'Num columns: {len(df.columns)}')
print()
for col in df.columns:
    print(col)

Num columns: 16

Sequence
LB_0.4_detected
LB_2.0_detected
M63_0.4_detected
LB_0.4_enriched
LB_2.0_enriched
M63_0.4_enriched
LB_0.4_stepHeight
LB_2.0_stepHeight
M63_0.4_stepHeight
LB_0.4_stepFactor
LB_2.0_stepFactor
M63_0.4_stepFactor
LB_0.4_enrichmentFactor
LB_2.0_enrichmentFactor
M63_0.4_enrichmentFactor


In [None]:
# All sequences have at least one condition detected

filtered_sequences = df.loc[
    (df['LB_0.4_detected'] != 1) & 
    (df['LB_2.0_detected'] != 1) & 
    (df['M63_0.4_detected'] != 1), 
    'Sequence'
]

filtered_sequences.tolist()


[]

In [None]:
# All sequences have at least one condition enriched

filtered_sequences = df.loc[
    (df['LB_0.4_enriched'] != 1) & 
    (df['LB_2.0_enriched'] != 1) & 
    (df['M63_0.4_enriched'] != 1), 
    'Sequence'
]

filtered_sequences.tolist()

[]

In [24]:
df.to_csv('../data/Thomasan_2015/Thomasan.csv', index=False)