# Preprocessing

In [1]:
import pandas as pd

import os
import re

from sklearn import preprocessing
from Bio import SeqIO

## Data Expression

In [2]:
def read_csv_file(file_path):
    try:
        return pd.read_csv(file_path, delimiter=';', encoding='utf-8', low_memory=False)
    except pd.errors.ParserError:
        print(f"Error parsing {file_path}")
        return pd.DataFrame()
    
def read_csv_file_with_filename(file_path):
    try:
        df = pd.read_csv(file_path, delimiter=';', encoding='utf-8', low_memory=False)
        df['csv'] = os.path.basename(file_path)
    except pd.errors.ParserError:
        print(f"Error parsing {file_path}")
        return pd.DataFrame()
    return df

In [3]:
data_path = f'{os.getcwd()}/data/data_expression'
all_files = [os.path.join(data_path, file) for file in os.listdir(data_path) if file.endswith('.csv')]

df_list = [read_csv_file_with_filename(file) for file in all_files]
df = pd.concat(df_list, ignore_index=True)
df.reset_index(drop=True, inplace=True)

In [4]:
def clean_column_name(col_name):
    """
    clean and format column name by replacing spaces and special characters with underscores and converting to lowercase.
    
    Parameters:
    col_name (str): The original column name.
    
    Returns:
    str: The cleaned and formatted column name.
    """
    col_name = col_name.replace('_', ' ')
    col_name = col_name.replace('-', ' ')
    col_name = re.sub(r'[^\w\s]', ' ', col_name)
    col_name = re.sub(r'\s+', ' ', col_name)
    # Replace non-word characters (except for spaces) with nothing
    col_name = re.sub(r'[^\w\s]', '', col_name)
    # Replace spaces with underscores
    col_name = col_name.replace(' ', '_')
    # Convert to lowercase
    cleaned_name = col_name.lower()
    return cleaned_name

def rename_columns(df):
    """
    Rename all columns of the DataFrame to a more convenient format.
    
    Parameters:
    df (pd.DataFrame): The DataFrame whose columns are to be renamed.
    
    Returns:
    pd.DataFrame: DataFrame with renamed columns.
    """
    # Create a dictionary to map old column names to new column names
    new_columns = {col: clean_column_name(col) for col in df.columns}
    
    # Rename columns in the DataFrame
    df.rename(columns=new_columns, inplace=True)
    
    return df

# Rename all columns to a more convenient format
df = rename_columns(df)

In [5]:
df["species"] = df.species.apply(lambda x: "Aggregatibacter actinomycetemcomitans D7S" if "Aggregatibacter actinomycetemcomitans" in x else x)

In [6]:
df

Unnamed: 0,no,respiration,oxidase,catalase,gram_st,species,chromosome,region,new_locus_tag,old_locus_tag,...,ngon_vic_1_ge_tpm,ngon_vic_1_ge_cpm,ngon_vic_2_ge_total_counts,ngon_vic_2_ge_rpkm,ngon_vic_2_ge_tpm,ngon_vic_2_ge_cpm,ngon_vic_3_ge_total_counts,ngon_vic_3_ge_rpkm,ngon_vic_3_ge_tpm,ngon_vic_3_ge_cpm
0,91338,Facultative anaerobe,Negative,Negative,Positive,Streptococcus pneumoniae D39,NC_008533,1..1362,SPD_RS00005,SPD_0001,...,,,,,,,,,,
1,91339,Facultative anaerobe,Negative,Negative,Positive,Streptococcus pneumoniae D39,NC_008533,1521..2657,SPD_RS00010,SPD_0002,...,,,,,,,,,,
2,91340,Facultative anaerobe,Negative,Negative,Positive,Streptococcus pneumoniae D39,NC_008533,2722..2916,SPD_RS00015,SPD_0003,...,,,,,,,,,,
3,91341,Facultative anaerobe,Negative,Negative,Positive,Streptococcus pneumoniae D39,NC_008533,3000..4115,SPD_RS00020,SPD_0004,...,,,,,,,,,,
4,91342,Facultative anaerobe,Negative,Negative,Positive,Streptococcus pneumoniae D39,NC_008533,4186..4755,SPD_RS00025,SPD_0005,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105083,63547,Aerobic,Positive,Positive,Negative,Neisseria gonorrhoeae FA 1090,NC_002946,complement(2151190..2152827),,NGO2178,...,515.891930,1158.407890,5306.0,421.344928,442.591970,991.642358,3696.0,351.569874,424.146435,954.527459
105084,63548,Aerobic,Positive,Positive,Negative,Neisseria gonorrhoeae FA 1090,NC_002946,2152852..2152986,,NGO2179,...,41.724190,7.721658,42.0,40.466901,42.507514,7.849412,12.0,13.849722,16.708799,3.099115
105085,63549,Aerobic,Positive,Positive,Negative,Neisseria gonorrhoeae FA 1090,NC_002946,complement(2153000..2153221),,NGO2180,...,205.859875,62.648915,318.0,186.319611,195.715097,59.431261,275.0,193.007110,232.850661,71.021388
105086,63550,Aerobic,Positive,Positive,Negative,Neisseria gonorrhoeae FA 1090,NC_002946,complement(2153286..2153651),,NGO2181,...,364.919185,183.090857,836.0,297.104763,312.086781,156.240673,530.0,225.625599,272.202770,136.877585


In [7]:
# category_mappings = {}
# multi_categories = ["respiration", "gram_st", "chromosome", "species"]
# binary_categories = ["oxidase", "catalase"]

# intersection = set(binary_categories).intersection(set(multi_categories))

# while intersection:
#     multi_categories.remove(intersection.pop())

# le = preprocessing.LabelEncoder()
# for category in multi_categories:
#     df[category] = le.fit_transform(df[category])
#     category_mappings[category] = dict(zip(le.classes_, le.transform(le.classes_)))


# lb = preprocessing.LabelBinarizer()
# for category in binary_categories:
#     # Fit LabelBinarizer only on non-NaN values and transform them
#     non_nan_mask = df[category].notna()
#     encoded_values = lb.fit_transform(df.loc[non_nan_mask, category])

#     # Convert the result to a DataFrame to match the original index
#     encoded_df = pd.DataFrame(encoded_values, index=df[non_nan_mask].index, columns=[category])

#     # Combine the encoded values with the original DataFrame, preserving NaNs
#     df[category] = encoded_df[category]
#     category_mappings[category] = dict(zip(le.classes_, le.transform(le.classes_)))

In [8]:
# Extract and clean regions
df['is_complement'] = df['region'].str.contains('complement')
df['region_clean'] = df['region'].str.replace('complement\(|\)', '', regex=True)
df['is_circular'] = df['region_clean'].str.contains('join')
df['region_clean'] = df['region_clean'].str.replace('join\(|\)', '', regex=True)

# Initialize dictionary to store results for each species
species_complements = {}

# Group by species
grouped = df.groupby('species')

# Process each group
for species, group in grouped:
    regions = group.loc[~group['is_complement'], 'region_clean'].unique()
    complements = group.loc[group['is_complement'], 'region_clean'].unique()
    
    # It is important to check that there are no regions that are both complements and not complements!!!
    assert len([region for region in regions if region in complements]) == 0
    
    # Store results for this species
    species_complements[species] = {
        'Regions_without_complements': [region for region in regions if region not in complements],
        'Complements_only': [complement for complement in complements if complement not in regions]
    }

# Print the results
# for species, results in species_complements.items():
#     print(f"Species: {species}")
#     print("  regions without complements:", results['Regions_without_complements'])
#     print("  Complements only:", results['Complements_only'])
#     print()


  df['is_complement'] = df['region'].str.contains('complement')
  df['region_clean'] = df['region'].str.replace('complement\(|\)', '', regex=True)
  df['is_circular'] = df['region_clean'].str.contains('join')


In [9]:
def parse_region(region):
    start, *intermediate, end = region.split('..')
    if not intermediate:
        end = int(end)
        start = int(start)
        length = end - start + 1
        return start, end, length
    else:
        first_end, second_start = intermediate[0].split(',')   
        length1 = int(first_end) - int(start) + 1
        length2 = int(end) - int(second_start) + 1
        return int(start), int(end), length1 + length2

# Apply the function to the DataFrame
df[['region_start', 'region_end', 'region_length']] = df['region_clean'].apply(lambda x: pd.Series(parse_region(x)))

  df[['region_start', 'region_end', 'region_length']] = df['region_clean'].apply(lambda x: pd.Series(parse_region(x)))
  df[['region_start', 'region_end', 'region_length']] = df['region_clean'].apply(lambda x: pd.Series(parse_region(x)))
  df[['region_start', 'region_end', 'region_length']] = df['region_clean'].apply(lambda x: pd.Series(parse_region(x)))


In [10]:
df.oxidase = df.oxidase.fillna(False)
df["oxidase"] = df["oxidase"].map({"Negative": False, "Positive": True})
df["catalase"] = df["catalase"].map({"Negative": False, "Positive": True})

## Data Sequences

In [11]:
def parse_fna_file(filepath):
    return list(SeqIO.parse(filepath, "fasta"))

def parse_gff_file(filepath):
    return pd.read_csv(filepath, sep="\t", comment='#', header=None,
                       names=["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"])

def extract_gc_content(sequence):
    seq_str = str(sequence.seq)
    gc_content = (seq_str.count('G') + seq_str.count('C')) / len(seq_str)
    return gc_content

def calculate_rpkm(counts, length_kb, total_reads_million):
    return counts / (length_kb * total_reads_million)

def calculate_tpm(df):
    df['RPK'] = df['counts'] / df['length_kb']
    sum_rpk = df['RPK'].sum()
    df['TPM'] = (df['RPK'] * 1e6) / sum_rpk
    return df

In [12]:
data_path = f"{os.getcwd()}/data/data_sequences/"
fna_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.fna')]
gff_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.gff')]

all_features = []

for fna_file in fna_files:
    sequences = parse_fna_file(fna_file)
    for seq in sequences:
        gc_content = extract_gc_content(seq)
        try:
            species, sequence_type = seq.description[len(seq.id)+1:].split(", ")
        except:
            species = "Achromobacter xylosoxidans strain SOLR10"
            if seq.description.endswith("chromosome"):
                sequence_type = "chromosome"
            else:
                sequence_type = "plasmoid"
                
        file_name = os.path.basename(fna_file)[:-4]
        region, region_id = file_name.split(".")
        chromosome, chromosome_id = seq.id.split(".")
        all_features.append({"region":region, "region_id": region_id,
                             "chromosome":chromosome, "chromosome_id": chromosome_id,
                             'species': species, "sequence_type": sequence_type,
                             'gc_content': gc_content, 'length_kb': len(seq) / 1000})

features_df = pd.DataFrame(all_features)
features_df.rename(columns={"seqid": "seq_id",
                            "start": "region_start",
                            "end": "region_end"}, inplace=True)

new_columns = {col: col.lower() for col in features_df.columns}
features_df.rename(columns=new_columns, inplace=True)

for gff_file in gff_files:
    gff_data = parse_gff_file(gff_file)

In [13]:
# Function to split and parse the string
def parse_column_values(column_value):
    key_value_pairs = column_value.split(';')
    parsed_dict = {}
    for pair in key_value_pairs:
        print(pair)
        key, val = pair.split('=')
        parsed_dict[key] = val
    return parsed_dict

def split_complex_values(value):
    # Initialize the parts with None
    part1, part2, part3, part4 = None, None, None, None

    if value:
        # Split based on '-'
        if '-' in value:
            part1, remainder = value.split('-', 1)
        else:
            part1, remainder = None, value

        # Split based on ':'
        if remainder and ':' in remainder:
            part2, remainder = remainder.split(':', 1)
        else:
            part2, remainder = remainder, ""

        # Split based on '..'
        if remainder:
            part4, part5 = remainder.split('..', 1)
        else:
            part4, part5 = None, None
        
        # Split based on '.'
        if part2 and '.' in part2:
            part2, part3 = part2.split('.', 1)
        else:
            part2, part3 = part2, None

    return part1, part2, part3, part4, part5


# Applying the function to the DataFrame column
parsed_data = gff_data['attributes'].apply(parse_column_values)

# Creating a new DataFrame with expanded columns
expanded_df = pd.DataFrame(parsed_data.tolist())

# Applying the complex value split to the 'ID' column
expanded_df[['seq_type', 'chromosome', 'chromosome_id', 'region_start', 'region_end']] = expanded_df['ID'].apply(lambda x: pd.Series(split_complex_values(x)))

# Dropping the original 'ID' column after expansion
expanded_df.drop(columns=['ID'], inplace=True)

# Concatenating with the original DataFrame
gff_data = pd.concat([gff_data, expanded_df], axis=1)

ID=NC_008767.1:1..2194961
Dbxref=taxon:272831
Is_circular=true
Name=ANONYMOUS
country=USA
gbkey=Src
genome=chromosome
mol_type=genomic DNA
serotype=C
strain=FAM18
ID=gene-NMC_RS00010
Name=lpxC
gbkey=Gene
gene=lpxC
gene_biotype=protein_coding
locus_tag=NMC_RS00010
old_locus_tag=NMC0001
ID=cds-WP_002220060.1
Parent=gene-NMC_RS00010
Dbxref=GenBank:WP_002220060.1
Name=WP_002220060.1
Ontology_term=GO:0009245,GO:0008759
gbkey=CDS
gene=lpxC
go_function=UDP-3-O-[3-hydroxymyristoyl] N-acetylglucosamine deacetylase activity|0008759||IEA
go_process=lipid A biosynthetic process|0009245||IEA
inference=COORDINATES: similar to AA sequence:RefSeq:WP_002246555.1
locus_tag=NMC_RS00010
product=UDP-3-O-acyl-N-acetylglucosamine deacetylase
protein_id=WP_002220060.1
transl_table=11
ID=gene-NMC_RS00015
Name=NMC_RS00015
end_range=3610,.
gbkey=Gene
gene_biotype=pseudogene
locus_tag=NMC_RS00015
old_locus_tag=NMC0002
partial=true
pseudo=true
ID=cds-NMC_RS00015
Parent=gene-NMC_RS00015
Note=incomplete%3B partial i

In [14]:
# Create a dictionary to map old column names to new column names
new_columns = {col: col.lower() for col in gff_data.columns}

gff_data.rename(columns=new_columns, inplace=True)
    
gff_data['is_complement'] = gff_data['strand'].map({'+': True, '-': False})
gff_data.drop(columns=['strand'], inplace=True)
# Map "true" to True and "false" to False
gff_data['is_circular'] = gff_data['is_circular'].map({'true': True, 'false': False})
# Replace NaN values with False
gff_data['is_circular'] = gff_data['is_circular'].fillna(False)
gff_data["partial"] = gff_data["partial"].fillna(False)

  gff_data['is_circular'] = gff_data['is_circular'].fillna(False)


In [15]:
sparse_columns = []
for column in gff_data.columns:
    column_length = len(gff_data[column])
    missing_values = gff_data[column].isna().sum()
    if missing_values > 0.99 * column_length:  # Assuming you want to delete columns with more than 50% missing values
        print(f"Column '{column}': Length = {column_length}, Missing Values = {missing_values}. This column is sparse and will be deleted.")
        sparse_columns.append(column)

Column 'country': Length = 4453, Missing Values = 4452. This column is sparse and will be deleted.
Column 'genome': Length = 4453, Missing Values = 4452. This column is sparse and will be deleted.
Column 'mol_type': Length = 4453, Missing Values = 4452. This column is sparse and will be deleted.
Column 'serotype': Length = 4453, Missing Values = 4452. This column is sparse and will be deleted.
Column 'strain': Length = 4453, Missing Values = 4452. This column is sparse and will be deleted.
Column 'gene_synonym': Length = 4453, Missing Values = 4448. This column is sparse and will be deleted.
Column 'bound_moiety': Length = 4453, Missing Values = 4448. This column is sparse and will be deleted.
Column 'regulatory_class': Length = 4453, Missing Values = 4448. This column is sparse and will be deleted.
Column 'exception': Length = 4453, Missing Values = 4451. This column is sparse and will be deleted.
Column 'region_start': Length = 4453, Missing Values = 4447. This column is sparse and w

In [16]:
gff_data.drop(columns=sparse_columns, inplace=True)
gff_data.drop(columns=['score', 'attributes', 'dbxref'], inplace=True)
gff_data.rename(columns={'start': 'region_start', 'end': 'region_end'}, inplace=True)
gff_data[['chromosome', 'chromosome_id']] = gff_data['seqid'].str.split('.', expand=True)
gff_data.drop(columns=['seqid', 'chromosome_id', 'end_range', 'start_range'], inplace=True)

In [17]:
gff_data

Unnamed: 0,source,type,region_start,region_end,phase,is_circular,name,gbkey,gene,gene_biotype,...,protein_id,transl_table,partial,pseudo,note,go_component,anticodon,seq_type,chromosome,is_complement
0,RefSeq,region,1,2194961,.,True,ANONYMOUS,Src,,,...,,,False,,,,,,NC_008767,True
1,RefSeq,gene,1261,2184,.,False,lpxC,Gene,lpxC,protein_coding,...,,,False,,,,,gene,NC_008767,False
2,Protein Homology,CDS,1261,2184,0,False,WP_002220060.1,CDS,lpxC,,...,WP_002220060.1,11,False,,,,,cds,NC_008767,False
3,RefSeq,pseudogene,3341,3610,.,False,NMC_RS00015,Gene,,pseudogene,...,,,true,true,,,,gene,NC_008767,False
4,Protein Homology,CDS,3341,3610,0,False,,CDS,,,...,,11,true,true,incomplete%3B partial in the middle of a conti...,,,cds,NC_008767,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4448,Protein Homology,CDS,2191558,2192007,0,False,WP_002221764.1,CDS,,,...,WP_002221764.1,11,False,,,,,cds,NC_008767,False
4449,RefSeq,gene,2192048,2193319,.,False,waaA,Gene,waaA,protein_coding,...,,,False,,,,,gene,NC_008767,False
4450,Protein Homology,CDS,2192048,2193319,0,False,WP_041423265.1,CDS,waaA,,...,WP_041423265.1,11,False,,,,,cds,NC_008767,False
4451,RefSeq,gene,2193382,2194830,.,False,gnd,Gene,gnd,protein_coding,...,,,False,,,,,gene,NC_008767,False


In [18]:
features_df

Unnamed: 0,region,region_id,chromosome,chromosome_id,species,sequence_type,gc_content,length_kb
0,GCF_000021165,1,NC_011333,1,Helicobacter pylori G27,complete sequence,0.388942,1652.982
1,GCF_000021165,1,NC_011334,1,Helicobacter pylori G27 plasmid pHPG27,complete sequence,0.348719,10.031
2,GCF_000210855,2,NC_016810,1,Salmonella enterica subsp. enterica serovar Ty...,complete sequence,0.521833,4878.012
3,GCF_000210855,2,NC_017718,1,Salmonella enterica subsp. enterica serovar Ty...,complete sequence,0.501864,86.908
4,GCF_000210855,2,NC_017719,1,Salmonella enterica subsp. enterica serovar Ty...,complete sequence,0.609576,8.688
...,...,...,...,...,...,...,...,...
71,GCF_000210475,1,NC_017723,1,Escherichia coli ETEC H10407 plasmid p58,complete sequence,0.488793,5.800
72,GCF_000210475,1,NC_017722,1,Escherichia coli ETEC H10407 plasmid p666,complete sequence,0.510280,66.681
73,GCF_000210475,1,NC_017724,1,Escherichia coli ETEC H10407 plasmid p948,complete sequence,0.469129,94.797
74,GCF_000008485,1,NC_002942,5,Legionella pneumophila subsp. pneumophila str....,complete sequence,0.382695,3397.754


## Upstream

In [19]:
upstream_df = pd.read_excel(f"{os.getcwd()}/data/data_sequences_upstream/upstream_sequences.xlsx")

In [20]:
# Extract and clean regions
upstream_df['is_complement'] = upstream_df['region'].str.contains('complement')
upstream_df['region_clean'] = upstream_df['region'].str.replace('complement\(|\)', '', regex=True)
upstream_df['is_circular'] = upstream_df['region_clean'].str.contains('join')
upstream_df['region_clean'] = upstream_df['region_clean'].str.replace('join\(|\)', '', regex=True)
# Apply the function to the DataFrame
upstream_df[['region_start', 'region_end', 'region_length']] = upstream_df['region_clean'].apply(lambda x: pd.Series(parse_region(x)))

In [21]:
upstream_df.csv

0         Vibrio cholerae O1 biovar El Tor str. N16961.csv
1         Vibrio cholerae O1 biovar El Tor str. N16961.csv
2         Vibrio cholerae O1 biovar El Tor str. N16961.csv
3         Vibrio cholerae O1 biovar El Tor str. N16961.csv
4         Vibrio cholerae O1 biovar El Tor str. N16961.csv
                               ...                        
96406    Legionella pneumophila subsp. pneumophila Phil...
96407    Legionella pneumophila subsp. pneumophila Phil...
96408    Legionella pneumophila subsp. pneumophila Phil...
96409    Legionella pneumophila subsp. pneumophila Phil...
96410    Legionella pneumophila subsp. pneumophila Phil...
Name: csv, Length: 96411, dtype: object

In [22]:
common_elements = set(df['csv']).intersection(set(upstream_df['csv']))
uncommon_elements = set(df['csv']).difference(set(upstream_df['csv']))
uncommon_elements_upstream = set(upstream_df['csv']).difference(set(df['csv']))

In [23]:
len(common_elements)

27

In [24]:
uncommon_elements

{'Shigella flexneri 5a str. M90T.csv',
 'Staphylococcus\xa0aureus MRSA252.csv',
 'Staphylococcus\xa0aureus MSSA476.csv',
 'Staphylococcus\xa0epidermidis 1457.csv',
 'Yersinia pseudotuberculosis YPIII.csv'}

In [25]:
uncommon_elements_upstream

{'Staphylococcus��aureus MRSA252.csv',
 'Staphylococcus��aureus MSSA476.csv',
 'Staphylococcus��epidermidis 1457.csv'}

In [26]:
replacement_dict = {'Staphylococcus��aureus MRSA252.csv': "Staphylococcus\xa0aureus MRSA252.csv",
                    'Staphylococcus��aureus MSSA476.csv': 'Staphylococcus\xa0aureus MSSA476.csv',
                    'Staphylococcus��epidermidis 1457.csv' : 'Staphylococcus\xa0epidermidis 1457.csv'}

In [27]:
upstream_df["csv"] = upstream_df["csv"].replace(replacement_dict)
assert len(set(upstream_df['csv']).difference(set(df['csv']))) == 0

In [28]:
print(f"We don't have upstream data for only {set(df['csv']).difference(set(upstream_df['csv']))}")

We don't have upstream data for only {'Yersinia pseudotuberculosis YPIII.csv', 'Shigella flexneri 5a str. M90T.csv'}


In [29]:
upstream_df

Unnamed: 0,csv,contig,region,upstream200,is_complement,region_clean,is_circular,region_start,region_end,region_length
0,Vibrio cholerae O1 biovar El Tor str. N16961.csv,NC_002505,complement(235..402),CAGGCTCTGCAGAATACACCACCGAATACCTCTGCACTACGTTATG...,True,235..402,False,235,402,168
1,Vibrio cholerae O1 biovar El Tor str. N16961.csv,NC_002505,complement(372..806),ATCTCGATGCCCTAGAGCGAGCCGCAGAGCACTTAGCGATTGGCCA...,True,372..806,False,372,806,435
2,Vibrio cholerae O1 biovar El Tor str. N16961.csv,NC_002505,complement(816..2210),TGCCAGTCATGTTCACTTTCTTCTTCCTGTGGTTCCCATCAGGTCT...,True,816..2210,False,816,2210,1395
3,Vibrio cholerae O1 biovar El Tor str. N16961.csv,NC_002505,complement(2271..3896),TGGTTTATTAGTCCACTTATCGGCCCACGCTGCCGATTCACTCCTA...,True,2271..3896,False,2271,3896,1626
4,Vibrio cholerae O1 biovar El Tor str. N16961.csv,NC_002505,complement(3899..4156),CTTTCTCATCCTCGTTTGGGACTCGCGGTTCCTAAAAAGCAGATCA...,True,3899..4156,False,3899,4156,258
...,...,...,...,...,...,...,...,...,...,...
96406,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,complement(3393934..3395274),CAGCACCAGCTGATCCAATGCAAGCTAAGGTAATGATGTTTTTACC...,True,3393934..3395274,False,3393934,3395274,1341
96407,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,complement(3395275..3396945),ATCAGTATTTTATTAGCCCTTTGATAACACCATGTTGTCGCTATTA...,True,3395275..3396945,False,3395275,3396945,1671
96408,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,complement(3396955..3397200),AATAAGTTAGGCTATGCACGCCTTGGTTTAGCATTGTCAAAAAAAA...,True,3396955..3397200,False,3396955,3397200,246
96409,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,complement(3397167..3397355),TAAAAAGACGTCGTGCTAAAGGTCGTAAGCGTTTATCTGCCTAAGT...,True,3397167..3397355,False,3397167,3397355,189
