In [1]:
import os
import re
import pandas as pd

# Expression data section

In [2]:
def extract_species_id_and_column_name(column_name, reference_columns):
    parts = column_name.split("_")
    if "_".join(parts[2:]) in reference_columns:
        return "_".join(parts[:2]), "_".join(parts[2:])
    else:
        return parts[0], "_".join(parts[1:])


def read_xlsx_files_to_dataframe(directory, concatenated_df=None):
    # Initialize an empty dataframe if concatenated_df is not provided
    if concatenated_df is None:
        concatenated_df = pd.DataFrame()

    # Get the set of species that have already been read
    existing_species = (
        set(concatenated_df["Species"])
        if "Species" in concatenated_df.columns
        else set()
    )

    # Reference columns for species ID extraction
    reference_columns = {
        col.split("_")[0] for col in concatenated_df.columns if "TPM" in col
    }
    reference_columns.update(
        {
            "_".join(col.split("_")[:2])
            for col in concatenated_df.columns
            if "TPM" in col
        }
    )

    # Iterate over all files in the given directory
    for filename in os.listdir(directory):
        if filename.endswith(".xlsx"):
            file_path = os.path.join(directory, filename)

            # Remove the extension from the filename
            base_filename = os.path.splitext(filename)[0]

            # Read the Excel file into a pandas dataframe
            df = pd.read_excel(file_path)

            # Check if the species in the file is already read
            if "Species" in df.columns:
                species_in_file = set(df["Species"])
                if species_in_file.intersection(existing_species):
                    print(
                        f"Species {species_in_file} is skipped as it has already been read from {base_filename}."
                    )
                    continue

            # Extract 'Species' and 'Region' columns
            columns_to_extract = ["Species", "Chromosome", "Region"]

            # Identify TPM columns and extract them
            tpm_columns = [col for col in df.columns if "TPM" in col]

            # Create a new dataframe with the required columns
            extracted_data = df[columns_to_extract + tpm_columns].copy()

            # Extract species id from the TPM columns and rename them
            species_ids = []
            for col in tpm_columns:
                species_id, new_col_name = extract_species_id_and_column_name(
                    col, reference_columns
                )
                extracted_data[new_col_name] = extracted_data[col]
                extracted_data.drop(columns=[col], inplace=True)
                species_ids.append(species_id)

            try:
                if len(set(species_ids)) == 1:
                    extracted_data["Species ID"] = species_ids[0]
                else:
                    raise ValueError("Multiple species IDs found in one file.")
            except ValueError as e:
                print(f"Error processing file {filename}: {e}")
                extracted_data["Species ID"] = species_ids[0]

            # Concatenate the extracted data to the main dataframe
            if concatenated_df.empty:
                concatenated_df = extracted_data
            else:
                if set(concatenated_df.columns) == set(extracted_data.columns):
                    concatenated_df = pd.concat(
                        [concatenated_df, extracted_data], ignore_index=True
                    )
                    print(f"Data from {filename} is extracted")

                else:
                    unmatched_columns = set(
                        concatenated_df.columns
                    ).symmetric_difference(set(extracted_data.columns))
                    print(
                        f"Column names do not match across the files. Unmatched columns: {unmatched_columns}"
                    )
                    return concatenated_df

    return concatenated_df

In [3]:
# Reads expression data and creates a dataframe.
directory = f"{os.getcwd()}/data/data_expression"
final_df = read_xlsx_files_to_dataframe(directory)
print(final_df)

Data from Shigella flexneri 5a str. M90T.xlsx is extracted
Data from Escherichia coli UPEC 536.xlsx is extracted
Error processing file Streptococcus suis S10 P 17.xlsx: Multiple species IDs found in one file.
Data from Streptococcus suis S10 P 17.xlsx is extracted
Data from Acinetobacter baumannii AB5075-UW.xlsx is extracted
Data from Klebsiella pneumoniae subsp. pneumoniae MGH 78578.xlsx is extracted
Column names do not match across the files. Unmatched columns: {'G27_Tm_3 (GE) - TPM', 'G27_Oxs_2 (GE) - TPM', 'As_3 (GE) - TPM', 'Vic_3 (GE) - TPM', 'G27_Bs_2 (GE) - TPM', 'Li_1 (GE) - TPM', 'G27_Nd_1 (GE) - TPM', 'Li_3 (GE) - TPM', 'G27_Oxs_3 (GE) - TPM', 'G27_Tm_1 (GE) - TPM', 'As_2 (GE) - TPM', 'G27_Ns_1 (GE) - TPM', 'G27_Bs_1 (GE) - TPM', 'Oxs_3 (GE) - TPM', 'Sp_2 (GE) - TPM', 'G27_Li_1 (GE) - TPM', 'G27_Bs_3 (GE) - TPM', 'G27_Nd_3 (GE) - TPM', 'G27_As_3 (GE) - TPM', 'G27_Mig_2 (GE) - TPM', 'Oxs_2 (GE) - TPM', 'Mig_3 (GE) - TPM', 'G27_Li_3 (GE) - TPM', 'Vic_2 (GE) - TPM', 'Ns_3 (GE) 

In [4]:
final_df

Unnamed: 0,Species,Chromosome,Region,As_1 (GE) - TPM,As_2 (GE) - TPM,As_3 (GE) - TPM,Bs_1 (GE) - TPM,Bs_2 (GE) - TPM,Bs_3 (GE) - TPM,Ctrl_1 (GE) - TPM,...,Sp_1 (GE) - TPM,Sp_2 (GE) - TPM,Sp_3 (GE) - TPM,Tm_1 (GE) - TPM,Tm_2 (GE) - TPM,Tm_3 (GE) - TPM,Vic_1 (GE) - TPM,Vic_2 (GE) - TPM,Vic_3 (GE) - TPM,Species ID
0,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(235..402),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,8.392188,0.000000,0.000000,0.000000,0.000000,0.000000,Vibrio
1,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(372..806),393.166189,311.937046,383.829258,377.730614,273.496217,331.362458,315.921082,...,302.104930,175.084374,351.995974,385.693391,383.276940,406.241639,276.773111,178.371963,223.310216,Vibrio
2,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(816..2210),162.400143,171.281001,174.959500,261.605512,228.250776,233.104257,154.689875,...,203.637405,145.679966,220.608060,260.216487,177.158771,215.282147,162.996568,97.415072,128.631521,Vibrio
3,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(2271..3896),720.122251,825.256114,860.384381,560.678176,683.429275,731.909515,554.730307,...,276.626761,442.196629,473.234066,327.976622,676.676655,632.319698,622.953497,648.744407,603.585147,Vibrio
4,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(3899..4156),132.002479,371.588303,244.840701,150.111163,232.967463,208.688639,205.699493,...,170.299063,242.590426,153.183540,128.419995,108.867940,232.579308,223.051153,367.948661,124.616224,Vibrio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24127,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,1433..1663,2196.775545,2257.918323,2155.878280,1635.286545,2357.383900,1929.630652,2260.813993,...,12814.854657,7358.802631,10632.870032,12801.790715,10104.657626,10909.565036,9206.009642,6822.881422,4465.778497,KLEBS
24128,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,1660..1998,2181.385479,1981.081622,1666.816029,1746.367340,1849.783568,1862.333827,1403.000191,...,11653.963228,9174.082484,8283.044866,4301.461428,4804.391257,3884.423218,3057.111744,3325.739216,3063.158487,KLEBS
24129,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,complement(2287..2544),107.262956,165.195837,160.945638,268.944256,336.767008,251.302784,306.821038,...,68.457070,47.994576,67.531582,250.489120,247.530226,322.815816,209.147013,255.838672,283.460097,KLEBS
24130,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,2667..2858,59.660108,97.389121,74.907776,46.934265,44.546468,41.170402,47.496680,...,188.819912,283.767930,90.745563,208.279627,163.870817,182.534512,105.147091,141.333099,141.038356,KLEBS


In [5]:
final_df.to_csv(f"{os.getcwd()}/data/expression_final_df.csv")

In [6]:
print("Number of species: ", len(set(final_df["Species"].tolist())))
print(set(final_df["Species"].tolist()))

Number of species:  6
{'Vibrio cholerae O1 biovar El Tor str. N16961', 'Shigella flexneri 5a str. M90T', 'Escherichia coli UPEC 536', 'Klebsiella pneumoniae subsp. pneumoniae MGH 78578', 'Acinetobacter baumannii AB5075-UW', 'Streptococcus suis S10 -P1/7'}


# upstream_sequences section (Move after you get all the species)

In [7]:
# Functions to process sequence data
def preprocess_tsv(tsv_path):
    # Read the TSV file into a pandas dataframe
    tsv_df = pd.read_csv(tsv_path, sep="\t")

    # Drop the .csv extension from the 'csv' column and rename it to 'Species'
    tsv_df["Species"] = tsv_df["csv"].str.replace(".csv", "", regex=False)

    # Rename the 'contig' column to 'Chromosome'
    tsv_df.rename(columns={"contig": "Chromosome", "region": "Region"}, inplace=True)

    # Drop the original 'csv' column
    tsv_df.drop(columns=["csv"], inplace=True)

    # Replace specific species names
    tsv_df["Species"] = tsv_df["Species"].replace(
        {
            "Salmonella enterica subsp. enterica serovar TyphimuriumSL1344": "Salmonella enterica subsp. enterica serovar Typhimurium SL1344",
            "Escherichia coli EPEC 0127 H6 E2348 69": "Escherichia coli EPEC 0127:H6 E2348/69",
            "Streptococcus suis S10 P 17": "Streptococcus suis S10 -P1/7",
        }
    )

    return tsv_df


def merge_dataframes(main_df, tsv_df):
    # Check if the two datasets have the same species
    species_diff_1 = set(tsv_df["Species"]).difference(set(main_df["Species"]))
    species_diff_2 = set(main_df["Species"]).difference(set(tsv_df["Species"]))

    if species_diff_1 or species_diff_2:
        print(f"Species present in TSV but not in main dataframe: {species_diff_1}")
        print(f"Species present in main dataframe but not in TSV: {species_diff_2}")

    # Perform an inner merge on 'Species', 'Chromosome', and 'Region' columns
    # What other merge can we do here!!
    merged_df = pd.merge(
        main_df, tsv_df, on=["Species", "Chromosome", "Region"], how="inner"
    )

    return merged_df

In [8]:
# Read and preprocess the TSV file
tsv_path = f"{os.getcwd()}/data/data_sequences_upstream/upstream_sequences.tsv"
sequence_df = preprocess_tsv(tsv_path)

In [9]:
sequence_df

Unnamed: 0,Chromosome,Region,upstream200,Species
0,NC_002505,complement(235..402),CAGGCTCTGCAGAATACACCACCGAATACCTCTGCACTACGTTATG...,Vibrio cholerae O1 biovar El Tor str. N16961
1,NC_002505,complement(372..806),ATCTCGATGCCCTAGAGCGAGCCGCAGAGCACTTAGCGATTGGCCA...,Vibrio cholerae O1 biovar El Tor str. N16961
2,NC_002505,complement(816..2210),TGCCAGTCATGTTCACTTTCTTCTTCCTGTGGTTCCCATCAGGTCT...,Vibrio cholerae O1 biovar El Tor str. N16961
3,NC_002505,complement(2271..3896),TGGTTTATTAGTCCACTTATCGGCCCACGCTGCCGATTCACTCCTA...,Vibrio cholerae O1 biovar El Tor str. N16961
4,NC_002505,complement(3899..4156),CTTTCTCATCCTCGTTTGGGACTCGCGGTTCCTAAAAAGCAGATCA...,Vibrio cholerae O1 biovar El Tor str. N16961
...,...,...,...,...
96406,NC_002942,complement(3393934..3395274),CAGCACCAGCTGATCCAATGCAAGCTAAGGTAATGATGTTTTTACC...,Legionella pneumophila subsp. pneumophila Phil...
96407,NC_002942,complement(3395275..3396945),ATCAGTATTTTATTAGCCCTTTGATAACACCATGTTGTCGCTATTA...,Legionella pneumophila subsp. pneumophila Phil...
96408,NC_002942,complement(3396955..3397200),AATAAGTTAGGCTATGCACGCCTTGGTTTAGCATTGTCAAAAAAAA...,Legionella pneumophila subsp. pneumophila Phil...
96409,NC_002942,complement(3397167..3397355),TAAAAAGACGTCGTGCTAAAGGTCGTAAGCGTTTATCTGCCTAAGT...,Legionella pneumophila subsp. pneumophila Phil...


In [10]:
merged_df = merge_dataframes(final_df, sequence_df)

Species present in TSV but not in main dataframe: {'Enterococcus faecalis OG1RF', 'Legionella pneumophila subsp. pneumophila Philadelphia 1', 'Escherichia coli ETEC H10407', 'Streptococcus pneumoniae D39', 'Helicobacter pylori G27', 'Borrelia burgdorferi B31', 'Staphylococcus\xa0aureus MRSA252', 'Burkholderia pseudomallei K96243', 'Streptococcus agalactiae NEM316', 'Haemophilus influenzae 86-028NP', 'Campylobacter jejuni subsp. jejuni 81-176', 'Listeria monocytogenes EGD-e', 'Neisseria meningitidis serogroup C FAM18', 'Francisella tularensis subsp. holarctica FSC200', 'Mycobacterium tuberculosis H37Ra', 'Streptococcus pyogenes 5448', 'Aggregatibacter actinomycetemcomitans D7S-1', 'Escherichia coli EPEC 0127:H6 E2348/69', 'Staphylococcus\xa0epidermidis 1457', 'Salmonella enterica subsp. enterica serovar Typhimurium SL1344', 'Achromobacter xylosoxidans SOLR10', 'Staphylococcus\xa0aureus MSSA476', 'Helicobacter pylori J99', 'Neisseria gonorrhoeae FA 1090', 'Pseudomonas aeruginosa PAO1'}
S

In [11]:
merged_df

Unnamed: 0,Species,Chromosome,Region,As_1 (GE) - TPM,As_2 (GE) - TPM,As_3 (GE) - TPM,Bs_1 (GE) - TPM,Bs_2 (GE) - TPM,Bs_3 (GE) - TPM,Ctrl_1 (GE) - TPM,...,Sp_2 (GE) - TPM,Sp_3 (GE) - TPM,Tm_1 (GE) - TPM,Tm_2 (GE) - TPM,Tm_3 (GE) - TPM,Vic_1 (GE) - TPM,Vic_2 (GE) - TPM,Vic_3 (GE) - TPM,Species ID,upstream200
0,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(235..402),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,8.392188,0.000000,0.000000,0.000000,0.000000,0.000000,Vibrio,CAGGCTCTGCAGAATACACCACCGAATACCTCTGCACTACGTTATG...
1,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(372..806),393.166189,311.937046,383.829258,377.730614,273.496217,331.362458,315.921082,...,175.084374,351.995974,385.693391,383.276940,406.241639,276.773111,178.371963,223.310216,Vibrio,ATCTCGATGCCCTAGAGCGAGCCGCAGAGCACTTAGCGATTGGCCA...
2,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(816..2210),162.400143,171.281001,174.959500,261.605512,228.250776,233.104257,154.689875,...,145.679966,220.608060,260.216487,177.158771,215.282147,162.996568,97.415072,128.631521,Vibrio,TGCCAGTCATGTTCACTTTCTTCTTCCTGTGGTTCCCATCAGGTCT...
3,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(2271..3896),720.122251,825.256114,860.384381,560.678176,683.429275,731.909515,554.730307,...,442.196629,473.234066,327.976622,676.676655,632.319698,622.953497,648.744407,603.585147,Vibrio,TGGTTTATTAGTCCACTTATCGGCCCACGCTGCCGATTCACTCCTA...
4,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,complement(3899..4156),132.002479,371.588303,244.840701,150.111163,232.967463,208.688639,205.699493,...,242.590426,153.183540,128.419995,108.867940,232.579308,223.051153,367.948661,124.616224,Vibrio,CTTTCTCATCCTCGTTTGGGACTCGCGGTTCCTAAAAAGCAGATCA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19821,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,1433..1663,2196.775545,2257.918323,2155.878280,1635.286545,2357.383900,1929.630652,2260.813993,...,7358.802631,10632.870032,12801.790715,10104.657626,10909.565036,9206.009642,6822.881422,4465.778497,KLEBS,GTAAATCAATTTGTTATCGCCACTTAGTTAAAAAAAATTTTTTTTT...
19822,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,1660..1998,2181.385479,1981.081622,1666.816029,1746.367340,1849.783568,1862.333827,1403.000191,...,9174.082484,8283.044866,4301.461428,4804.391257,3884.423218,3057.111744,3325.739216,3063.158487,KLEBS,AATGGCGCTGCTTTGCGTATCCCGCAGCCGTTCATGAAGCAGCTTG...
19823,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,complement(2287..2544),107.262956,165.195837,160.945638,268.944256,336.767008,251.302784,306.821038,...,47.994576,67.531582,250.489120,247.530226,322.815816,209.147013,255.838672,283.460097,KLEBS,AAGCTTCTCCAGCAGGAGCAGAGACTGATGCTGGATGAATTTAGCC...
19824,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,2667..2858,59.660108,97.389121,74.907776,46.934265,44.546468,41.170402,47.496680,...,283.767930,90.745563,208.279627,163.870817,182.534512,105.147091,141.333099,141.038356,KLEBS,GATGCTTGTCAGGGGGGCGGAGCCTATGGAAAAACGGCCGTTGTGC...


In [12]:
def clean_column_name(col_name):
    """
    clean and format column name by replacing spaces and special characters with underscores and converting to lowercase.

    Parameters:
    col_name (str): The original column name.

    Returns:
    str: The cleaned and formatted column name.
    """
    col_name = col_name.replace("_", " ")
    col_name = col_name.replace("-", " ")
    col_name = re.sub(r"[^\w\s]", " ", col_name)
    col_name = re.sub(r"\s+", " ", col_name)
    # Replace non-word characters (except for spaces) with nothing
    col_name = re.sub(r"[^\w\s]", "", col_name)
    # Replace spaces with underscores
    col_name = col_name.replace(" ", "_")
    # Convert to lowercase
    cleaned_name = col_name.lower()
    return cleaned_name


def rename_columns(df):
    """
    Rename all columns of the DataFrame to a more convenient format.

    Parameters:
    df (pd.DataFrame): The DataFrame whose columns are to be renamed.

    Returns:
    pd.DataFrame: DataFrame with renamed columns.
    """
    # Create a dictionary to map old column names to new column names
    new_columns = {col: clean_column_name(col) for col in df.columns}

    # Rename columns in the DataFrame
    df.rename(columns=new_columns, inplace=True)

    return df

In [13]:
merged_df = rename_columns(merged_df)

# Extract and clean regions
merged_df["is_complement"] = merged_df["region"].str.contains("complement")
merged_df["region_clean"] = merged_df["region"].str.replace(
    "complement\(|\)", "", regex=True
)
merged_df["is_circular"] = merged_df["region_clean"].str.contains("join")
merged_df["region_clean"] = merged_df["region_clean"].str.replace(
    "join\(|\)", "", regex=True
)


def parse_region(region):
    start, *intermediate, end = region.split("..")
    if not intermediate:
        end = int(end)
        start = int(start)
        length = end - start + 1
        return start, end, length
    else:
        first_end, second_start = intermediate[0].split(",")
        length1 = int(first_end) - int(start) + 1
        length2 = int(end) - int(second_start) + 1
        return int(start), int(end), length1 + length2


merged_df[["region_start", "region_end", "region_length"]] = merged_df[
    "region_clean"
].apply(lambda x: pd.Series(parse_region(x)))
merged_df.drop(columns=["region"], inplace=True)

In [14]:
merged_df

Unnamed: 0,species,chromosome,as_1_ge_tpm,as_2_ge_tpm,as_3_ge_tpm,bs_1_ge_tpm,bs_2_ge_tpm,bs_3_ge_tpm,ctrl_1_ge_tpm,ctrl_2_ge_tpm,...,vic_2_ge_tpm,vic_3_ge_tpm,species_id,upstream200,is_complement,region_clean,is_circular,region_start,region_end,region_length
0,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,Vibrio,CAGGCTCTGCAGAATACACCACCGAATACCTCTGCACTACGTTATG...,True,235..402,False,235,402,168
1,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,393.166189,311.937046,383.829258,377.730614,273.496217,331.362458,315.921082,342.779023,...,178.371963,223.310216,Vibrio,ATCTCGATGCCCTAGAGCGAGCCGCAGAGCACTTAGCGATTGGCCA...,True,372..806,False,372,806,435
2,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,162.400143,171.281001,174.959500,261.605512,228.250776,233.104257,154.689875,184.843102,...,97.415072,128.631521,Vibrio,TGCCAGTCATGTTCACTTTCTTCTTCCTGTGGTTCCCATCAGGTCT...,True,816..2210,False,816,2210,1395
3,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,720.122251,825.256114,860.384381,560.678176,683.429275,731.909515,554.730307,697.202002,...,648.744407,603.585147,Vibrio,TGGTTTATTAGTCCACTTATCGGCCCACGCTGCCGATTCACTCCTA...,True,2271..3896,False,2271,3896,1626
4,Vibrio cholerae O1 biovar El Tor str. N16961,NC_002505,132.002479,371.588303,244.840701,150.111163,232.967463,208.688639,205.699493,245.932501,...,367.948661,124.616224,Vibrio,CTTTCTCATCCTCGTTTGGGACTCGCGGTTCCTAAAAAGCAGATCA...,True,3899..4156,False,3899,4156,258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19821,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,2196.775545,2257.918323,2155.878280,1635.286545,2357.383900,1929.630652,2260.813993,2388.506147,...,6822.881422,4465.778497,KLEBS,GTAAATCAATTTGTTATCGCCACTTAGTTAAAAAAAATTTTTTTTT...,False,1433..1663,False,1433,1663,231
19822,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,2181.385479,1981.081622,1666.816029,1746.367340,1849.783568,1862.333827,1403.000191,1621.881235,...,3325.739216,3063.158487,KLEBS,AATGGCGCTGCTTTGCGTATCCCGCAGCCGTTCATGAAGCAGCTTG...,False,1660..1998,False,1660,1998,339
19823,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,107.262956,165.195837,160.945638,268.944256,336.767008,251.302784,306.821038,372.654817,...,255.838672,283.460097,KLEBS,AAGCTTCTCCAGCAGGAGCAGAGACTGATGCTGGATGAATTTAGCC...,True,2287..2544,False,2287,2544,258
19824,Klebsiella pneumoniae subsp. pneumoniae MGH 78578,NC_009653,59.660108,97.389121,74.907776,46.934265,44.546468,41.170402,47.496680,83.273274,...,141.333099,141.038356,KLEBS,GATGCTTGTCAGGGGGGCGGAGCCTATGGAAAAACGGCCGTTGTGC...,False,2667..2858,False,2667,2858,192


In [15]:
merged_df.to_csv(f"{os.getcwd()}/data/combined_data.csv")