In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)  # Show all rows
import re

In [97]:
# Create an empty list to store rows of data
# NOTE(review): `data` is not referenced by any other cell visible in this
# section — confirm it is still needed before keeping this cell.
data = []

In [23]:
# Preprocess the text to fix broken lines.
# Entries whose parenthesised content spills onto the next physical line are
# merged by the loop that follows; here the raw list is only read and split
# into one string per physical line.
with open("path.txt", "r") as file:
    raw_text = file.read()
lines = raw_text.strip().split('\n')
# Rebuild logical entries from physical lines.
#   fixed_lines: one string per logical entry, whitespace-stripped.
#   temp_line:   buffer for an entry whose "(...)" part spans several lines;
#                empty string means "no entry is currently being accumulated".
fixed_lines = []
temp_line = ""

for line in lines:
    stripped = line.strip()
    if "(" in line and ")" not in line:
        # Opens a parenthetical that is not closed on the same line:
        # start (or keep) accumulating.
        temp_line += " " + stripped
    elif temp_line:
        # Continuation of an open parenthetical.
        temp_line += " " + stripped
        if ")" in line:
            # This line closes the parenthetical: emit the merged entry.
            fixed_lines.append(temp_line.strip())
            temp_line = ""
    else:
        # Ordinary, self-contained line.
        fixed_lines.append(stripped)

# An entry whose "(" is never closed before the end of the input would
# otherwise stay in the buffer and be silently dropped; emit it as-is so no
# species disappears from the table.
if temp_line:
    fixed_lines.append(temp_line.strip())
    temp_line = ""

# Step 2: turn each merged line into a {"species", "alternative"} record.
# Everything before the first "(" is the primary name; everything after it,
# minus any trailing ")" characters, is the raw list of alternative names.
# Lines without "(" get an empty "alternative".
processed_data = []
for line in fixed_lines:
    # partition() yields (line, "", "") when there is no "(", so the same
    # expression covers both the "has alternatives" and the plain case.
    name_part, _paren, alt_part = line.partition('(')
    processed_data.append({
        "species": name_part.strip(),
        "alternative": alt_part.rstrip(')').strip(),
    })

# One row per species entry
bact = pd.DataFrame(processed_data)

In [24]:
# Spread the comma-separated "alternative" string over one column per name
# (alternative_1, alternative_2, ...). Rows with fewer names than the widest
# row are padded with empty strings.
alternative_lists = bact['alternative'].str.split(',')
max_splits = alternative_lists.map(len).max()
alternative_names = [f'alternative_{n}' for n in range(1, max_splits + 1)]
split_columns = pd.DataFrame(alternative_lists.tolist(), columns=alternative_names).fillna('')

# Swap the raw "alternative" column for the expanded ones, then trim the
# whitespace left around every string cell by the comma split.
bact = pd.concat([bact.drop(columns=['alternative']), split_columns], axis=1)
bact = bact.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)

In [22]:
# Display the DataFrame (species name plus one column per alternative name)
bact

Unnamed: 0,species,alternative_1,alternative_2,alternative_3,alternative_4,alternative_5
0,Abiotrophia defectiva,Streptococcus defectivus,,,,
1,Acholeplasma hippikon,,,,,
2,Acholeplasma laidlawii,,,,,
3,Acholeplasma oculi,,,,,
4,Achromobacter denitrificans,Achromobacter xylosoxidans subsp. denitrificans,Alcaligenes denitrificans,Alcaligenes denitrificans subsp. denitrificans,Alcaligenes xylosoxidans subsp. denitrificans,
5,Achromobacter insolitus,,,,,
6,Achromobacter ruhlandii,Alcaligenes ruhlandii,,,,
7,Achromobacter spanius,,,,,
8,Achromobacter xylosoxidans,Achromobacter xylosoxidans subsp. xylosoxidans,Alcaligenes denitrificans subsp. xylosoxidans,Alcaligenes xylosoxidans subsp. xylosoxidans,Alcaligenes xylosoxidans,
9,Acidaminococcus fermentans,,,,,


## Comments on changes made to the text after human-supervised analysis of GTDB and NCBI
- *Actinobacillus pseudopneumoniae* is considered *A. lignieresii*; ERASED from list
- *Actinomadura latina* added alternative name *Spirillospora latina*
- *Actinomadura madurae* added alternative name *Spirillospora madurae*
- *Yersinia pseudotuberculosis* is included in the list, but GTDB considers that it belongs to *Yersinia pestis* (its genome is `GCF_900637475.1`); ERASED from list
- *Treponema pertenue* is considered *Treponema pallidum*; ERASED from list

## Read data from GTDB

In [6]:
# Load the GTDB taxonomy table: a headerless, tab-separated file whose two
# columns are named here 'GCF' and 'taxonomy'.
gtdb = pd.read_csv(
    'gtdb/bac120_taxonomy.tsv',
    sep='\t',
    header=None,
    names=['GCF', 'taxonomy'],
)

In [7]:
def extract_species(taxonomy):
    """Return the last ';'-separated field of a taxonomy string.

    The field is stripped of surrounding whitespace and otherwise returned
    unchanged: any rank prefix it carries (such as ``s__``) is NOT removed
    here. A string without ';' is returned whole (stripped).
    """
    _head, _sep, last_field = taxonomy.rpartition(';')
    return last_field.strip()

# Build the 'species' column: take the last field of each taxonomy string,
# then remove every literal 's__' occurrence from it.
gtdb['species'] = (
    gtdb['taxonomy']
    .apply(extract_species)
    .str.replace('s__', '', regex=False)
)

# Distinct species names found in gtdb, in order of first appearance
taxonomy_species = gtdb['species'].drop_duplicates().tolist()

In [28]:
# Look up the GTDB row for a single accession
gtdb.loc[gtdb['GCF'] == "RS_GCF_900115095.1"]

Unnamed: 0,GCF,taxonomy,species
404761,RS_GCF_900115095.1,d__Bacteria;p__Actinomycetota;c__Actinomycetes...,Spirillospora madurae


## Match both sets

In [9]:
def check_in_taxonomy(row, taxonomy_species):
    """Report which name column of ``row`` is an exact member of ``taxonomy_species``.

    The 'species' column is checked first, then every ``alternative_<N>``
    column present in the row, in increasing N. All alternative columns are
    considered (not only the first three), so a row whose only match sits in
    e.g. ``alternative_4`` is no longer reported as unmatched.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one DataFrame row)
        Must provide ``row['species']`` and ``row.keys()``.
    taxonomy_species : container of str
        Names to test membership against.

    Returns
    -------
    str
        The name of the first column whose value is in ``taxonomy_species``,
        or ``'no match'``.
    """
    prefix = 'alternative_'
    alternative_columns = sorted(
        (
            col for col in row.keys()
            if isinstance(col, str)
            and col.startswith(prefix)
            and col[len(prefix):].isdecimal()
        ),
        key=lambda col: int(col[len(prefix):]),
    )

    for column in ['species', *alternative_columns]:
        value = row[column]
        # Empty cells are padding for rows with fewer alternatives; they must
        # never count as a match, even if the taxonomy list contains ''.
        if isinstance(value, str) and not value:
            continue
        if value in taxonomy_species:
            return column
    return 'no match'

In [10]:
def find_match_with_column(row, taxonomy_species):
    """Find the first taxonomy name contained in one of the row's name columns.

    Columns are scanned in order: 'species', then every ``alternative_<N>``
    column present in the row by increasing N (all of them, not just the
    first three). For each non-empty string cell, the taxa are tried in the
    order given, and a taxon matches when it occurs anywhere inside the cell
    value, ignoring case.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one DataFrame row)
        Must provide ``row['species']`` and ``row.keys()``.
    taxonomy_species : iterable of str
        Candidate taxonomy names.

    Returns
    -------
    dict
        ``{'match': <taxon>, 'column': <column name>}`` for the first hit, or
        ``{'match': None, 'column': None}`` when nothing matches.
    """
    prefix = 'alternative_'
    alternative_columns = sorted(
        (
            col for col in row.keys()
            if isinstance(col, str)
            and col.startswith(prefix)
            and col[len(prefix):].isdecimal()
        ),
        key=lambda col: int(col[len(prefix):]),
    )

    for column in ['species', *alternative_columns]:
        value = row[column]
        # Skip missing values and the empty-string padding of unused
        # alternative columns: nothing can match them, and scanning the whole
        # taxonomy list for each one is pure overhead.
        if not isinstance(value, str) or not value:
            continue
        for taxon in taxonomy_species:
            # An empty pattern would match every value, so ignore blank taxa.
            if not isinstance(taxon, str) or not taxon:
                continue
            if re.search(re.escape(taxon), value, re.IGNORECASE):  # case-insensitive partial match
                return {'match': taxon, 'column': column}
    return {'match': None, 'column': None}

In [11]:
# Run the matcher on every row of bact; each result is a dict with the keys
# 'match' and 'column'.
match_records = bact.apply(find_match_with_column, axis=1, args=(taxonomy_species,))

# Unpack the dicts into two plain columns (None where a result is not a dict)
bact['matching_taxon'] = match_records.apply(
    lambda record: record['match'] if isinstance(record, dict) else None
)
bact['matching_column'] = match_records.apply(
    lambda record: record['column'] if isinstance(record, dict) else None
)

In [12]:
# Shape of the subset of rows whose match was found in the 'species' column
bact.loc[bact['matching_column'].eq('species')].shape

(802, 8)

In [13]:
# Show bact, which at this point also carries the matching_taxon and
# matching_column columns
bact

Unnamed: 0,species,alternative_1,alternative_2,alternative_3,alternative_4,alternative_5,matching_taxon,matching_column
0,Abiotrophia defectiva,Streptococcus defectivus,,,,,Abiotrophia defectiva,species
1,Acholeplasma hippikon,,,,,,Acholeplasma hippikon,species
2,Acholeplasma laidlawii,,,,,,Acholeplasma laidlawii,species
3,Acholeplasma oculi,,,,,,Acholeplasma oculi,species
4,Achromobacter denitrificans,Achromobacter xylosoxidans subsp. denitrificans,Alcaligenes denitrificans,Alcaligenes denitrificans subsp. denitrificans,Alcaligenes xylosoxidans subsp. denitrificans,,Achromobacter denitrificans,species
5,Achromobacter insolitus,,,,,,Achromobacter insolitus,species
6,Achromobacter ruhlandii,Alcaligenes ruhlandii,,,,,Achromobacter ruhlandii,species
7,Achromobacter spanius,,,,,,Achromobacter spanius,species
8,Achromobacter xylosoxidans,Achromobacter xylosoxidans subsp. xylosoxidans,Alcaligenes denitrificans subsp. xylosoxidans,Alcaligenes xylosoxidans subsp. xylosoxidans,Alcaligenes xylosoxidans,,Achromobacter xylosoxidans,species
9,Acidaminococcus fermentans,,,,,,Acidaminococcus fermentans,species


In [14]:
# Rows for which no matching column was recorded
bact.loc[bact['matching_column'].isna()]

Unnamed: 0,species,alternative_1,alternative_2,alternative_3,alternative_4,alternative_5,matching_taxon,matching_column
31,Actinobacillus pleuropneumoniae,Haemophilus pleuropneumoniae,,,,,,
34,Actinomadura latina,,,,,,,
35,Actinomadura madurae,,,,,,,
36,Actinomyces cardiffensis,,,,,,,
37,Actinomyces europaeus,,,,,,,
40,Actinomyces hongkongensis,,,,,,,
43,Actinomyces meyeri,Actinobacterium meyeri,,,,,,
44,Actinomyces neuii subsp. neuii,,,,,,,
45,Actinomyces odontolyticus,,,,,,,
48,Actinomyces radingae,,,,,,,
