Visualizing  Distribution of chemicals linked by machine learning algorithm with anti-<i>Salmonella Typhimurium</i> activity of essential oils


In [13]:
import pandas as pd
from ast import literal_eval

# Define the base path and the list of file identifiers
base_path = "/Users/mariiakokina/Documents/eo_database/extra_trees/features/"
file_ids = [875, 1386]



# Initialize a dictionary to hold the structures, their corresponding chemical names, and file bit numbers
structures_to_names_and_bits = {}

# Iterate through the list of file identifiers
for file_id in file_ids:
    # Construct the filename
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    # Read the CSV file
    try:
        df = pd.read_csv(filename)
        
        # Iterate over each row in the dataframe
        for index, row in df.iterrows():
            # Safely evaluate the string representation of the list in BitInterpretations column
            tuples_list = literal_eval(row['BitInterpretations'])
            
            # Extract the fourth element (structure) from each tuple
            for tup in tuples_list:
                structure = tup[3]
                chemical_name = row['Chemical_name']
                
                # Initialize dictionary entry if not present
                if structure not in structures_to_names_and_bits:
                    structures_to_names_and_bits[structure] = {'names': set(), 'bits': []}
                
                # Add the chemical name to the set (to avoid duplicates) and bit number to the list
                structures_to_names_and_bits[structure]['names'].add(chemical_name)
                if file_id not in structures_to_names_and_bits[structure]['bits']:
                    structures_to_names_and_bits[structure]['bits'].append(file_id)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

# Prepare data for DataFrame
data_for_df = []
for structure, info in structures_to_names_and_bits.items():
    # Convert bit numbers list to a string for display
    bits_str = ', '.join(map(str, info['bits']))
    # Append tuple for each structure with its bit numbers, count of unique names, and concatenated chemical names
    data_for_df.append((structure, bits_str, len(info['names']), ', '.join(info['names'])))

# Create a DataFrame
df_structures_names_bits = pd.DataFrame(data_for_df, columns=['Substructure', 'Morgan Fingerprint Bit Number', 'Number of Occurrences', 'Chemical'])

# Save the DataFrame to a CSV file
csv_file_path = "active_against_salmoella.csv"
df_structures_names_bits.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")


Data saved to active_against_salmoella.csv


In [80]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.io import write_image
import os

# Define the base path and the list of file identifiers
base_path = "features/"
file_ids = [875, 1386]

# Initialize an empty list to hold the chemical names from all files
all_chemical_names = []

# Iterate through the list of file identifiers
for file_id in file_ids:
    # Construct the filename using the corrected pattern
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    # Read the CSV file
    try:
        df = pd.read_csv(filename)
        
        # Extract the 'Chemical_name' column and add it to the list
        all_chemical_names.extend(df['Chemical_name'].tolist())
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

# Convert the list of chemical names to a DataFrame for easy counting
chemical_names_df = pd.DataFrame(all_chemical_names, columns=['Chemical_name'])

# Count the occurrences of each chemical name
chemical_name_counts = chemical_names_df['Chemical_name'].value_counts()

# Print the counts
print(chemical_name_counts)

# Filter the counts to separate those chemicals present more than 2 times
more_than_three = chemical_name_counts[chemical_name_counts > 1]
others_count = chemical_name_counts[chemical_name_counts <= 1].sum()

# Adding the "Others" category
if others_count > 0:
    # Create a new Series for "Others" and concatenate it
    others_series = pd.Series([others_count], index=['Others'])
    more_than_three = pd.concat([more_than_three, others_series])


# Convert to DataFrame for Plotly
df_more_than_three = more_than_three.reset_index()
df_more_than_three.columns = ['Chemical', 'Count']

# Define a custom color palette
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1', '#FAD0C4', '#FF9F9A', '#FAB0E4']

# Creating the pie chart with uniformly separated slices
fig = go.Figure(data=[go.Pie(labels=df_more_than_three['Chemical'],
                             values=df_more_than_three['Count'],
                             pull=0.05,  # Uniformly pull away all slices
                             marker_colors=colors,
                             textinfo='percent+label',
                             outsidetextfont=dict(size=12),  # Ensure outside text is visible
                             textposition='outside',  # Place labels outside with lines connecting to slices
                             hoverinfo='label+percent',
                             insidetextorientation='radial',
                            )])

# Update the layout with the desired title and font settings
fig.update_layout(
    font=dict(family="EB Garamond, serif", size=12, color="black"),
    title=dict(text='Distribution of chemicals identified by the XGBoost algorithm as predictive of high anti-<i>S.</i> Typhimurium activity in essential oils',
               x=0.5,
               font=dict(family="EB Garamond, serif", size=12, color="black")),
    legend_title_font=dict(family="EB Garamond, serif", size=12, color="black"),
    legend_font=dict(family="EB Garamond, serif", size=12, color="black")
)

# Show the figure
fig.show()


file_path = "charts/chemicals_linked_with_anti_salmonella_activity.png"
fig.write_image(file_path, scale=3)

print(f"Image saved to {file_path}")


Carvacrol          17
Thymol              9
Camphor             8
Eugenol             7
Sabinene            4
Eugenyl acetate     2
Name: Chemical_name, dtype: int64


Image saved to charts/chemicals_linked_with_anti_salmonella_activity.png


Visualizing  Distribution of chemicals linked by machine learning algorithm with no anti-<i>Salmonella Typhimurium</i> activity of essential oils


In [17]:
import pandas as pd
from ast import literal_eval

# Define the base path and the list of file identifiers
base_path = "/Users/mariiakokina/Documents/eo_database/extra_trees/features/"
file_ids = [549, 794, 807, 314, 389, 1388]



# Initialize a dictionary to hold the structures, their corresponding chemical names, and file bit numbers
structures_to_names_and_bits = {}

# Iterate through the list of file identifiers
for file_id in file_ids:
    # Construct the filename
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    # Read the CSV file
    try:
        df = pd.read_csv(filename)
        
        # Iterate over each row in the dataframe
        for index, row in df.iterrows():
            # Safely evaluate the string representation of the list in BitInterpretations column
            tuples_list = literal_eval(row['BitInterpretations'])
            
            # Extract the fourth element (structure) from each tuple
            for tup in tuples_list:
                structure = tup[3]
                chemical_name = row['Chemical_name']
                
                # Initialize dictionary entry if not present
                if structure not in structures_to_names_and_bits:
                    structures_to_names_and_bits[structure] = {'names': set(), 'bits': []}
                
                # Add the chemical name to the set (to avoid duplicates) and bit number to the list
                structures_to_names_and_bits[structure]['names'].add(chemical_name)
                if file_id not in structures_to_names_and_bits[structure]['bits']:
                    structures_to_names_and_bits[structure]['bits'].append(file_id)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

# Prepare data for DataFrame
data_for_df = []
for structure, info in structures_to_names_and_bits.items():
    # Convert bit numbers list to a string for display
    bits_str = ', '.join(map(str, info['bits']))
    # Append tuple for each structure with its bit numbers, count of unique names, and concatenated chemical names
    data_for_df.append((structure, bits_str, len(info['names']), ', '.join(info['names'])))

# Create a DataFrame
df_structures_names_bits = pd.DataFrame(data_for_df, columns=['Substructure', 'Morgan Fingerprint Bit Number', 'Number of Occurrences', 'Chemical'])

# Save the DataFrame to a CSV file
csv_file_path = "INactive_against_salmoella.csv"
df_structures_names_bits.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")


Data saved to INactive_against_salmoella.csv


In [81]:
import pandas as pd
import plotly.graph_objects as go
import os

# Define the base path and the list of file identifiers
base_path = "features/"
file_ids = [549, 794, 807, 314, 389, 1388]

# Initialize an empty list to hold the chemical names from all files
all_chemical_names = []

# Iterate through the list of file identifiers
for file_id in file_ids:
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    try:
        df = pd.read_csv(filename)
        all_chemical_names.extend(df['Chemical_name'].tolist())
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

chemical_names_df = pd.DataFrame(all_chemical_names, columns=['Chemical_name'])
chemicals_to_exclude = ["Carvacrol", "Eugenol", "Thymol", "Eugenyl acetate"]
chemical_names_df = chemical_names_df[~chemical_names_df['Chemical_name'].isin(chemicals_to_exclude)]
chemical_name_counts = chemical_names_df['Chemical_name'].value_counts()
print(chemical_name_counts)
more_than_six = chemical_name_counts[chemical_name_counts > 8]
others_count = chemical_name_counts[chemical_name_counts <= 8].sum()

if others_count > 0:
    others_series = pd.Series([others_count], index=['Others'])
    more_than_six = pd.concat([more_than_six, others_series])

df_more_than_six = more_than_six.reset_index()
df_more_than_six.columns = ['Chemical', 'Count']
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1', '#FAD0C4', '#FF9F9A', '#FAB0E4']

fig = go.Figure(data=[go.Pie(labels=df_more_than_six['Chemical'],
                             values=df_more_than_six['Count'],
                             pull=0.05,
                             marker_colors=colors,
                             textinfo='percent+label',
                             textposition='outside',
                             hoverinfo='label+percent',
                             insidetextorientation='radial'
                            )])

fig.update_layout(
    font=dict(family="EB Garamond, serif", size=10, color="black"),
    title=dict(text='Distribution of chemicals identified by the XGBoost algorithm as predictive of low or no anti-<i>S.</i> Typhimurium activity in essential oils',
               x=0.5, font=dict(family="EB Garamond, serif", size=12, color="black")),
)

if not os.path.exists('charts'):
    os.makedirs('charts')

file_path = "charts/chemicals_linked_with_absence_of_anti_salmonella_activity.png"
fig.write_image(file_path, scale=3)
print(f"Image saved to {file_path}")

# Optionally, if you want to adjust the vertical position further:
fig.update_layout(legend=dict(y=1.8)) 

# Display the plot in the output
fig.show()

Linalool                   31
Limonene                   24
Hexadecanoic acid          18
alpha-Pinene               16
Camphor                    16
delta-3-Carene             15
(+)-alpha-Terpineol        14
Geranial                   12
beta-Pinene                10
Neral                      10
Linalyl acetate            10
Sabinene                    8
Geraniol                    8
Linolenic acid              6
Myrcene                     4
Methyl benzoate             4
Menthone                    4
Citronellol                 4
2-Phenylethanol             4
Carvone                     4
(E)-beta-Ocimene            3
beta-Elemene                3
Terpinen-4-ol               3
Alkanes                     3
(-)-Citronellal             3
Menthol                     2
(E,Z)-alpha-Farnesene       2
Neryl acetate               2
Borneol                     2
Terpinyl acetate            2
beta-Bisabolene (tent.)     2
Geranyl acetate             2
Isomenthone                 2
Bicycloger

Visualizing  Distribution of chemicals having mixed impact on model predictions of anti-<i>Salmonella Typhimurium</i> activity of essential oils

# Extra fixes for the paper

In [76]:
import subprocess
import os

# Define the source and target directories
source_dir = "/Users/mariiakokina/Documents/eo_database/extra_trees/visualizations"
target_dir = source_dir  # Assuming you want to save the PNG files in the same directory

# List all SVG files in the source directory
svg_files = [f for f in os.listdir(source_dir) if f.endswith('.svg')]

# Convert each SVG file to PNG using Inkscape
for svg_file in svg_files:
    source_path = os.path.join(source_dir, svg_file)
    target_path = os.path.join(target_dir, svg_file.replace('.svg', '.png'))
    
    # Construct the Inkscape command for conversion
    # Adding --export-dpi=300 for the DPI setting
    # Inkscape 1.0 and later versions handle transparency by default for PNG exports
    command = [
        '/Applications/Inkscape.app/Contents/MacOS/inkscape', 
        source_path, 
        '--export-dpi=300', 
        '--export-type=png', 
        '--export-filename', 
        target_path
    ]
    
    # Execute the command
    subprocess.run(command)

print("Conversion complete.")




Conversion complete.


In [110]:
import pandas as pd

def extract_dois_from_csv(csv_path):
    # Load the CSV file
    data = pd.read_csv(csv_path)
    
    # Check if 'Reference' column exists
    if 'Reference' not in data.columns:
        print("The column 'Reference' was not found in the CSV file.")
        return
    
    # Filter out rows where 'Reference' is 'No Reference' or empty and remove 'DOI:' prefix
    filtered_references = data['Reference'].dropna()
    filtered_references = filtered_references[filtered_references != 'No Reference']
    filtered_references = filtered_references.str.replace('DOI:', '').str.strip()
    
    # Prepend 'https://www.doi.org/' to each DOI
    doi_urls = ['https://www.doi.org/' + doi for doi in filtered_references]
    
    return doi_urls

# Example usage:
# Replace 'path_to_your_csv_file.csv' with the actual path to your CSV file
doi_list = extract_dois_from_csv('aggregated_plants.csv')
for doi in doi_list:
    print(doi)


https://www.doi.org/10.3390/f10111042
https://www.doi.org/10.3390/medicines4020030
https://www.doi.org/10.3389/fvets.2023.1188752
https://www.doi.org/10.3390/molecules191220034
https://www.doi.org/10.3390/molecules191220034
https://www.doi.org/10.3390/molecules24050900
https://www.doi.org/10.5281/zenodo.159101
https://www.doi.org/10.1007/s10722-013-0010-4
https://www.doi.org/10.1590/S0102-695X2013000600002
https://www.doi.org/10.1177/1934578X20915034
https://www.doi.org/10.4103/0975-7406.199342
https://www.doi.org/10.1111/1750-3841.12052
https://www.doi.org/10.4315/0362-028X-69.9.2274
https://www.doi.org/10.21010/ajtcam.v14i3.8
https://www.doi.org/10.3390/antibiotics10101191
https://www.doi.org/10.1080/13880200802055917
https://www.doi.org/10.1186/s12941-020-00371-1
https://www.doi.org/10.3390/antibiotics12020254
https://www.doi.org/10.1111/lam.13610. Epub 2021 Dec 1.
https://www.doi.org/10.3389/fvets.2023.1188752
https://www.doi.org/10.1007/s10068-017-0241-9
https://www.doi.org/10.339