Visualizing  Distribution of chemicals linked by machine learning algorithm with anti-<i>Salmonella Typhimurium</i> activity of essential oils


In [93]:
import pandas as pd
from ast import literal_eval

# Define the base path and the list of file identifiers
base_path = "/Users/mariiakokina/Documents/eo_database/SOM/features/"
file_ids = [875, 1607, 1750, 1602, 1365, 1853, 1930, 741]



# Initialize a dictionary to hold the structures, their corresponding chemical names, and file bit numbers
structures_to_names_and_bits = {}

# Iterate through the list of file identifiers
for file_id in file_ids:
    # Construct the filename
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    # Read the CSV file
    try:
        df = pd.read_csv(filename)
        
        # Iterate over each row in the dataframe
        for index, row in df.iterrows():
            # Safely evaluate the string representation of the list in BitInterpretations column
            tuples_list = literal_eval(row['BitInterpretations'])
            
            # Extract the fourth element (structure) from each tuple
            for tup in tuples_list:
                structure = tup[3]
                chemical_name = row['Chemical_name']
                
                # Initialize dictionary entry if not present
                if structure not in structures_to_names_and_bits:
                    structures_to_names_and_bits[structure] = {'names': set(), 'bits': []}
                
                # Add the chemical name to the set (to avoid duplicates) and bit number to the list
                structures_to_names_and_bits[structure]['names'].add(chemical_name)
                if file_id not in structures_to_names_and_bits[structure]['bits']:
                    structures_to_names_and_bits[structure]['bits'].append(file_id)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

# Prepare data for DataFrame
data_for_df = []
for structure, info in structures_to_names_and_bits.items():
    # Convert bit numbers list to a string for display
    bits_str = ', '.join(map(str, info['bits']))
    # Append tuple for each structure with its bit numbers, count of unique names, and concatenated chemical names
    data_for_df.append((structure, bits_str, len(info['names']), ', '.join(info['names'])))

# Create a DataFrame
df_structures_names_bits = pd.DataFrame(data_for_df, columns=['Substructure', 'Morgan Fingerprint Bit Number', 'Number of Occurrences', 'Chemical'])

# Save the DataFrame to a CSV file
csv_file_path = "active_against_salmoella.csv"
df_structures_names_bits.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")


Data saved to active_against_salmoella.csv


In [104]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.io import write_image


# Define the base path and the list of file identifiers
base_path = "features/"
file_ids = [875, 1607, 1750, 1602, 1365, 1853, 1930, 741]

# Initialize an empty list to hold the chemical names from all files
all_chemical_names = []

# Iterate through the list of file identifiers
for file_id in file_ids:
    # Construct the filename using the corrected pattern
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    # Read the CSV file
    try:
        df = pd.read_csv(filename)
        
        # Extract the 'Chemical_name' column and add it to the list
        all_chemical_names.extend(df['Chemical_name'].tolist())
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

# Convert the list of chemical names to a DataFrame for easy counting
chemical_names_df = pd.DataFrame(all_chemical_names, columns=['Chemical_name'])

# Count the occurrences of each chemical name
chemical_name_counts = chemical_names_df['Chemical_name'].value_counts()

# Print the counts
print(chemical_name_counts)

# Filter the counts to separate those chemicals present more than 6 times
more_than_three = chemical_name_counts[chemical_name_counts > 6]
others_count = chemical_name_counts[chemical_name_counts <= 6].sum()

# Adding the "Others" category
if others_count > 0:
    more_than_three = more_than_three.append(pd.Series(others_count, index=['Others']))


# Convert to DataFrame for Plotly
df_more_than_three = more_than_three.reset_index()
df_more_than_three.columns = ['Chemical', 'Count']

# Define a custom color palette
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1', '#FAD0C4', '#FF9F9A', '#FAB0E4']

# Creating the pie chart with uniformly separated slices
fig = go.Figure(data=[go.Pie(labels=df_more_than_three['Chemical'],
                             values=df_more_than_three['Count'],
                             pull=0.05,  # Uniformly pull away all slices
                             marker_colors=colors,
                             textinfo='percent+label',
                             outsidetextfont=dict(size=12),  # Ensure outside text is visible
                             textposition='outside',  # Place labels outside with lines connecting to slices
                             hoverinfo='label+percent',
                             insidetextorientation='radial',
                            )])

# Update the layout with the desired title and font settings
fig.update_layout(
    font=dict(family="EB Garamond, serif", size=12, color="black"),
    title=dict(text='Distribution of chemicals linked by random forest algorithm with anti-<i>Salmonella Typhimurium</i> activity of essential oils',
               x=0.5,
               font=dict(family="EB Garamond, serif", size=12, color="black")),
    legend_title_font=dict(family="EB Garamond, serif", size=12, color="black"),
    legend_font=dict(family="EB Garamond, serif", size=12, color="black")
)

# Show the figure
fig.show()

file_path = "charts/chemicals_linked_with_anti_salmonella_activity.png"
fig.write_image(file_path, scale=3)

print(f"Image saved to {file_path}")

Carvacrol                            136
para-Cymene                           45
Thymol                                36
Eugenol                               28
2-Hydroxy-4-methoxy-acetophenone       4
Eugenyl acetate                        4
Cuminaldehyde                          3
Coumarins                              3
Furfural                               3
ar-Turmerone                           3
Methyl thymol                          2
2-Phenylethanol                        2
Methyl benzoate                        2
Methyl chavicol                        1
Diosphenol                             1
p-Cresyl methyl ether                  1
Isodiosphenol                          1
alpha-Turmerone                        1
Methyl (Z)-cinnamate                   1
Coumarin                               1
Ethyl cinnamate                        1
Ethyl (E)-4-methoxycinnamate           1
Allyl-2,3,4,5-tetramethoxybenzene      1
Cinnamyl aldehyde                      1
Anethole        


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Image saved to charts/chemicals_linked_with_anti_salmonella_activity.png


Visualizing  Distribution of chemicals linked by machine learning algorithm with no anti-<i>Salmonella Typhimurium</i> activity of essential oils


In [95]:
import pandas as pd
from ast import literal_eval

# Define the base path and the list of file identifiers
base_path = "/Users/mariiakokina/Documents/eo_database/SOM/features/"
file_ids = [1309, 1574, 383, 1950, 171, 549, 287]



# Initialize a dictionary to hold the structures, their corresponding chemical names, and file bit numbers
structures_to_names_and_bits = {}

# Iterate through the list of file identifiers
for file_id in file_ids:
    # Construct the filename
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    # Read the CSV file
    try:
        df = pd.read_csv(filename)
        
        # Iterate over each row in the dataframe
        for index, row in df.iterrows():
            # Safely evaluate the string representation of the list in BitInterpretations column
            tuples_list = literal_eval(row['BitInterpretations'])
            
            # Extract the fourth element (structure) from each tuple
            for tup in tuples_list:
                structure = tup[3]
                chemical_name = row['Chemical_name']
                
                # Initialize dictionary entry if not present
                if structure not in structures_to_names_and_bits:
                    structures_to_names_and_bits[structure] = {'names': set(), 'bits': []}
                
                # Add the chemical name to the set (to avoid duplicates) and bit number to the list
                structures_to_names_and_bits[structure]['names'].add(chemical_name)
                if file_id not in structures_to_names_and_bits[structure]['bits']:
                    structures_to_names_and_bits[structure]['bits'].append(file_id)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

# Prepare data for DataFrame
data_for_df = []
for structure, info in structures_to_names_and_bits.items():
    # Convert bit numbers list to a string for display
    bits_str = ', '.join(map(str, info['bits']))
    # Append tuple for each structure with its bit numbers, count of unique names, and concatenated chemical names
    data_for_df.append((structure, bits_str, len(info['names']), ', '.join(info['names'])))

# Create a DataFrame
df_structures_names_bits = pd.DataFrame(data_for_df, columns=['Substructure', 'Morgan Fingerprint Bit Number', 'Number of Occurrences', 'Chemical'])

# Save the DataFrame to a CSV file
csv_file_path = "INactive_against_salmoella.csv"
df_structures_names_bits.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")


Data saved to INactive_against_salmoella.csv


In [109]:
import pandas as pd
import plotly.graph_objects as go
from plotly.io import write_image

# Define the base path and the list of file identifiers
base_path = "features/"
file_ids = [1309, 1574, 383, 1950, 171, 549, 287]

# Initialize an empty list to hold the chemical names from all files
all_chemical_names = []

# Iterate through the list of file identifiers
for file_id in file_ids:
    # Construct the filename using the corrected pattern
    filename = f"{base_path}Bit_{file_id}_updated.csv"
    
    # Read the CSV file
    try:
        df = pd.read_csv(filename)
        
        # Extract the 'Chemical_name' column and add it to the list
        all_chemical_names.extend(df['Chemical_name'].tolist())
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error reading file {filename}: {e}")

# Convert the list of chemical names to a DataFrame for easy counting
chemical_names_df = pd.DataFrame(all_chemical_names, columns=['Chemical_name'])

# Count the occurrences of each chemical name
chemical_name_counts = chemical_names_df['Chemical_name'].value_counts()

# Exclude the specified chemicals from the counts
chemicals_to_exclude = ["Carvacrol", "Perilla aldehyde", "alpha-Turmerone"]
chemical_name_counts = chemical_name_counts.drop(labels=chemicals_to_exclude, errors='ignore')

# Print the adjusted counts
print(chemical_name_counts)

# Filter the counts to separate those chemicals present more than 6 times
more_than_three = chemical_name_counts[chemical_name_counts > 10]
others_count = chemical_name_counts[chemical_name_counts <= 10].sum()

# Adding the "Others" category
if others_count > 0:
    more_than_three = more_than_three.append(pd.Series(others_count, index=['Others']))

# Convert to DataFrame for Plotly
df_more_than_three = more_than_three.reset_index()
df_more_than_three.columns = ['Chemical', 'Count']

# Define a custom color palette
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1', '#FAD0C4', '#FF9F9A', '#FAB0E4']

# Creating the pie chart with uniformly separated slices
fig = go.Figure(data=[go.Pie(labels=df_more_than_three['Chemical'],
                             values=df_more_than_three['Count'],
                             pull=0.05,  # Uniformly pull away all slices
                             marker_colors=colors,
                             textinfo='percent+label',
                             outsidetextfont=dict(size=12),  # Ensure outside text is visible
                             textposition='outside',  # Place labels outside with lines connecting to slices
                             hoverinfo='label+percent',
                             insidetextorientation='radial',
                            )])

# Update the layout with the desired title and font settings
fig.update_layout(
    font=dict(family="EB Garamond, serif", size=12, color="black"),
    title=dict(text='Distribution of chemicals contributing to the random forest algorithm’s classification <br> of essential oils as inactive against <i>Salmonella Typhimurium</i>',
               x=0.5,
               font=dict(family="EB Garamond, serif", size=12, color="black")),
    legend_title_font=dict(family="EB Garamond, serif", size=12, color="black"),
    legend_font=dict(family="EB Garamond, serif", size=12, color="black"),
    legend=dict(orientation="v", yanchor="bottom", y=0.5, xanchor="right", x=1.5)  # Adjusted for better layout
)

# Show the figure
fig.show()

file_path = "charts/chemicals_linked_with_absence_of_anti_salmonella_activity.png"
fig.write_image(file_path, scale=3)

print(f"Image saved to {file_path}")


alpha-Pinene               64
Linalool                   31
beta-Pinene                30
delta-3-Carene             15
Geranial                   12
Linalyl acetate            10
Neral                      10
Geraniol                    8
alpha-Phellandrene          5
Myrcene                     4
Citronellol                 4
Menthone                    4
(-)-Citronellal             3
Isopinocamphone             3
alpha-Copaene               2
Bicyclogermacrene           2
alpha-Eudesmol              2
(E,Z)-alpha-Farnesene       2
Viridiflorene               2
Menthol                     2
Ledol                       2
beta-Bisabolene (tent.)     2
Neryl acetate               2
Isomenthone                 2
Geranyl acetate             2
Isodiosphenol               2
beta-Eudesmol               1
Cubenol                     1
Farnesyl acetate            1
Geranyllinalool             1
Geranic acid                1
Dill ether                  1
Citronellyl acetate         1
(Z)-beta-F


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Image saved to charts/chemicals_linked_with_absence_of_anti_salmonella_activity.png


Visualizing  Distribution of chemicals having mixed impact on model predictions of anti-<i>Salmonella Typhimurium</i> activity of essential oils

In [17]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

# Generate the molecule from SMILES
acetic_acid = Chem.MolFromSmiles('CC(=O)O')

# Initialize the Morgan fingerprint generator with a specific radius
mfp_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2)

# Create an AdditionalOutput object to store detailed bit information
ao = rdFingerprintGenerator.AdditionalOutput()
ao.AllocateAtomCounts()
ao.AllocateAtomToBits()
ao.AllocateBitInfoMap()

# Generate the fingerprint with the additional output to capture detailed bit information
fp = mfp_gen.GetFingerprint(acetic_acid, additionalOutput=ao)

# Retrieve and print the mapping of bit numbers to central atom and radius
bit_info_map = ao.GetBitInfoMap()
print("Bit Info Map:")
for bit, info in bit_info_map.items():
    print(f"Bit: {bit}, Info: {info}")

# Retrieve and print the number of bits each atom sets
atom_counts = ao.GetAtomCounts()
print("\nAtom Counts:")
for idx, count in enumerate(atom_counts):
    print(f"Atom Index: {idx}, Count: {count}")

# Retrieve and print which bits each atom sets
atom_to_bits = ao.GetAtomToBits()
print("\nAtom To Bits:")
for idx, bits in enumerate(atom_to_bits):
    print(f"Atom Index: {idx}, Bits: {bits}")

# Now, if you want to visualize this information, you can use RDKit's drawing functions
# Here, we'll just add the atom indices to the molecule for reference
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG

# Create a drawer with atom index options
drawer = rdMolDraw2D.MolDraw2DSVG(300, 300)
opts = drawer.drawOptions()

# Add atom indices to the drawing options
opts.addAtomIndices = True
drawer.DrawMolecule(acetic_acid)
drawer.FinishDrawing()

# Save the SVG to a file
svg_path = 'acetic_acid_atom_indices.svg'
with open(svg_path, 'w') as f:
    f.write(drawer.GetDrawingText())

svg_path


Bit Info Map:
Bit: 389, Info: ((3, 1),)
Bit: 508, Info: ((1, 1),)
Bit: 650, Info: ((2, 0),)
Bit: 807, Info: ((1, 0), (3, 0))
Bit: 1017, Info: ((0, 1),)
Bit: 1057, Info: ((0, 0),)
Bit: 1917, Info: ((2, 1),)

Atom Counts:
Atom Index: 0, Count: 2
Atom Index: 1, Count: 2
Atom Index: 2, Count: 2
Atom Index: 3, Count: 2

Atom To Bits:
Atom Index: 0, Bits: (1057, 1017)
Atom Index: 1, Bits: (807, 508)
Atom Index: 2, Bits: (650, 1917)
Atom Index: 3, Bits: (807, 389)


'acetic_acid_atom_indices.svg'

In [43]:
from rdkit import Chem
from rdkit.Chem import Draw, rdFingerprintGenerator

# Define the molecule
acetic_acid = Chem.MolFromSmiles('CC(=O)O')

# Initialize the Morgan fingerprint generator with AdditionalOutput for bit info
mfp_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
ao = rdFingerprintGenerator.AdditionalOutput()
ao.AllocateBitInfoMap()

# Generate the fingerprint and collect the bit info map
fp = mfp_gen.GetFingerprint(acetic_acid, additionalOutput=ao)
bit_info_map = ao.GetBitInfoMap()

# Specify the bits you want to visualize
bits_to_visualize = [389, 508, 650, 807, 1017, 1057, 1917]

# SVG header and footer to wrap individual SVG images
svg_header = '''<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">'''
svg_footer = '''</svg>'''

# Composite SVG content
composite_svg_content = svg_header

# Dimensions for the individual SVG images (these are arbitrary and can be adjusted)
single_width, single_height = 200, 200

for i, bit in enumerate(bits_to_visualize):
    if bit in bit_info_map:
        # Generate SVG content for the bit
        svg_content = Draw.DrawMorganBit(acetic_acid, bit, bit_info_map, useSVG=True)
        
        # Extract the SVG content without the header and footer
        svg_body = svg_content.split('<svg')[1].split('>')[1].rsplit('</svg>', 1)[0]
        
        # Calculate the position in the grid
        x_offset = (i % 4) * single_width
        y_offset = (i // 4) * single_height
        
        # Wrap the SVG body in a group element with a transform to position it
        svg_group = f'<g transform="translate({x_offset},{y_offset})">{svg_body}</g>'
        
        # Add the group to the composite SVG content
        composite_svg_content += svg_group

# Close the composite SVG
composite_svg_content += svg_footer

# Save the composite SVG to a file
svg_file_path = 'acetic_acid_all_bits_vector.svg'
with open(svg_file_path, 'w') as svg_file:
    svg_file.write(composite_svg_content)

print(f"All bits visualization saved to: {svg_file_path}")


AttributeError: 'SVG' object has no attribute 'split'

In [103]:
import subprocess
import os

# Define the source and target directories
source_dir = "/Users/mariiakokina/Documents/eo_database/SOM/visualizations"
target_dir = source_dir  # Assuming you want to save the PNG files in the same directory

# List all SVG files in the source directory
svg_files = [f for f in os.listdir(source_dir) if f.endswith('.svg')]

# Convert each SVG file to PNG using Inkscape
for svg_file in svg_files:
    source_path = os.path.join(source_dir, svg_file)
    target_path = os.path.join(target_dir, svg_file.replace('.svg', '.png'))
    
    # Construct the Inkscape command for conversion
    # Adding --export-dpi=300 for the DPI setting
    # Inkscape 1.0 and later versions handle transparency by default for PNG exports
    command = [
        '/Applications/Inkscape.app/Contents/MacOS/inkscape', 
        source_path, 
        '--export-dpi=300', 
        '--export-type=png', 
        '--export-filename', 
        target_path
    ]
    
    # Execute the command
    subprocess.run(command)

print("Conversion complete.")




Conversion complete.


In [110]:
import pandas as pd

def extract_dois_from_csv(csv_path):
    # Load the CSV file
    data = pd.read_csv(csv_path)
    
    # Check if 'Reference' column exists
    if 'Reference' not in data.columns:
        print("The column 'Reference' was not found in the CSV file.")
        return
    
    # Filter out rows where 'Reference' is 'No Reference' or empty and remove 'DOI:' prefix
    filtered_references = data['Reference'].dropna()
    filtered_references = filtered_references[filtered_references != 'No Reference']
    filtered_references = filtered_references.str.replace('DOI:', '').str.strip()
    
    # Prepend 'https://www.doi.org/' to each DOI
    doi_urls = ['https://www.doi.org/' + doi for doi in filtered_references]
    
    return doi_urls

# Example usage:
# Replace 'path_to_your_csv_file.csv' with the actual path to your CSV file
doi_list = extract_dois_from_csv('aggregated_plants.csv')
for doi in doi_list:
    print(doi)


https://www.doi.org/10.3390/f10111042
https://www.doi.org/10.3390/medicines4020030
https://www.doi.org/10.3389/fvets.2023.1188752
https://www.doi.org/10.3390/molecules191220034
https://www.doi.org/10.3390/molecules191220034
https://www.doi.org/10.3390/molecules24050900
https://www.doi.org/10.5281/zenodo.159101
https://www.doi.org/10.1007/s10722-013-0010-4
https://www.doi.org/10.1590/S0102-695X2013000600002
https://www.doi.org/10.1177/1934578X20915034
https://www.doi.org/10.4103/0975-7406.199342
https://www.doi.org/10.1111/1750-3841.12052
https://www.doi.org/10.4315/0362-028X-69.9.2274
https://www.doi.org/10.21010/ajtcam.v14i3.8
https://www.doi.org/10.3390/antibiotics10101191
https://www.doi.org/10.1080/13880200802055917
https://www.doi.org/10.1186/s12941-020-00371-1
https://www.doi.org/10.3390/antibiotics12020254
https://www.doi.org/10.1111/lam.13610. Epub 2021 Dec 1.
https://www.doi.org/10.3389/fvets.2023.1188752
https://www.doi.org/10.1007/s10068-017-0241-9
https://www.doi.org/10.339