### Data assembling


In [1]:
import os
import json

# Directory that holds the raw JSON files to be merged
directory = "./initial_data/"

# The merged result is written into the same directory that is scanned below
output_file = os.path.join(directory, "combined_data.json")

# JSON files in `directory` that this notebook itself produces. They also end
# in ".json", so without this filter a re-run would read earlier results back
# in and duplicate every record in the combined output.
derived_files = {"combined_data.json", "combined_data_unique.json"}

# List to store the combined data
combined_data = []

# os.listdir returns names in arbitrary order; sorting makes the merge order
# (and therefore which duplicate is seen first downstream) reproducible.
for file_name in sorted(os.listdir(directory)):
    if not file_name.endswith(".json") or file_name in derived_files:
        continue

    # Construct the file path
    file_path = os.path.join(directory, file_name)

    # Read data from the JSON file
    with open(file_path, "r") as file:
        data = json.load(file)

    # Append the records of this file to the combined list
    combined_data.extend(data)

# Write the combined data to a new JSON file in the same directory
with open(output_file, "w") as file:
    json.dump(combined_data, file, indent=4)

print(f"Combined data has been written to {output_file}.")

Combined data has been written to ./partial_data/combined_data.json.


In [17]:
import json
from collections import OrderedDict

# Read every record from the combined file
with open("initial_data/combined_data.json", "r") as src:
    data = json.load(src)

# Maps plant name -> record; insertion order is the order of first appearance
unique_plants = OrderedDict()

# setdefault only stores a record when its name is not present yet, so the
# first record seen for each name is kept and later repeats are ignored.
# Records without a "name" key all share the key None.
for record in data:
    unique_plants.setdefault(record.get("name"), record)

# Back to a plain list of records, one per distinct name
unique_data = list(unique_plants.values())

# Persist the de-duplicated records next to the combined file
with open("initial_data/combined_data_unique.json", "w") as dst:
    json.dump(unique_data, dst, indent=4)

# Report how many records were read and how many distinct names remain
print(f"Total number of plants: {len(data)}")
print(f"Number of unique plants: {len(unique_data)}")


Total number of plants: 4106
Number of unique plants: 4100


In [25]:
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import defaultdict

# Directory path where the JSON files are located
directory = "./initial_data/"

# Load the de-duplicated plant records
with open(os.path.join(directory, "combined_data_unique.json"), "r") as file:
    data_list = json.load(file)

# chemical name -> {plant column label -> average value}
chemicals_data = defaultdict(dict)

# Column labels that already hold data for some plant record
used_column_names = set()

# Iterate over the JSON data with progress logging
for entry in tqdm(data_list, desc="Processing data"):
    name = entry.get("name")
    name_botanical = entry.get("name_botanical")
    compounds = entry.get("compound")
    if not name or not compounds:
        continue

    # Choose the column label once per plant record, unique across records.
    # Choosing it per compound (as before) let a compound listed twice in one
    # record create a phantom "<name> 2" plant column, and would have spread a
    # same-named record over several columns.
    column_name = f"{name} - {name_botanical}"
    suffix = 1
    while column_name in used_column_names:
        suffix += 1
        column_name = f"{name} {suffix} - {name_botanical}"

    for compound in compounds:
        chemical_name = compound.get("name")
        if not chemical_name:
            continue

        # "pivot" may be absent or explicitly null; treat both as "no value"
        pivot = compound.get("pivot") or {}
        percentage_average = pivot.get("percentage_average")
        value = float(percentage_average) if percentage_average else 0.0

        # If a compound is listed more than once in a record, keep the first value
        chemicals_data[chemical_name].setdefault(column_name, value)
        used_column_names.add(column_name)

# Create a DataFrame from the chemicals_data dictionary (rows: chemicals, columns: plants)
df = pd.DataFrame.from_dict(chemicals_data, orient="index")

# Sort the columns (plant species) alphabetically and treat absent compounds as 0
df_sorted = df.reindex(sorted(df.columns), axis=1).fillna(0)

# Save the DataFrame as a CSV file
output_path = "initial_data/chemicals_data.csv"
df_sorted.to_csv(output_path)

# Print the DataFrame
print(df_sorted)


Processing data: 100%|█████| 4100/4100 [00:00<00:00, 54996.71it/s]


                         Abies alba conifer 1 - Abies alba Mill., fam. Pinaceae  \
Limonene                                                            0.5474        
beta-Pinene                                                         0.0050        
alpha-Pinene                                                        0.0736        
Bornyl acetate                                                      0.0096        
Camphene                                                            0.1478        
...                                                                    ...        
2-Butyl isothiocyanate                                              0.0000        
Ethyl hematommate                                                   0.0000        
Ethyl chlorohematommate                                             0.0000        
Chloroatranorin                                                     0.0000        
Atranorin                                                           0.0000        

   

### Data cleaning


In [10]:

import pandas as pd

# Read the chemicals-by-plants table and flip it so that plants (samples) are
# rows and chemicals (features) are columns. Anything that cannot be parsed as
# a number becomes NaN and is then replaced by 0.
df = (
    pd.read_csv('./initial_data/chemicals_data.csv', index_col=0)
    .transpose()
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Size of the table before any duplicate handling
num_plants_original, num_chemicals_original = df.shape
print("Number of plants before removing duplicates:", num_plants_original)
print("Number of chemicals before removing duplicates:", num_chemicals_original)

# Flag rows / columns whose values are identical to another row / column.
# NOTE(review): keep=False flags *every* member of a duplicate group, so the
# filtering below drops all copies rather than keeping one representative --
# confirm this is intended.
duplicated_rows = df.duplicated(keep=False)
duplicated_columns = df.T.duplicated(keep=False)

num_duplicates_rows = duplicated_rows.sum()
num_duplicates_columns = duplicated_columns.sum()

# Show which plants are affected before they are removed
duplicate_rows = df[duplicated_rows]
print("Names of duplicate rows:")
print(duplicate_rows.index.tolist())

# Both masks were computed on the unfiltered table, so they can be applied together
df = df.loc[~duplicated_rows, ~duplicated_columns]

# Size of the table after the flagged rows / columns are gone
num_plants_cleaned, num_chemicals_cleaned = df.shape
print("Number of unique plants (rows) left:", num_plants_cleaned)
print("Number of unique chemicals (columns) left:", num_chemicals_cleaned)

# Make sure the plant labels are strings
df.index = df.index.astype(str)

# Masking with the out-of-range condition leaves NaN everywhere else, so the
# rows that survive dropna(how="all") have at least one value below 0 or above 1
out_of_range = (df < 0) | (df > 1)
invalid_plants = df[out_of_range].dropna(how="all").index
print("Plants with values outside the range of 0 to 1:")
print(invalid_plants)

# Name the index so that it is written as the 'Plant_Species' column
df.index.name = 'Plant_Species'

# Save cleaned data
df.to_csv('./initial_data/chemicals_data_clean.csv')

# Report how many rows / columns were flagged as duplicates
print("Number of duplicate rows:", num_duplicates_rows)
print("Number of duplicate columns:", num_duplicates_columns)


Number of plants before removing duplicates: 4099
Number of chemicals before removing duplicates: 4199
Names of duplicate rows:
['Patchouli (Indonesia, Sumatra) 2 - Pogostemon cablin Benth. (P. patchouli Pellet), fam. Lamiaceae (Labiatae)', 'Patchouli (Indonesia, Sumatra) 3 - Pogostemon cablin Benth., fam. Lamiaceae (Labiatae)', 'Peppermint (Italy)  6b - Mentha piperita L., fam. Lamiaceae (Labiatae)', 'Peppermint (USA)  4 - Mentha piperita L. cultivar Kennewick, fam. Lamiaceae (Labiatae)', 'Peppermint (USA)  5 - Mentha piperita L. cultivar Michigan, fam. Lamiaceae (Labiatae)', 'Peppermint (USA)  6 - Mentha piperita L., fam. Lamiaceae (Labiatae)', 'Peppermint 4 - Mentha piperita L., fam. Lamiaceae (Labiatae)', 'Peppermint 5a - Mentha piperita L. cultivar Kennewick, fam. Lamiaceae (Labiatae)', 'Peppermint 5b - Mentha piperita L. cultivar Michigan, fam. Lamiaceae (Labiatae)', 'Peppermint headspace - Mentha piperita L., fam. Lamiaceae (Labiatae)', 'Sassafras (Brazil) 2 - Sassafras albidum 

This extra step removes entries with a repeated scientific name, since some plants were tested multiple times.

In [74]:
import pandas as pd
import re

# Load the chemicals_data_clean.csv file from the notebook-relative data
# directory (the same location the cleaning step writes it to) instead of a
# machine-specific absolute path.
chemicals_df = pd.read_csv('./initial_data/chemicals_data_clean.csv')

def extract_word_pairs(name):
    """Return the set of two-word pairs found in ``name``.

    A pair is a capitalised word (one uppercase letter followed by at least
    one lowercase letter), a single whitespace character, and a word made of
    lowercase letters only. Repeated pairs collapse because a set is returned.
    """
    pair_pattern = r'\b[A-Z][a-z]+\s[a-z]+\b'
    return {match.group(0) for match in re.finditer(pair_pattern, name)}

# Collect the word pairs found in each plant label
chemicals_df['word_pairs'] = chemicals_df['Plant_Species'].apply(extract_word_pairs)

# Walk the rows in order: a row is marked for removal as soon as one of its
# pairs has already been seen on an earlier row; pairs not seen before are
# recorded so that later rows can be matched against them.
to_remove = set()
seen_pairs = set()
for i, row in chemicals_df.iterrows():
    for pair in row['word_pairs']:
        if pair in seen_pairs:
            to_remove.add(i)
        else:
            seen_pairs.add(pair)

# Index with a sorted list rather than the set itself: set indexers in .loc
# are deprecated (see the warning in this cell's output) and rejected by newer
# pandas versions. Sorting also makes the order of the report deterministic.
rows_to_remove = sorted(to_remove)
removed_plants = chemicals_df.loc[rows_to_remove, 'Plant_Species'].tolist()

# Remove the marked rows and the helper column
before_deduplication = len(chemicals_df)
chemicals_df = chemicals_df.drop(index=rows_to_remove).drop(columns=['word_pairs'])
after_deduplication = len(chemicals_df)

# Save next to the other intermediate files; the following cell reads this
# file through the same relative path.
chemicals_df.to_csv('./initial_data/chemicals_data_clean_unique.csv', index=False)

print(f"Number of plant species before deduplication: {before_deduplication}")
print(f"Number of plant species after deduplication: {after_deduplication}")
print(f"Removed plant species: {', '.join(removed_plants)}")


  removed_plants = chemicals_df.loc[to_remove]['Plant_Species'].tolist()


Number of plant species before deduplication: 4087
Number of plant species after deduplication: 1173
Removed plant species: Abies alba conifer 2 - Abies alba Mill., fam. Pinaceae, Abies alba from cones - Abies alba Mill., fam. Pinaceae, Abies alba needle (Montenegro) - Abies alba Mill., fam. Pinaceae, Abies alba needle (Serbia) - Abies alba Mill., fam. Pinaceae, Abies cephalonica needle (Greece) - Abies cephalonica Loud., fam. Pinaceae, Abies holophylla (China) 2 - Abies holophylla Maxim., fam. Pinaceae, Abies sibirica 2 - Abies sibirica Ledeb., fam. Pinaceae, Abies sibirica 3 - Abies sibirica Ledeb., fam. Pinaceae, Achillea biebersteinii (Turkey) 1 - Achillea biebersteinii Afan., fam. Asteraceae (Compositae), Achillea biebersteinii (Turkey) 2 - Achillea biebersteinii Afan., fam. Asteraceae (Compositae), Achillea ligustica leaf (Greece) - Achillea ligustica All., fam. Asteraceae (Compositae), Achillea wilhelmsii (Turkey) - Achillea wilhelmsii C. Koch (A. santolina Auct. Mult.), fam. As

No value should fall outside the [0, 1] range. The following code detects and prints any rows that contain values outside this range:

In [75]:
import pandas as pd

# Columns that hold labels rather than measurements; dropped only if present
columns_to_drop = ['Plant_Species']

# Read the de-duplicated table with its first column as the index, discard the
# label columns, and coerce every remaining column to numbers (unparseable
# entries become NaN).
df = (
    pd.read_csv('./initial_data/chemicals_data_clean_unique.csv', index_col=0)
    .drop(columns=columns_to_drop, errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
)

# True wherever a value is above 1 or below 0 (NaN compares False on both sides)
outside_range = (df > 1) | (df < 0)

# Keep the rows that have at least one offending value
outside_range_rows = df.loc[outside_range.any(axis=1)]

if outside_range_rows.empty:
    print("\nAll values are within the range [0, 1].")
else:
    print("\nRows with values outside the range [0, 1]:")
    print(outside_range_rows)



All values are within the range [0, 1].
