In [1]:
# Importing necessary libraries
import os
import re
import subprocess
import sys

# Install the pinned dependencies into the interpreter that runs this kernel,
# but only when a requirements file is present in the working directory.
req_file = "requirements.txt"
if not os.path.exists(req_file):
    print("requirements.txt not found.")
else:
    print("Installing packages from requirements.txt...")
    # sys.executable makes pip target the same environment as the kernel.
    pip_command = [sys.executable, "-m", "pip", "install", "-r", req_file]
    subprocess.check_call(pip_command)
    
import yaml
import json
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from Setup.classification_utils import get_projectResults, create_classified_df, replace_wrong_classes_fromDict, remove_phrases_from_dict

Installing packages from requirements.txt...


# 1. Gemini API & Classes and Descriptions

Accessing the Gemini API Key from the config file.

In [2]:
# Read the Gemini API key from the local YAML configuration.
# The key lives in Setup/config.yaml rather than in the notebook itself.
with open("Setup/config.yaml") as config_file:
    config = yaml.safe_load(config_file)
gemini_api_key = config["gemini_api_key"]

Accessing the classes and their descriptions from the descriptions file.

In [3]:
# Read the knowledge-gap classes and their descriptions from the YAML file.
with open("Setup/descriptions.yaml") as descriptions_file:
    descriptions = yaml.safe_load(descriptions_file)
# Shallow copy of the loaded mapping (class name -> description text).
class_descriptions = dict(descriptions.items())
print(class_descriptions)

{'1.1 Characteristics of plastics - Characteristics of plastics-general': 'Includes characteristics of plastic themselves in terms of their potential toxicology or ecotoxicology (i.e., particle size, size distribution, shape, surface area, redox potential and properties, purity or identity of contaminants, catalytic activity, generation of reactive oxygen species).\n', '1.2 Characteristics of plastics - Bioaccumulation, bioconcentration and persistence': 'Includes how plastics behave with respect to their ability to bioaccumulate, concentrate or persist in organisms and/or the environment.\n', '1.3 Characteristics of plastics - Biological processes and biotic interactions with plastic': 'Includes all types of biological processes and interactions with plastics, including.\n', '1.4 Characteristics of plastics - Cellular uptake of plastic': 'Includes all types of cellular uptake of plastics (from plants, animals, bacteria, fungi, etc.).\n', '1.5 Characteristics of plastics - Degradation'

# 1. Classification

Looping through the PDFs stored in the subfolders of the 'Files' main folder. The functions used are defined in Setup/classification_utils.py.

In [4]:
# Classify the research papers found under the 'Files' folder with Gemini.
# Inputs to the classification:
#   (1) the class descriptions loaded from descriptions.yaml
#   (2) the prompt defined in classification_utils.py
# NOTE(review): the model is stated to be Gemini 1.5 Flash; it is selected
# inside classification_utils.py, not here - confirm there.
project_results = dict()  # start from an empty result before the run
project_results = get_projectResults(class_descriptions, gemini_api_key)


File 1: Processing 2022-04-08.pdf in project 'IMPTOX'...
Total Classes: 4, Classes: {'6.6 Testing - Characterization test methods': "The paper details the development and validation of a UHPLC-MS/MS method for quantifying microcystins and nodularin, including descriptions of the instrumentation, procedures, and parameters used.  Sections 2.1-2.6 comprehensively describe the method's characterization.", '6.10 Testing - Human and environmental effects and toxicity test methods': 'The study validates a method for quantifying toxins that cause adverse human health effects (hepatotoxicity). Section 2.6 details the validation process and parameters evaluated, directly relating to toxicity testing.', '6.3 Testing - Bioaccumulation and persistence test methods': 'While not explicitly a bioaccumulation/persistence study, the validated method (Sections 2.1-2.6) provides a tool to measure the presence and concentration of these toxins in drinking water, a crucial aspect of assessing their bioacc

Creating a dataframe based on the obtained project_results dictionary.

In [5]:
# Creating a DataFrame with the classified papers
classified_df = create_classified_df(project_results, class_descriptions, local_folder="Files")

# Security measure: if the API key appears anywhere in the DataFrame, replace it with '?'.
# The key is escaped so that it is matched literally instead of being interpreted as a
# regular expression, and an empty/missing key is skipped because an empty pattern
# would match at every position of every string.
if gemini_api_key:
    classified_df = classified_df.replace(re.escape(str(gemini_api_key)), '?', regex=True)

# To display the DataFrame with all columns, uncomment the two lines below.
# (Kept as comments: a bare string literal at the end of a cell is echoed as output.)
# pd.set_option('display.max_columns', None)
# classified_df

"\nDisplaying the DataFrame with all columns\npd.set_option('display.max_columns', None)\nclassified_df\n"

# 2 Error Minimization

Loading the wrong-class mapping and the list of strings to check, both of which are used to reduce classification errors.

In [6]:
# Loading the mapping of wrong classes from the YAML file
with open("Setup/wrong_classes.yaml", "r") as file:
    wrong_classes_mapping = yaml.safe_load(file)

# Loading the strings to check from the YAML file.
# The parsed document is kept under its own name so that `strings_to_check`
# only ever refers to the final list taken from the "strings_to_check" key.
with open("Setup/strings_to_check.yaml", "r") as file:
    strings_config = yaml.safe_load(file)
strings_to_check = list(strings_config["strings_to_check"])

Executing error minimization steps.

In [7]:
# Map wrongly named classes (e.g. 3) onto the correct ones (e.g. 3 Defining plastics)
replace_wrong_classes_fromDict(wrong_classes_mapping, project_results)

# Remove assigned classes based on the configured phrases.
# CAUTION: depending on how the classification is set up, this may also remove
# classes that were not meant to be removed.
# NOTE(review): removed_project_results is presumably filled by the helper with
# the removed entries - confirm in classification_utils.py.
removed_project_results = dict()
remove_phrases_from_dict(strings_to_check, removed_project_results, project_results)

1.7 Fate and behavior within an organism -> 1.7 Characteristics of plastics - Fate and behavior within an organism
1.7 Fate and behavior within an organism -> 1.7 Characteristics of plastics - Fate and behavior within an organism
1.7 Fate and behavior within an organism -> 1.7 Characteristics of plastics - Fate and behavior within an organism
3 -> 3 Defining plastics
7 -> 7 Unspecific uncertainties
1.7 Fate and behavior within an organism -> 1.7 Characteristics of plastics - Fate and behavior within an organism
1.7 Fate and behavior within an organism -> 1.7 Characteristics of plastics - Fate and behavior within an organism
1.7 Fate and behavior within an organism -> 1.7 Characteristics of plastics - Fate and behavior within an organism
Project: PlasticHeal, PDF: ph_013_22_emergnlrp3inflamm_intjmolsci.pdf, Class: 4.6 Effects assessment - Environmental effects and ecotoxicity, Justification: This class is not applicable. The research focuses solely on human health effects related to NLR

Creating the final dataset.

In [8]:
# Create a new DataFrame with the modified project_results
df = create_classified_df(project_results, class_descriptions, local_folder="Files")

# Security measure: this is the DataFrame that gets exported, so make sure the API key
# cannot leak through it. The key is escaped to be matched literally (not as a regex),
# and an empty/missing key is skipped because an empty pattern would match everywhere.
if gemini_api_key:
    df = df.replace(re.escape(str(gemini_api_key)), '?', regex=True)

# Show any class that was not yet assigned to a prespecified class
df["Other_Assigned_Classes"].unique()

array(['{}'], dtype=object)

Exporting the results to the Excel file `results.xlsx`.

In [9]:
# Write the final DataFrame to an Excel workbook in the working directory
df.to_excel(excel_writer="results.xlsx")