In [1]:
# This notebook requires Python 3.12.3 or higher

import sys
required_version = (3, 12, 3)
if sys.version_info < required_version:
    raise Exception(f"This notebook requires Python {required_version} or higher!")
else:
    print(f"Python version {sys.version} is compatible.")

Python version 3.12.3 (main, Sep 11 2024, 14:17:37) [GCC 13.2.0] is compatible.


In [2]:
%pip install nltk matplotlib

Note: you may need to restart the kernel to use updated packages.


# A- Information Retrieval 1


In [3]:
# I just put all of the class in the A task here because I reuse them

import re
import os
from difflib import SequenceMatcher


class InformationRetrieval:
    def __init__(self, abstracts_path: str) -> None:
        self.abstracts: list[str] = []
        self.load_abstracts(abstracts_path)

    def load_abstracts(self, filename: str) -> list[str]:
        with open(filename, 'r') as file:
            self.abstracts = [line.lower() for line in file.readlines()]
        return self.abstracts

    # Using regex boundary \b to matches keyword to satisfy exact matching.
    # Because, for example, "AI" will matches "pair", but that is wrong.
    #                       but we cannot split all the words because then multiple words such as "machine learning" won't be matched
    @staticmethod
    def query_match_abstract(query: str, abstract: str) -> bool:
        pattern = re.compile(r'\b' + re.escape(query) + r'\b')
        return bool(pattern.search(abstract))

    def _query_match(self, query: str) -> bool:
        return any(self.query_match_abstract(query, abstract) for abstract in self.abstracts)

    def query_match(self, query: str) -> bool:
        return self._query_match(query.lower())

    def perform_query_matching(self, queries: list[str]) -> None:
        for query in queries:
            matches = self.query_match(query)
            print(f"Query: '{query}' - Match in abstracts: {1 if matches else 0}")


class InformationRetrievalIndexed():
    def __init__(self, keywords_path: str, abstracts_folder_path: str) -> None:
        self.abstracts: list[str] = []
        self.keywords: list[list[str]] = []
        self.keywords = self.load_keywords(keywords_path)
        self.abstracts = self.load_abstracts(abstracts_folder_path)
        self.inverted_index: dict[str, list[int]] = {}
        self.inverted_index_built = False

    def load_keywords(self, filename: str) -> list[list[str]]:
        with open(filename, 'r') as file:
            self.keywords = [line.strip().lower().split(", ") for line in file.readlines()]
        return self.keywords

    def load_abstracts(self, folder_path: str) -> list[str]:
        abstracts = []
        # List all files in the directory (assuming files are named A1.txt, A2.txt, ..., A20.txt)
        files = [f for f in os.listdir(folder_path) if f.startswith('A')]

        for file in files:
            with open(os.path.join(folder_path, file), 'r') as f:
                content = f.read().lower()
                abstracts.append(content)
        return abstracts

    def build_inverted_index(self) -> None:
        # Build list of unique keywords, just set comprehension looping through all keywords
        set_keywords = {
            keyword
            for keyword_list in self.keywords
            for keyword in keyword_list
        }

        # Loop through each abstract and check for keywords
        self.inverted_index = {keyword: [] for keyword in set_keywords}
        for idx, abstract in enumerate(self.abstracts):
            for kw in self.inverted_index:
                # Reuse the query_match_abstract method because if we just do plain "if kw in abstract", it will get bugged: "AI" matches "Pair"
                if InformationRetrieval.query_match_abstract(kw, abstract):
                    self.inverted_index[kw].append(idx)

        self.inverted_index_built = True

    def export_inverted_index(self, filename: str) -> None:
        with open(filename, 'w') as f:
            for keyword, idxs in self.inverted_index.items():
                f.write(f"{keyword}: {idxs}\n")

    def import_inverted_index(self, filename: str) -> None:
        self.inverted_index = {}
        with open(filename, 'r') as f:
            for line in f:
                keyword, idxs = line.strip().split(': ')
                if (idxs == '[]'):
                    self.inverted_index[keyword] = []
                else:
                    self.inverted_index[keyword] = list(map(int, idxs.strip('[]\n').split(', ')))
        self.inverted_index_built = True

    def _query_match(self, query: str) -> list[int]:
        return self.inverted_index[query] if query in self.inverted_index and bool(self.inverted_index[query]) else []

    def query_match(self, query: str) -> list[int]:
        if not self.inverted_index_built:
            self.build_inverted_index()
        return self._query_match(query.lower())

    def perform_query_matching(self, queries: list[str]) -> None:
        for query in queries:
            matches = self.query_match(query)
            transformed_matches = [f"A{match}" for match in matches]
            if matches:
                print(f"Query: '{query}' - Match in abstracts: {transformed_matches}")
            else:
                print(f"Query: '{query}' - No match in abstract")

    def display_inverted_index(self) -> None:
        if not self.inverted_index_built:
            self.build_inverted_index()
        for keyword, idxs in self.inverted_index.items():
            print(f"Keyword: '{keyword}' - Found in files with indexes: {idxs}")


class InformationRetrievalIndexedRelaxed(InformationRetrievalIndexed):
    def __init__(self, keywords_path: str, abstracts_folder: str, threshold: float = 0.9) -> None:
        super().__init__(keywords_path, abstracts_folder)
        self.threshold = threshold

    @staticmethod
    def similarity(a: str, b: str) -> float:
        return SequenceMatcher(None, a, b).ratio()

    def _scan_abstract_for_match(self, query: str, abstract: str) -> tuple[bool, float, str]:
        """This function scan abstract for matches using word similarity with a sliding window
        Note: the sliding window and the matching is done on a word basis, not a string basis

        For example, the abstract saying something like "Machine learning is a subset of AI", then a sliding window
        of 2 will be "Machine learning" and "learning is", "is a", "a subset", "subset of", "of AI"

        The length of the window is determined by the number of words in the query.

        Args:
            query (str): search query
            abstract (str): abstract to scan

        Returns:
            tuple[bool, float, str]: (if matched, similarity percentage, word that matched)
        """
        query_length = len(query.split())  # Count words in the query

        # Tokenize the abstract and scan with a sliding window approach
        abstract_tokens = abstract.split()
        for i in range(len(abstract_tokens) - query_length + 1):
            # Get the substring of abstract that matches the length of the query (in words)
            substring = ' '.join(abstract_tokens[i:i + query_length])

            # Compare similarity between the query and this substring
            similarity = self.similarity(query, substring)
            if similarity >= self.threshold:
                return (True, similarity, substring)

        return (False, 0.0, '')

    def _query_match(self, query: str) -> list[tuple[int, float]]:
        query_lower = query.lower()
        matches = []
        for idx, abstract in enumerate(self.abstracts):
            matched, score, word = self._scan_abstract_for_match(query_lower, abstract)
            if matched:
                matches.append((idx, score, word))

        return matches

    def perform_query_matching(self, queries: list[str]) -> None:
        for query in queries:
            matches = self.query_match(query)
            transformed_matches = [f"{word} in A{match} at {score * 100:.2f}%" for match, score, word in matches]
            if matches:
                print(f"Query: '{query}' - Match {transformed_matches}")
            else:
                print(f"Query: '{query}' - No match in abstract")


# Task C Information Retrieval

## Task C-1: Consider an academic journal of your own choice and collect 30 abstracts using a method of your own (can be a simple manual copy-and-paste operation or webcrawling) in a single file using the same query term (E.g., Go to webscience or sciencedirect or Springer and input a query T of your choice). Save the first 30 results as documents, where each document has four metadata: Title, List of Authors, Abstract text, List of keywords.

In [4]:
# Get the abstract from arXiv. Many papers does not have keyword section, so I used advanced search and find "keyword" in the abstract.
# I know the instruction asked to write the content to a file, but I will write the content here so that it can be reproduced easily.

# The content of the abstracts and keywords are as follows:
# abstract.txt and keywords.txt contains the content of the abstracts and keywords, separated by newlines.

# This cell will write the content to the files. So SKIP running this cell if you want to use the content you provided.

query = "Advancement in Physic and Biology Science by Machine learning"


abstracts_raw = """
Machine learning (ML) has drawn tremendous interest for its capacity to extract useful information that may be overlooked with conventional analysis techniques and for its versatility in a wide range of research domains, including biomedical sensing and imaging. In this perspective, we provide an overview focused on the uses and benefits of ML in areas of plasmonics in biology. ML methodologies for processing data from plasmonic biosensing and imaging systems by supervised and unsupervised learning to achieve enhanced detection and quantification of target analytes are described. In addition, deep learning-based approaches to improve the design of plasmonic structures are presented. Data analysis based on ML for classification, regression, and clustering by dimension reduction is presented. We also discuss ML-based prediction and design of plasmonic structures and sensors using discriminative and generative models. Challenges and the outlook for ML for plasmonics in biology are summarized. Based on these insights, we are convinced that ML can add value to plasmonics techniques in biological sensing and imaging applications to make them more powerful with improved detection and resolution.
Background and objective Mechanistic-based Model simulations (MM) are an effective approach commonly employed, for research and learning purposes, to better investigate and understand the inherent behavior of biological systems. Recent advancements in modern technologies and the large availability of omics data allowed the application of Machine Learning (ML) techniques to different research fields, including systems biology. However, the availability of information regarding the analyzed biological context, sufficient experimental data, as well as the degree of computational complexity, represent some of the issues that both MMs and ML techniques could present individually. For this reason, recently, several studies suggest overcoming or significantly reducing these drawbacks by combining the above-mentioned two methods. In the wake of the growing interest in this hybrid analysis approach, with the present review, we want to systematically investigate the studies available in the scientific literature in which both MMs and ML have been combined to explain biological processes at genomics, proteomics, and metabolomics levels, or the behavior of entire cellular populations. Methods Elsevier Scopus®, Clarivate Web of Science™ and National Library of Medicine PubMed® databases were enquired using the queries reported in Table 1, resulting in 350 scientific articles. Results Only 14 of the 350 documents returned by the comprehensive search conducted on the three major online databases met our search criteria, i.e. present a hybrid approach consisting of the synergistic combination of MMs and ML to treat a particular aspect of systems biology. Conclusions Despite the recent interest in this methodology, from a careful analysis of the selected papers, it emerged how examples of integration between MMs and ML are already present in systems biology, highlighting the great potential of this hybrid approach to both at micro and macro biological scales.
This bibliometric research explores the global evolution of machine learning applications in medical and healthcare research for 3 decades (1994 to 2023). The study applies data mining techniques to a comprehensive dataset of published articles related to machine learning applications in the medical and healthcare sectors. The data extraction process includes the retrieval of relevant information from the source sources such as journals, books, and conference proceedings. An analysis of the extracted data is then conducted to identify the trends in the machine learning applications in medical and healthcare research. The Results revealed the publications published and indexed in the Scopus and PubMed database over the last 30 years. Bibliometric Analysis revealed that funding played a more significant role in publication productivity compared to collaboration (co-authorships), particularly at the country level. Hotspots analysis revealed three core research themes on MLHC research hence demonstrating the importance of machine learning applications to medical and healthcare research. Further, the study showed that the MLHC research landscape has largely focused on ML applications to tackle various issues ranging from chronic medical challenges (e.g., cardiological diseases) to patient data security. The findings of this research may be useful to policy makers and practitioners in the medical and healthcare sectors and to global research endeavours in the field. Future studies could include addressing issues such as growing ethical considerations, integration, and practical applications in wearable technology, IoT, and smart healthcare systems.
Determining, understanding, and predicting the so-called structure-property relation is an important task in many scientific disciplines, such as chemistry, biology, meteorology, physics, engineering, and materials science. Structure refers to the spatial distribution of, e.g., substances, material, or matter in general, while property is a resulting characteristic that usually depends in a non-trivial way on spatial details of the structure. Traditionally, forward simulations models have been used for such tasks. Recently, several machine learning algorithms have been applied in these scientific fields to enhance and accelerate simulation models or as surrogate models. In this work, we develop and investigate the applications of six machine learning techniques based on two different datasets from the domain of materials science: data from a two-dimensional Ising model for predicting the formation of magnetic domains and data representing the evolution of dual-phase microstructures from the Cahn-Hilliard model. We analyze the accuracy and robustness of all models and elucidate the reasons for the differences in their performances. The impact of including domain knowledge through tailored features is studied, and general recommendations based on the availability and quality of training data are derived from this.
Drug discovery and development is a time-consuming process that involves identifying, designing, and testing new drugs to address critical medical needs. In recent years, machine learning (ML) has played a vital role in technological advancements and has shown promising results in various drug discovery and development stages. ML can be categorized into supervised, unsupervised, semi-supervised, and reinforcement learning. Supervised learning is the most used category, helping organizations solve several real-world problems. This study presents a comprehensive survey of supervised learning algorithms in drug design and development, focusing on their learning process and succinct mathematical formulations, which are lacking in the literature. Additionally, the study discusses widely encountered challenges in applying supervised learning for drug discovery and potential solutions. This study will be beneficial to researchers and practitioners in the pharmaceutical industry as it provides a simplified yet comprehensive review of the main concepts, algorithms, challenges, and prospects in supervised learning.
Additive manufacturing (AM) has undergone significant development over the past decades, resulting in vast amounts of data that carry valuable information. Numerous research studies have been conducted to extract insights from AM data and utilize it for optimizing various aspects such as the manufacturing process, supply chain, and real-time monitoring. Data integration into proposed digital twin frameworks and the application of machine learning techniques is expected to play pivotal roles in advancing AM in the future. In this paper, we provide an overview of machine learning and digital twin-assisted AM. On one hand, we discuss the research domain and highlight the machine-learning methods utilized in this field, including material analysis, design optimization, process parameter optimization, defect detection and monitoring, and sustainability. On the other hand, we examine the status of digital twin-assisted AM from the current research status to the technical approach and offer insights into future developments and perspectives in this area. This review paper aims to examine present research and development in the convergence of big data, machine learning, and digital twin-assisted AM. Although there are numerous review papers on machine learning for additive manufacturing and others on digital twins for AM, no existing paper has considered how these concepts are intrinsically connected and interrelated. Our paper is the first to integrate the three concepts big data, machine learning, and digital twins and propose a cohesive framework for how they can work together to improve the efficiency, accuracy, and sustainability of AM processes. By exploring latest advancements and applications within these domains, our objective is to emphasize the potential advantages and future possibilities associated with integration of these technologies in AM.
Microfluidic devices are increasingly widespread in the literature, being applied to numerous exciting applications, from chemical research to Point-of-Care devices, passing through drug development and clinical scenarios. Setting up these microenvironments, however, introduces the necessity of locally controlling the variables involved in the phenomena under investigation. For this reason, the literature has deeply explored the possibility of introducing sensing elements to investigate the physical quantities and the biochemical concentration inside microfluidic devices. Biosensors, particularly, are well known for their high accuracy, selectivity, and responsiveness. However, their signals could be challenging to interpret and must be carefully analysed to carry out the correct information. In addition, proper data analysis has been demonstrated even to increase biosensors' mentioned qualities. To this regard, machine learning algorithms are undoubtedly among the most suitable approaches to undertake this job, automatically learning from data and highlighting biosensor signals' characteristics at best. Interestingly, it was also demonstrated to benefit microfluidic devices themselves, in a new paradigm that the literature is starting to name “intelligent microfluidics”, ideally closing this benefic interaction among these disciplines. This review aims to demonstrate the advantages of the triad paradigm microfluidics-biosensors-machine learning, which is still little used but has a great perspective. After briefly describing the single entities, the different sections will demonstrate the benefits of the dual interactions, highlighting the applications where the reviewed triad paradigm was employed.
Carbon dots (CDs) have been a subject of great interest among researchers due to their diverse physicochemical properties and numerous advantageous attributes such as good biocompatibility, unique optical properties, low cost, eco-friendliness, abundant functional groups (e.g., amino, hydroxyl, and carboxyl) high stability, and excellent electron mobility. With the rapid advancement of data-driven technologies, machine learning (ML) has gained significant attention as a primary and indispensable tool in different applications in numerous research fields, including the monitoring of chemical reactions. By utilizing machine learning algorithms, the properties of carbon dots can be enhanced, such as fluorescence, stability, and electrocatalytic activity, as well as optimizing the synthesis process. Moreover, machine learning can be utilized to screen carbon dot precursors and predict their properties, providing various advantages in developing carbon dots with superior properties. As a result, machine learning offers numerous benefits in carbon dots synthesis, which has the potential to impact various fields. Photoelectrochemical sensors are a type of chemical sensor that use light to generate a photocurrent, which is then used to detect the presence of a target analyte. These sensors have gained significant attention due to their high sensitivity, selectivity, and low cost, making them a promising tool for a variety of applications in fields such as environmental monitoring and biomedical sensing. Due to their fascinating electronic and photonic properties, CQDs have gained considerable attention in the development of photoelectrochemical sensors. This review article provides an overview of recent advancements in the machine learning synthesis of CQDs and their applications in constructing photoelectrochemical sensors.
Lithium-ion batteries play a pivotal role in a wide range of applications, from electronic devices to large-scale electrified transportation systems and grid-scale energy storage. Nevertheless, they are vulnerable to both progressive aging and unexpected failures, which can result in catastrophic events such as explosions or fires. Given their expanding global presence, the safety of these batteries and potential hazards from serious malfunctions are now major public concerns. Over the past decade, scholars and industry experts are intensively exploring methods to monitor battery safety, spanning from materials to cell, pack and system levels and across various spectral, spatial, and temporal scopes. In this Review, we start by summarizing the mechanisms and nature of battery failures. Following this, we explore the intricacies in predicting battery system evolution and delve into the specialized knowledge essential for data-driven, machine learning models. We offer an exhaustive review spotlighting the latest strides in battery fault diagnosis and failure prognosis via an array of machine learning approaches. Our discussion encompasses: (1) supervised and reinforcement learning integrated with battery models, apt for predicting faults/failures and probing into failure causes and safety protocols at the cell level; (2) unsupervised, semi-supervised, and self-supervised learning, advantageous for harnessing vast data sets from battery modules/packs; (3) few-shot learning tailored for gleaning insights from scarce examples, alongside physics-informed machine learning to bolster model generalization and optimize training in data-scarce settings. We conclude by casting light on the prospective horizons of comprehensive, real-world battery prognostics and management.
Allosteric regulation is a fundamental biological mechanism that can control critical cellular processes via allosteric modulator binding to protein distal functional sites. The advantages of allosteric modulators over orthosteric ones have sparked the development of numerous computational approaches, such as the identification of allosteric binding sites, to facilitate allosteric drug discovery. Building on the success of machine learning (ML) models for solving complex problems in biology and chemistry, several ML models for predicting allosteric sites have been developed. In this review, we provide an overview of these models and discuss future perspectives powered by the field of artificial intelligence such as protein language models.
Machine learning (ML) is a range of powerful computational algorithms capable of generating predictive models via intelligent autonomous analysis of relatively large and often unstructured data. ML has become an integral part of our daily lives with a plethora of applications, including web, business, automotive industry, clinical diagnostics, scientific research, and more recently, forensic science. In the field of forensic DNA, the manual analysis of complex data can be challenging, time-consuming, and error-prone. The integration of novel ML-based methods may aid in streamlining this process while maintaining the high accuracy and reproducibility required for forensic tools. Due to the relative novelty of such applications, the forensic community is largely unaware of ML capabilities and limitations. Furthermore, computer science and ML professionals are often unfamiliar with the forensic science field and its specific requirements. This manuscript offers a brief introduction to the capabilities of machine learning methods and their applications in the context of forensic DNA analysis and offers a critical review of the current literature in this rapidly developing field.
Recent advancements in immune sequencing and experimental techniques are generating extensive T cell receptor (TCR) repertoire data, enabling the development of models to predict TCR binding specificity. Despite the computational challenges posed by the vast diversity of TCRs and epitopes, significant progress has been made. This review explores the evolution of computational models designed for this task, emphasizing machine learning efforts, including early unsupervised clustering approaches, supervised models, and recent applications of Protein Language Models (PLMs), deep learning models pretrained on extensive collections of unlabeled protein sequences that capture crucial biological properties. We survey the most prominent models in each category and offer a critical discussion on recurrent challenges, including the lack of generalization to new epitopes, dataset biases, and shortcomings in model validation designs. Focusing on PLMs, we discuss the transformative impact of Transformer-based protein models in bioinformatics, particularly in TCR specificity analysis. We discuss recent studies that exploit PLMs to deliver notably competitive performances in TCR-related tasks, while also examining current limitations and future directions. Lastly, we address the pressing need for improved interpretability in these often opaque models, and examine current efforts to extract biological insights from large black box models.
Machine learning models used for energy conversion system optimization cannot extrapolate outside the bounds of training data and often produce physically unrealistic results when making predictions in regions of sparse training data. The toy model concept introduced in this work allows machine learning models to extrapolate to some extent and also reduces the possibility of physically unrealistic results. It uses physics to shrink the model input (feature space) of data-based models, so that extrapolations in the data-based feature space tend to become interpolations in the physics-based (toy variable) feature space. The physics-based model can be any model or experiment that can shrink the feature space without affecting interpolation and is termed a ‘toy model' because it does not need to be accurate or make predictions of interest. The concept has been applied to model experimental data obtained from three complex systems: a. Aerodynamic forces on a spinning and vibrating baseball with inclined axis of rotation (toy model: CFD model), b. Hydraulic turbine efficiency (toy model: PIV images of flow through stationary blades), and c. Combustion generated engine emissions (toy model: system-level 1-D model). All extrapolations were converted into interpolations for the first two systems while a 75% conversion was achieved for the emission predictions. The engine toy model produced 736,281 possible feature spaces from which one unique feature space was chosen for every prediction based on agreement between different machine learning algorithms. It is shown that the ability of the toy variables to reorganize the data is important, while their accuracy is relatively unimportant. The toy model concept was demonstrated to work with neural networks and regression, and can be used to increase model robustness or reduce training data requirements.
Following other fields of science, Deep Learning models are gaining attention within the statistical physics community as a powerful and efficient way for analysing experimental and synthetic time series, and for quantifying properties thereof. Applying such models is nevertheless a path full of pitfalls, not only due to their inherent complexity, but also to a lack of understanding of some of their idiosyncrasies. We here discuss some of these pitfalls in the context of time series classification, covering from the selection of the best model hyperparameters, how the models have to be trained, to the way data have to be pre-processed. While not providing one-fits-all answers, the statistical physics practitioner will here find what questions ought to be posed, and a first guide about how to tackle them.
The optimization of the electrode manufacturing process is important for upscaling the application of Lithium-Ion Batteries (LIBs) to cater for growing energy demand. LIB manufacturing is important to be optimized because it determines the practical performance of the cells when the latter are being used in applications such as electric vehicles. In this study, we tackled the issue of high-performance electrodes for desired battery applications by proposing a data-driven approach supported by a deterministic machine learning-assisted pipeline for bi-objective optimization of the electrochemical performances. This pipeline allows the inverse design of the process parameters to adopt to manufacture electrodes for energy or power applications. This work is an analogy to our previous work that addressed the optimization of the electrode microstructures for kinetic, ionic, and electronic transport properties improvement. An electrochemical model is fed with the electrode properties characterizing the electrode microstructures generated by manufacturing simulations, and used to simulate the electrochemical performances. Secondly, the resulting dataset was used to train a deterministic model to implement fast optimizations to identify optimal electrodes. Our results suggested a high amount of active material, combined with intermediate values of solid content in the slurry and calendering degree, to achieve the optimal electrodes.
Owing to the hexagonal close-packed (HCP) crystal structure inherent in Mg alloys, strong basal texture can readily be induced through the processes of rolling or extrusion. The anisotropy of the texture of Mg alloys impacts their stamping and forming capabilities, limiting their use in certain applications. Microalloying and shear deformation are currently the most common methods of weakening the texture of Mg alloys. Many shearing processes have been extensively studied, and given that they require complex equipment and make it difficult to achieve mass production, major attention has turned to studying the design of microalloys. Traditional trial-and-error approaches for developing micro-alloying confront many challenges, including longer test cycles and increasing expenses. The rapid advancement of big data and artificial intelligence opens up a new channel for the efficient advancement of metallic materials, specifically the application of machine learning to aid in the design of Mg alloys. ML modeling can be used to find correlations between features and attributes in data, allowing for the development and design of high-performance Mg alloys. The article provides an extensive overview of machine learning applications in Mg alloys. These include the discovery of high-performance alloys, the selection of coating designs, the design of Mg matrix composites, the prediction of second phases, the microstructure modification, optimization of rolling or extrusion parameters, and the prediction of mechanical and corrosion properties. In conclusion, challenges and prospects for the rational design of alloys with machine learning support were discussed.
This review aims to highlight the role that computational chemistry has played in advancing the supramolecular chemistry field. We demonstrated recent uses of computational methodologies to elucidate noncovalent interactions in various processes occurring in supramolecular systems. We also emphasized the contributions of these techniques to studying reactions within confined space, showing how computational methodologies help clarify the effects of reactivity and conformational locking. Furthermore, we underscore the utilization of Molecular Dynamics (MD) in elucidating dynamical processes, understanding temperature and pressure effects, and exploring conformational space within supramolecular chemistry. Finally, we highlight the impact that the age of machine learning has on computational chemistry, showing how these universal approximators can enhance existing methods, predict properties, and efficiently explore the chemical space encompassed by these complex systems. This article explores the use of electronic structure methods to understand noncovalent interactions in supramolecular systems and the chemical reactions that occur within their chemical space. It also highlights the use of molecular dynamics and machine learning techniques in supramolecular chemistry.
For decades, drug delivery scientists have been performing trial-and-error experimentation to manually sample parameter spaces and optimize release profiles through rational design. To enable this approach, scientists spend much of their career learning nuanced drug-material interactions that drive system behavior. In relatively simple systems, rational design criteria allow us to fine tune release profiles and enable efficacious therapies. However, as materials and drugs become increasingly sophisticated and their interactions have non-linear and compounding effects, the field is suffering the Curse of Dimensionality which prevents us from comprehending complex structure-function relationships. In the past, we have embraced this complexity by implementing high-throughput screens to increase the probability of finding ideal compositions. However, this brute force method was inefficient and led many to abandon these fishing expeditions. Fortunately, methods in data science including artificial intelligence / machine learning (AI/ML) are providing ideal analytical tools to model this complex data and ascertain quantitative structure-function relationships. In this Oration, I speak to the potential value of data science in drug delivery with particular focus on polymeric delivery systems. Here, I do not suggest that AI/ML will simply replace mechanistic understanding of complex systems. Rather, I propose that AI/ML should be yet another useful tool in the lab to navigate complex parameter spaces. The recent hype around AI/ML is breathtaking and potentially over inflated, but the value of these methods is poised to revolutionize how we perform science. Therefore, I encourage readers to consider adopting these skills and applying data science methods to their own problems. If done successfully, I believe we will all realize a paradigm shift in our approach to drug delivery.
Machine learning (ML) has been rapidly transforming the landscape of natural sciences and has the potential to revolutionize the process of data analysis and hypothesis formulation as well as expand scientific knowledge. ML has been particularly instrumental in the advancement of cheminformatics and materials science, including membrane technology. In this review, we analyze the current state-of-the-art membrane-related ML applications from ML and membrane perspectives. We first discuss the ML foundations of different algorithms and design choices. Then, traditional and deep learning methods, including application examples from the membrane literature, are reported. We also discuss the importance of learning data and both molecular and membrane-system featurization. Moreover, we follow up on the discussion with examples of ML applications in membrane science and technology. We detail the literature using data-driven methods from property prediction to membrane fabrication. Various fields are also discussed, such as reverse osmosis, gas separation, and nanofiltration. We also differentiate between downstream predictive tasks and generative membrane design. Additionally, we formulate best practices and the minimum requirements for reporting reproducible ML studies in the field of membranes. This is the first systematic and comprehensive review of ML in membrane science.
Not long ago, carbon quantum dots (CQDs) came into view as a revolutionary class of materials, propelling advancements in water remediation and electrochemical technology. This comprehensive review explores the cutting-edge developments in CQDs-based materials and their applications, addressing critical challenges in water treatment and electrochemical processes. Synthesized as ultra-tiny, dispersed particles with dimensions less than 10 nm, CQDs exhibit remarkable optical properties, including adjustable fluorescence emission across various colors. With a surge in published scientific articles, CQDs have garnered significant attention, offering potential solutions in heavy metal sensing, remediation, and electrocatalytic hydrogen evolution reactions (HER). The review highlights the high sensitivity of CQDs as fluorescent sensors, detecting contaminants in water with limits of detection down to femtomolar concentrations. Moreover, CQDs demonstrate excellent adsorptive capabilities for heavy metal removal, surpassing traditional adsorbents in terms of removal efficiency. Furthermore, CQDs serve as promising electrocatalysts, enhancing reaction kinetics and enabling efficient water splitting for clean energy generation. Furthermore, this review emphasizes the importance of machine learning in advancing CQDs-based materials, supported by case studies and examples that illustrate how machine learning techniques optimize CQDs synthesis, enhance their properties, and broaden their applications. However, challenges remain in the precise synthesis of CQDs, scalability of production processes, and understanding the interactions between CQDs and pollutants. Overcoming these challenges will unlock the full potential of CQDs-based materials, leading to sustainable and efficient solutions in water control and electrochemical processes.
In Model Predictive Control (MPC) closed-loop performance heavily depends on the quality of the underlying prediction model, where such a model must be accurate and yet simple. A key feature in modern MPC applications is the potential for online model adaptation to cope with time-varying changes, part-to-part variations, and complex features of the system dynamics not caught by models derived from first principles. In this paper, we propose to use a physics-informed, or gray-box, model that extends the physics-based model with a data-driven component, namely a Recurrent Neural Network (RNN). Relying on physics-informed models allows for a rather limited size of the RNN, thereby enhancing online applicability compared to pure black-box models. This work presents a method based on Moving Horizon Estimation (MHE) for simultaneous state estimation and learning of the RNN sub-model, a potentially challenging issue due to limited information available in noisy input-output data and lack of knowledge of the internal state of the RNN. We provide a case study on a quadruple tank benchmark showing how the method can cope with part-to-part variations.
Chronic lymphocytic leukemia (CLL) is a B cell neoplasm characterized by the accumulation of aberrant monoclonal B lymphocytes. CLL is the predominant type of leukemia in Western countries, accounting for 25% of cases. Although many patients remain asymptomatic, a subset may exhibit typical lymphoma symptoms, acquired immunodeficiency disorders, or autoimmune complications. Diagnosis involves blood tests showing increased lymphocytes and further examination using peripheral blood smear and flow cytometry to confirm the disease. With the significant advancements in machine learning (ML) and artificial intelligence (AI) in recent years, numerous models and algorithms have been proposed to support the diagnosis and classification of CLL. In this review, we discuss the benefits and drawbacks of recent applications of ML algorithms in the diagnosis and evaluation of patients diagnosed with CLL.
As a follow-up to our recent Communication in the Journal of Chemical Physics [J. Chem. Phys. 159 071101 (2023)], we report and make available the Jupyter Notebook software here. This software performs binary machine learning classification (MLC) with the goal of learning negligible Hamiltonian matrix elements for vibrational dynamics. We illustrate its usefulness for a Hamiltonian matrix for H2O by using three MLC algorithms: Random Forest, Support Vector Machine, and Multi-layer Perceptron.
While machine learning (ML) has made significant contributions to the biopharmaceutical field, its applications are still in the early stages in terms of providing direct support for quality-by-design based development and manufacturing of biologics, hindering the enormous potential for bioprocesses automation from their development to manufacturing. However, the adoption of ML-based models instead of conventional multivariate data analysis methods is significantly increasing due to the accumulation of large-scale production data. This trend is primarily driven by the real-time monitoring of process variables and quality attributes of biopharmaceutical products through the implementation of advanced process analytical technologies. Given the complexity and multidimensionality of a bioproduct design, bioprocess development, and product manufacturing data, ML-based approaches are increasingly being employed to achieve accurate, flexible, and high-performing predictive models to address the problems of analytics, monitoring, and control within the biopharma field. This paper aims to provide a comprehensive review of the current applications of ML solutions in the design, monitoring, control, and optimisation of upstream, downstream, and product formulation processes of monoclonal antibodies. Finally, this paper thoroughly discusses the main challenges related to the bioprocesses themselves, process data, and the use of machine learning models in monoclonal antibody process development and manufacturing. Moreover, it offers further insights into the adoption of innovative machine learning methods and novel trends in the development of new digital biopharma solutions.
As the world's supply chains become disrupted through geopolitical instability and the race towards a net-zero future, policies have been implemented to improve the security of certain minerals and raw materials critical to a country's survival and sustainability goals. Circular economies (CE) are sought to be an ecosystem that will reduce virgin material consumption rates, lower carbon emissions, and decelerate the rate of landfilling. However, cost-effective and commercially attractive substitutes to conventional products are needed for this to be realised. Machine learning (ML) and the explosion of interest in artificial intelligence (AI) have led to growing interests in predictive and generative applications for sustainability. Phosphorous and, nutrients overall, operate on finite reserves essential for food supply chains; while such nutrients are largely present in municipal wastewater streams. Wastewater treatment plants (WWTPs) must then face a transformational force to become nutrient recovery centres, rather than follow a linear treat-for-disposal model. In this framework paper, ML is positioned as an enabler for scaled, cost-effective and safer recovery of nutrients and other valuable products — tying in economic, societal, technical and commercial factors through open data connectivity. Moreover, the paper issues a policy guide for institutions wishing to advance food, energy and water security through machine learning, circular economy wastewater treatment plants (ML CE WWTP).
In the past 40 years, therapeutic antibody discovery and development have advanced considerably, with machine learning (ML) offering a promising way to speed up the process by reducing costs and the number of experiments required. Recent progress in ML-guided antibody design and development (D&D) has been hindered by the diversity of data sets and evaluation methods, which makes it difficult to conduct comparisons and assess utility. Establishing standards and guidelines will be crucial for the wider adoption of ML and the advancement of the field. This perspective critically reviews current practices, highlights common pitfalls and proposes method development and evaluation guidelines for various ML-based techniques in therapeutic antibody D&D. Addressing challenges across the ML process, best practices are recommended for each stage to enhance reproducibility and progress.
Climate change (CC) is one of the greatest threats to human health, safety, and the environment. Given its current and future impacts, numerous studies have employed computational tools (e.g., machine learning, ML) to understand, mitigate, and adapt to CC. Therefore, this paper seeks to comprehensively analyze the research/publications landscape on the MLCC research based on published documents from Scopus. The high productivity and research impact of MLCC has produced highly cited works categorized as science, technology, and engineering to the arts, humanities, and social sciences. The most prolific author is Shamsuddin Shahid (based at Universiti Teknologi Malaysia), whereas the Chinese Academy of Sciences is the most productive affiliation on MLCC research. The most influential countries are the United States and China, which is attributed to the funding activities of the National Science Foundation and the National Natural Science Foundation of China (NSFC), respectively. Collaboration through co-authorship in high-impact journals such as Remote Sensing was also identified as an important factor in the high rate of productivity among the most active stakeholders researching MLCC topics worldwide. Keyword co-occurrence analysis identified four major research hotspots/themes on MLCC research that describe the ML techniques, potential risky sectors, remote sensing, and sustainable development dynamics of CC. In conclusion, the paper finds that MLCC research has a significant socio-economic, environmental, and research impact, which points to increased discoveries, publications, and citations in the near future.
Background Virtual reality technology has been widely used in surgical simulators, providing new opportunities for assessing and training surgical skills. Machine learning algorithms are commonly used to analyze and evaluate the performance of participants. However, their interpretability limits the personalization of the training for individual participants. Methods Seventy-nine participants were recruited and divided into three groups based on their skill level in intracranial tumor resection. Data on the use of surgical tools were collected using a surgical simulator. Feature selection was performed using the Minimum Redundancy Maximum Relevance and SVM-RFE algorithms to obtain the final metrics for training the machine learning model. Five machine learning algorithms were trained to predict the skill level, and the support vector machine performed the best, with an accuracy of 92.41% and Area Under Curve value of0.98253. The machine learning model was interpreted using Shapley values to identify the important factors contributing to the skill level of each participant. Results This study demonstrates the effectiveness of machine learning in differentiating the evaluation and training of virtual reality neurosurgical per- formances. The use of Shapley values enables targeted training by identifying deficiencies in individual skills. Conclusions This study provides insights into the use of machine learning for personalized training in virtual reality neurosurgery. The interpretability of the machine learning models enables the development of individualized training programs. In addition, this study highlighted the potential of explanatory models in training external skills.
The past decade has seen a sharp increase in machine learning (ML) applications in scientific research. This review introduces the basic constituents of ML, including databases, features, and algorithms, and highlights a few important achievements in chemistry that have been aided by ML techniques. The described databases include some of the most popular chemical databases for molecules and materials obtained from either experiments or computational calculations. Important two-dimensional (2D) and three-dimensional (3D) features representing the chemical environment of molecules and solids are briefly introduced. Decision tree and deep learning neural network algorithms are overviewed to emphasize their frameworks and typical application scenarios. Three important fields of ML in chemistry are discussed: ① retrosynthesis, in which ML predicts the likely routes of organic synthesis; ② atomic simulations, which utilize the ML potential to accelerate potential energy surface sampling; and ③ heterogeneous catalysis, in which ML assists in various aspects of catalytic design, ranging from synthetic condition optimization to reaction mechanism exploration. Finally, a prospect on future ML applications is provided.
Hyperspectral Imaging (HSI) plays a crucial role in detecting, identifying, and classifying a wide range of natural resources, including minerals, geological phenomena like volcanic eruptions, and vegetation. Segmentation and classification of HSI play vital roles in extracting meaningful information and identifying different land cover or land use categories within the scene. One of the primary limitations associated with HSI is the scarcity of labeled samples. Obtaining annotated samples is a laborious and time-consuming process, posing a significant challenge in the field. This work presents an Enhanced Affinity Propagation Clustering (EAPC) and Modified Extreme Learning Machine (MELM) for segmentation and classification of HSI. Initially, the HSI images are pre-processed by the non-linear diffusion partial differential equation. Then, the segmentation process is performed by the EAPC and it is the combination of Affinity Propagation Clustering (APC) with Light Spectrum Algorithm (LSA). Finally, the classification is performed by the MELM and the experimentation is demonstrated on the Salinas dataset and achieved better accuracy and sensitivity of 97.3 % and 98.2 % respectively.
"""

keywords_raw = """
machine learning, plasmonics, data analysis, structure design, deep learning, biosensors, imaging
Mathematical modeling, Machine learning, Reinforcement learning, Systems biology, Simulation, Systematic literature review
Machine Learning, Healthcare analytics, Artificial Intelligence, Medical research, IoT, Algorithms, Bibliometric Analysis
Structure-properties relation, Forward model, Feature engineering, Power spectrum density, Convolutional neural network, Support vector regression, Ising model, Cahn-Hilliard model
Artificial intelligence, Deep learning, Machine learning, Neural network, Supervised learning
Additive manufacturing, Big data, Machine learning, Digital twin, Data-driven
Machine learning, Lab-on-a-Chip, Biosensing system, Intelligent microfluidics, Biosensors integration
Carbon quantum dot, Machine learning, Synthesis, Photoelectrochemical sensors
Lithium-ion batteries, Safety, Machine learning, Deep learning, Fault, Failure, Thermal runaway, Detection, Prediction
Allostery, Machine learning, Drug design, Protein binding sites
ANN, artificial neural networks, AT, analytical threshold, BN, Bayesian networks, CART, classification and regression trees, CE, capillary electrophoresis, CNN, convolutional neural network, DBSCAN, density-based spatial clustering of applications with noise, DeT, Decision Tree, DL, deep learning, DT, dynamic threshold, EPG, electropherogram, GAN, generative adversarial networks, GDA, generalised discriminant analysis, HID, human identification, k-NN, k-nearest neighbours, LDA, linear discriminant analysis, LR, likelihood ratio, MCMC, Markov Chain Monte Carlo, MAC, maximum allele count, MCA, multiple correspondence analysis, ML, machine learning, MLE, maximum likelihood estimation, MLP, multilayer perception, MLR, multinomial logistic regression, MPS, massively parallel sequencing, NB, Naive Bayes, NGS, Next Generation Sequencing, NoC, number of contributors, NT, no threshold, PCA, principal component analysis, PCoA, principal coordinates analysis, PG, probabilistic genotyping, PGS, probabilistic genotyping software, RF, random forest, SNP, single nucleotide polymorphism, ST, stochastic threshold, STR, short tandem repeat, SVM, support vector machine, TAC, total allele count, t-SNE, t-distributed stochastic neighbour embedding, Machine learning, Forensic DNA profiling, Human identification, AI, STRs
Machine learning, T cell receptor, Specificity prediction, Protein language models, Interpretability
Physics-Based Machine Learning, Extrapolation, Feature Selection, Feature Engineering, Engines, Turbines
Deep Learning, Chaos, Classification
Battery cell manufacturing, Bayesian optimization, Machine learning, Electrode, Numerical simulation
Mg alloy, Machine learning, Strength, Plasticity, Microalloying
computational chemistry, machine learning, supramolecular chemistry, noncovalent interactions, molecular dynamics, density functional theory, electronic structure calculations
Machine learning, Artificial intelligence, Drug delivery, Controlled release, Formulation, Encapsulation
Deep learning, Predictive models, Generative models, Molecular modeling, Cheminformatics
Carbon quantum dots, Water remediation, Electrochemical advancements, Heavy metal sensing, Fluorescent sensors, Clean energy generation
Learning-based MPC, Nonlinear MPC, Moving Horizon Estimation, Physics-informed learning, Adaptive MPC, Recurrent Neural Network, Gated Recurrent Unit
Artificial intelligence, Chronic lymphocytic leukemia, Diagnosis, Machine learning
Machine Learning Classification, Vibrational Configuration Interaction
Biopharmaceuticals, Machine learning, Upstream, Downstream, Bioprocesses, Digital twin, Soft sensors
Machine learning, Circular economy, Nutrient, Sustainable supply chains
machine learning, data curation, drug discovery, data standardisation, FAIR data, model performance, metrics, model evaluation, antibodies, protein language models
Machine learning, Climate change, Sustainable development, Bibliometric analysis
Machine learning, Neurosurgery, Shapley values, Virtual reality, Human-robot interaction
Machine learning, Atomic simulation, Catalysis, Retrosynthesis, Neural network potential
Hyperspectral imaging, Enhanced affinity propagation clustering, Classification, Modified extreme learning machine
"""

title_raw = """
Machine learning and its applications for plasmonics in biology
Combined mechanistic modeling and machine-learning approaches in systems biology - A systematic literature review
Evolution of Machine Learning Applications in Medical and Healthcare Analytics Research: A Bibliometric Analysis
Efficient surrogate models for materials science simulations: Machine learning-based prediction of microstructure properties
Supervised machine learning in drug discovery and development: Algorithms, applications, challenges, and prospects
Big data, machine learning, and digital twin assisted additive manufacturing: A review
Integrating machine learning and biosensors in microfluidic devices: A review
Machine learning-driven approaches for synthesizing carbon dots and their applications in photoelectrochemical sensors
Battery safety: Machine learning-based prognostics
Machine learning approaches in predicting allosteric sites
Machine learning applications in forensic DNA profiling: A critical review
T-cell receptor binding prediction: A machine learning revolution
Using physics to extend the range of machine learning models for an aerodynamic, hydraulic and combusting system: The toy model concept
Deep Learning models for the analysis of time series: A practical introduction for the statistical physics practitioner
Toward high-performance energy and power battery cells with machine learning-based optimization of electrode manufacturing
A brief review of machine learning-assisted Mg alloy design, processing, and property predictions
Supramolecular Chemistry: Exploring the Use of Electronic Structure, Molecular Dynamics, and Machine Learning Approaches
Machine learning in drug delivery
Machine learning for the advancement of membrane science and technology: A critical review
The interface of machine learning and carbon quantum dots: From coordinated innovative synthesis to practical application in water control and electrochemistry
Physics-Informed Online Learning by Moving Horizon Estimation: Learning Recurrent Neural Networks in Gray-box Models
Revolutionizing chronic lymphocytic leukemia diagnosis: A deep dive into the diverse applications of machine learning
Machine learning software to learn negligible elements of the Hamiltonian matrix
Applications of machine learning in antibody discovery, process development, manufacturing and formulation: Current trends, challenges, and opportunities
Machine learning framework for wastewater circular economy — Towards smarter nutrient recoveries
Best practices for machine learning in antibody discovery and development
A research landscape bibliometric analysis on climate change for last decades: Evidence from applications of machine learning
Personalized assessment and training of neurosurgical skills in virtual reality: An interpretable machine learning approach
Machine Learning for Chemistry: Basics and Applications
Enhanced affinity propagation clustering with a modified extreme learning machine for segmentation and classification of hyperspectral imaging
"""

authors_raw = """
Gwiyeong Moon, Jongha Lee, Hyunwoong Lee, Hajun Yoo, Kwanhwi Ko, Seongmin Im, Donghyun Kim
Anna Procopio, Giuseppe Cesarelli, Leandro Donisi, Alessio Merola, Francesco Amato, Carlo Cosentino
Samuel-Soma M. Ajibade, Gloria Nnadwa Alhassan, Abdelhamid Zaidi, Olukayode Ayodele Oki, Joseph Bamidele Awotunde, Emeka Ogbuju, Kayode A. Akintoye
Binh Duong Nguyen, Pavlo Potapenko, Aytekin Demirci, Kishan Govind, Sébastien Bompas, Stefan Sandfeld
George Obaido, Ibomoiye Domor Mienye, Oluwaseun F. Egbelowo, Ikiomoye Douglas Emmanuel, Adeola Ogunleye, Blessing Ogbuokiri, Pere Mienye, Kehinde Aruleba
Liuchao Jin, Xiaoya Zhai, Kang Wang, Kang Zhang, Dazhong Wu, Aamer Nazir, Jingchao Jiang, Wei-Hsin Liao
Gianni Antonelli, Joanna Filippi, Michele D'Orazio, Giorgia Curci, Paola Casti, Arianna Mencattini, Eugenio Martinelli
Roya Mohammadzadeh kakhki, Mojtaba Mohammadpoor
Jingyuan Zhao, Xuning Feng, Quanquan Pang, Michael Fowler, Yubo Lian, Minggao Ouyang, Andrew F. Burke
Francho Nerín-Fonz, Zoe Cournia
Mark Barash, Dennis McNevin, Vladimir Fedorenko, Pavel Giverts
Anna Weber, Aurélien Pélissier, María Rodríguez Martínez
Indranil Brahma, Robert Jennings, Bradley Freid
Alfredo Crespo-Otero, Pau Esteve, Massimiliano Zanin
Marc Duquesnoy, Chaoyue Liu, Vishank Kumar, Elixabete Ayerbe, Alejandro A. Franco
Yanhui Cheng, Lifei Wang, Chaoyang Yang, Yunli Bai, Hongxia Wang, Weili Cheng, Hanuma Reddy Tiyyagura, Alexander Komissarov, Kwang Seon Shin
Matheus C. Colaço, Vinícius A. Glitz, Amanda K. Jacobs, Vinícius C. Port, Giovanni F. Caramori
Adam J. Gormley
Gergo Ignacz, Lana Bader, Aron K. Beke, Yasir Ghunaim, Tejus Shastry, Hakkim Vovusha, Matthew R. Carbone, Bernard Ghanem, Gyorgy Szekely
Marwa El-Azazy, Ahmed I. Osman, Mahmoud Nasr, Yassmin Ibrahim, Nessreen Al-Hashimi, Khalid Al-Saad, Mohammad A. Al-Ghouti, Mohamed F. Shibl, Ala'a H. Al-Muhtaseb, David W. Rooney, Ahmed S. El-Shafie
Kristoffer Fink Løwenstein, Daniele Bernardini, Alberto Bemporad, Lorenzo Fagiano
Mohamed Elhadary, Amgad Mohamed Elshoeibi, Ahmed Badr, Basel Elsayed, Omar Metwally, Ahmed Mohamed Elshoeibi, Mervat Mattar, Khalil Alfarsi, Salem AlShammari, Awni Alshurafa, Mohamed Yassin
Chen Qu, Paul L. Houston, Qi Yu, Priyanka Pandey, Riccardo Conte, Apurba Nandi, Joel M. Bowman
Thanh Tung Khuat, Robert Bassett, Ellen Otte, Alistair Grevis-James, Bogdan Gabrys
Allan Soo, Li Gao, Ho Kyong Shon
Leonard Wossnig, Norbert Furtmann, Andrew Buchanan, Sandeep Kumar, Victor Greiff
Samuel-Soma M. Ajibade, Abdelhamid Zaidi, Festus Victor Bekun, Anthonia Oluwatosin Adediran, Mbiatke Anthony Bassey
Fei Li, Zhibao Qin, Kai Qian, Shaojun Liang, Chengli Li, Yonghang Tai
Yun-Fei Shi, Zheng-Xin Yang, Sicong Ma, Pei-Lin Kang, Cheng Shang, P. Hu, Zhi-Pan Liu
V. Antony Asir Daniel, K. Vijayalakshmi, Priyanka Pramod Pawar, Deepak Kumar, A. Bhuvanesh, A. Josephine Christilda
"""

metadata = []
abstracts = abstracts_raw.split('\n')
keywords = keywords_raw.split('\n')
titles = title_raw.split('\n')
authors = authors_raw.split('\n')

# #read files
# query = "artificial intelligence"
# with open('resources/abstracts2.txt','r') as f:
#     data=f.read().lower().strip()

# data=data.split('-next-')
# data=[ab.strip().split('\n')[:] for ab in data][:]

# titles = [ab[0] for ab in data]
# authors = [ab[1] for ab in data]
# abstracts = [ab[2] for ab in data]
# keywords = [ab[3] for ab in data]
# ###end

for i in range(len(abstracts)):
    if not abstracts[i]:
        continue
    metadata.append({
        'abstract': abstracts[i],
        'keyword': keywords[i],
        'title': titles[i],
        'authors': authors[i]
    })
output_file_path = 'resources/abstracts3.txt'

# Write the metadata back to the file
with open(output_file_path, 'w') as f:
    for entry in metadata:
        f.write(f"{entry['title']}\n")
        f.write(f"{entry['authors']}\n")
        f.write(f"{entry['abstract']}\n")
        f.write(f"{entry['keyword']}\n")
        f.write('-next-\n')

print(f"Total metadata: {len(metadata)}")
print(metadata)

Total metadata: 30
[{'abstract': 'Machine learning (ML) has drawn tremendous interest for its capacity to extract useful information that may be overlooked with conventional analysis techniques and for its versatility in a wide range of research domains, including biomedical sensing and imaging. In this perspective, we provide an overview focused on the uses and benefits of ML in areas of plasmonics in biology. ML methodologies for processing data from plasmonic biosensing and imaging systems by supervised and unsupervised learning to achieve enhanced detection and quantification of target analytes are described. In addition, deep learning-based approaches to improve the design of plasmonic structures are presented. Data analysis based on ML for classification, regression, and clustering by dimension reduction is presented. We also discuss ML-based prediction and design of plasmonic structures and sensors using discriminative and generative models. Challenges and the outlook for ML for

## Task C-2: We want to test how much the keyword metadata content matches the document and query. Concatenate the list of keywords (in metadata) of all documents and construct an inverted file, stating for each keyword the document number it matches. Let KK be this indexed file of keywords (ordered in alphabetical order). Write a script that computes the edit similarity between each keyword in the metadata and the query T (If T contains more than one query term, then the similarity is understood as the maximum similarity score (Edit metric) among all terms of query, e.g., if T = T1, T2, T3, then for keyword K, we have Sim(K,T) = max(Sim(K,T1), Sim(K,T2), Sim(K,T2)). Save the result as an array X whose size is size of all keywords (in metadata) in all documents (excluding repetition).

In [5]:
import nltk
from nltk.metrics.distance import edit_distance


class InformationRetrievalIndexedFrequency(InformationRetrievalIndexed):  # Could have improved InformationRetrievalIndexed, but it's in separated task, so I would override inheritance. This is not elegant in real project, requiring refactoring
    def __init__(self, metadata: list[dict[str, str]] = [], keyword_separator: list[str] = [',']) -> None:
        self.metadata: list[dict[str, str]] = metadata
        self.inverted_index: dict[str, list[int]] = {}
        self.inverted_index_built = False
        if len(keyword_separator) == 0:
            raise ValueError("Keyword separators must not be empty")
        self.keyword_separator = keyword_separator
        self.indexed_keywords: list[str] = []
        self.keyword_index: dict[str, int] = {}

    def build_inverted_index(self) -> None:
        for idx, article in enumerate(self.metadata):
            keywords = article['keyword']
            if len(self.keyword_separator) > 1:
                for separator in self.keyword_separator:
                    keywords = keywords.replace(separator, self.keyword_separator[0])
            keywords = keywords.split(',')
            self.metadata[idx]['keyword_parsed'] = sorted(list({
                keyword.strip().lower()
                for keyword in keywords if keyword.strip()
            }))

        set_keywords = {
            keyword
            for article in self.metadata
            for keyword in article['keyword_parsed']
        }

        self.indexed_keywords = sorted(list(set_keywords))
        self.keyword_index = {keyword: idx for idx, keyword in enumerate(self.indexed_keywords)}

        # Loop through each abstract and check for keywords
        self.inverted_index = {keyword: [] for keyword in set_keywords}
        for idx, article in enumerate(self.metadata):
            for kw in self.inverted_index:
                # Reuse the query_match_abstract method because if we just do plain "if kw in abstract", it will get bugged: "AI" matches "Pair"
                if InformationRetrieval.query_match_abstract(kw, article['abstract']):
                    self.inverted_index[kw].append(idx)

        self.inverted_index_built = True

    @staticmethod
    def min_edit_distance(keyword: str, text: str) -> tuple[int, str]:
        # Normalize the input by converting to lowercase
        keyword = keyword.lower()
        text = text.lower()

        # Tokenize
        text_words = text.split()
        keyword_words = keyword.split()

        keyword_length = len(keyword_words)
        min_distance = float('inf')
        min_string = ''

        # # old approach, sliding window on character-basis
        # for i in range(len(text)):
        #     for j in range(i + 1, len(text)):
        #         substring = text[i:j]
        #         distance = edit_distance(keyword, substring)
        #         if distance < min_distance:
        #             min_distance = distance
        #             min_string = substring

        # return (min_distance, min_string)

        # Sliding window over the text on word-basis
        for i in range(len(text_words) - keyword_length + 1):
            # Extract the current window of words from the text
            window = ' '.join(text_words[i:i + keyword_length])

            # Compute the edit distance between the keyword and this window
            distance = edit_distance(keyword, window)

            # If this distance is the minimum found so far, update the min_distance and min_string
            if distance < min_distance:
                min_distance = distance
                min_string = window

        # handle the case where the text is shorter than the keyword
        if min_string == '':
            min_string = text
            min_distance = edit_distance(keyword, text)

        return (min_distance, min_string)

    @staticmethod
    def similarity(keyword: str, text: str) -> float:
        dist, _ = InformationRetrievalIndexedFrequency.min_edit_distance(keyword, text)

        # Normalize the similarity score
        # the score is determined as (100% - edit distance percentage)
        # where edit distance percentage is the edit distance divided by maximum distance, which is length of longest string

        max_len = max(len(keyword), len(text))
        similarity = 1 - dist / max_len if max_len != 0 else 0
        return similarity

    def compute_similarity_with_string(self, str: str, keywords = None) -> list[float]:
        if keywords is None:
            keywords = self.indexed_keywords

        X = []
        for keyword in keywords:
            similarity = self.similarity(keyword, str)
            X.append(similarity)

        return X


    def compute_similarity_with_query(self, query: str) -> list[float]:
        return self.compute_similarity_with_string(query)

    # Part of task C-3
    def compute_similarity_keyword_with_title(self) -> list[list[float]]:
        result = [0.0] * len(self.indexed_keywords)

        for article in self.metadata:
            title_similarity = self.compute_similarity_with_string(article['title'], article['keyword_parsed'])
            for idx, similarity in enumerate(title_similarity):
                result[self.keyword_index[article['keyword_parsed'][idx]]] = max(similarity, result[self.keyword_index[article['keyword_parsed'][idx]]])

        return result

    # Part of task C-5
    # Using regex boundary \b to matches keyword to satisfy exact matching.
    # Because, for example, "AI" will matches "pair", but that is wrong.
    #                       but we cannot split all the words because then multiple words such as "machine learning" won't be matched
    @staticmethod
    def query_count_match_abstract(query: str, abstract: str) -> bool:
        pattern = re.compile(r'\b' + re.escape(query) + r'\b')
        return len(pattern.findall(abstract))

    # Part of task C-5
    def calculate_keyword_frequencies_in_text(self, text: str) -> list[int]:
        num_keywords = len(self.indexed_keywords)
        Mi = [0 for _ in range(num_keywords)]

        for j, keyword in enumerate(self.indexed_keywords):
            Mi[j] = InformationRetrievalIndexedFrequency.query_count_match_abstract(keyword, text.lower())

        return Mi

    # Part of task C-5
    def calculate_keyword_frequencies(self) -> list[list[int]]:
        M = []
        for article in self.metadata:
            M.append(self.calculate_keyword_frequencies_in_text(article['abstract']))

        return M


# irie = InformationRetrievalIndexedFrequency(metadata=metadata)
irie = InformationRetrievalIndexedFrequency(metadata=metadata, keyword_separator = [',', ' '])
irie.build_inverted_index()

In [6]:
X = irie.compute_similarity_with_query(query)
print(X)

[0.9180327868852459, 0.9016393442622951, 0.9836065573770492, 0.8688524590163934, 0.9016393442622951, 0.9672131147540983, 0.8688524590163934, 0.9180327868852459, 0.8852459016393442, 0.9344262295081968, 0.9180327868852459, 0.8852459016393442, 0.9016393442622951, 1.0, 0.9836065573770492, 0.8852459016393442, 0.8524590163934427, 0.8688524590163934, 0.9672131147540983, 0.9344262295081968, 0.8852459016393442, 0.9180327868852459, 0.9508196721311475, 0.9180327868852459, 0.8688524590163934, 0.9672131147540983, 0.9180327868852459, 1.0, 0.7704918032786885, 0.8688524590163934, 0.9016393442622951, 0.9016393442622951, 0.9836065573770492, 0.8360655737704918, 0.8688524590163934, 0.9016393442622951, 0.9180327868852459, 0.9344262295081968, 0.9508196721311475, 0.9016393442622951, 0.9672131147540983, 0.9344262295081968, 0.9508196721311475, 0.9344262295081968, 0.9344262295081968, 0.9344262295081968, 0.8032786885245902, 0.8852459016393442, 0.9344262295081968, 0.8852459016393442, 0.819672131147541, 0.93442622

## Task C-3: Repeat C-2 when considering the Edit similarity between each keyword K in the metadata and the title of the document containing K (considering the same rule applies when the title is made of several tokens so that the maximum similarity over all token is computed, and also when the keyword is found in more than one document). Save the result as new array Y.

In [7]:
Y = irie.compute_similarity_keyword_with_title()
print(Y)

[0.9482758620689655, 1.0, 0.9433962264150944, 0.8918918918918919, 1.0, 0.9864864864864865, 0.9375, 0.9324324324324325, 0.9655172413793104, 1.0, 1.0, 0.9324324324324325, 1.0, 0.972972972972973, 0.972972972972973, 0.9589041095890412, 1.0, 0.9464285714285714, 0.9864864864864865, 0.9090909090909091, 0.94, 1.0, 0.9459459459459459, 0.9508196721311475, 1.0, 1.0, 0.9137931034482758, 1.0, 0.9215686274509804, 0.9673202614379085, 0.961038961038961, 1.0, 0.9864864864864865, 0.9193548387096774, 0.9333333333333333, 0.9054054054054054, 1.0, 0.9459459459459459, 0.9594594594594594, 0.8909090909090909, 0.972972972972973, 0.9918032786885246, 0.9594594594594594, 0.9583333333333334, 1.0, 0.9663865546218487, 0.8777777777777778, 0.9916666666666667, 1.0, 1.0, 1.0, 0.9748427672955975, 1.0, 1.0, 0.972972972972973, 0.9054054054054054, 0.9166666666666666, 0.875, 0.8783783783783784, 0.7272727272727273, 0.935483870967742, 0.8918918918918919, 0.8783783783783784, 0.9459459459459459, 0.9178082191780822, 0.988372093023

## Task C-4: Use appropriate script to compute the Person correlation coefficient between X and Y and the associated p-value.

In [8]:
from scipy.stats import pearsonr


correlation, p_value = pearsonr(X, Y)

# Output the results
print(f"Pearson correlation coefficient: {correlation}")
print(f"P-value: {p_value}")

Pearson correlation coefficient: 0.42031665728783296
P-value: 2.1206328784571897e-14


## TaskC-5: Now we want to exploit the content of the abstract text. Initially, we want to test the extent to which the keywords of the metadata are part of the tokens of the abstract. Write a script that calculates for each keyword K, the frequency of K in the corresponding abstract. Save the result as a matrix M (n x m) where n stands for the number of documents and m the number of keywords.

In [9]:
M = irie.calculate_keyword_frequencies()
print(M)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 13, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 11, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0

## Task C-6: Consider the query T used originally to construct the database, write a script that uses Boolean model to find out relevant document, utilizing matrix M (we assume the vocabulary is made only of keywords in KK).

In [10]:
keyword_frequencies_in_query = irie.calculate_keyword_frequencies_in_text(query)

relevant_documents = [0] * len(irie.metadata)
for i, document_frequencies in enumerate(M):
    for j, keyword_frequency in enumerate(document_frequencies):
        if keyword_frequency > 0 and keyword_frequencies_in_query[j] > 0:
            relevant_documents[i] = 1
            break

relevant_document_indices = [i for i, val in enumerate(relevant_documents) if val == 1]

print(f"Relevant documents matrix: {relevant_documents}")
print(f"Relevant documents: {relevant_document_indices}")

Relevant documents matrix: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Relevant documents: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


## Task C-7: Repeat C-6 when you use tf-idf model.

In [11]:
import math


class InformationRetrievalIndexedTFIDF(InformationRetrievalIndexedFrequency):
    """
    Because TF-IDF does not support multiple word keyword, we have to rebuild inverted index again
    """
    def __init__(self, metadata: list[dict[str, str]] = [], keyword_separator: list[str] = [',', ' ']) -> None:
        super().__init__(metadata, keyword_separator)
        self.tf_idf_matrix = []
        self.idf = []

    def compute_tf(self, abstract: str, keywords: list[str]) -> list[float]:
        """Compute term frequency for each keyword in the abstract."""
        abstract_words = abstract.lower().split()
        total_words = len(abstract_words)

        tf = []
        for keyword in keywords:
            keyword_count = abstract_words.count(keyword.lower())
            tf.append(keyword_count / total_words if total_words > 0 else 0)

        return tf

    def compute_idf(self, documents: list[str], keywords: list[str]) -> list[float]:
        """Compute inverse document frequency for each keyword."""
        num_documents = len(documents)
        idf = []
        for keyword in keywords:
            # Count the number of documents that contain the keyword
            doc_count = sum(1 for doc in documents if keyword.lower() in doc.lower().split())
            idf_value = math.log(num_documents / (1 + doc_count))  # Add 1 to avoid division by zero
            idf.append(idf_value)

        return idf

    def build_tf_idf_matrix(self) -> None:
        """Build the TF-IDF matrix using the TF and IDF for each keyword and document."""
        documents = [article['abstract'] for article in self.metadata]

        # Calculate IDF for all keywords
        self.idf = self.compute_idf(documents, self.indexed_keywords)

        # Calculate TF and TF-IDF for each document
        self.tf_idf_matrix = []
        for article in self.metadata:
            tf = self.compute_tf(article['abstract'], self.indexed_keywords)
            tf_idf = [tf_val * idf_val for tf_val, idf_val in zip(tf, self.idf)]
            self.tf_idf_matrix.append(tf_idf)

    def calculate_tfidf_for_query(self, query: str) -> list[float]:
        """Compute TF-IDF scores for the query."""
        query_tf = self.compute_tf(query, self.indexed_keywords)
        query_tfidf = [tf_val * idf_val for tf_val, idf_val in zip(query_tf, self.idf)]
        return query_tfidf

    def find_relevant_documents(self, query: str) -> list[int]:
        """Find relevant documents using the TF-IDF model."""
        query_tfidf = self.calculate_tfidf_for_query(query)
        relevant_documents = [0] * len(self.metadata)

        for i, doc_tfidf in enumerate(self.tf_idf_matrix):
            # Check if any keyword in the query has a TF-IDF score > 0 in the document
            for j, tfidf_score in enumerate(doc_tfidf):
                if tfidf_score > 0 and query_tfidf[j] > 0:
                    relevant_documents[i] = 1
                    break

        return relevant_documents


irie_tfidf = InformationRetrievalIndexedTFIDF(metadata=metadata)
irie_tfidf.build_inverted_index()
irie_tfidf.build_tf_idf_matrix()
relevant_documents = irie_tfidf.find_relevant_documents(query)
relevant_document_indices = [i for i, val in enumerate(relevant_documents) if val == 1]

print(f"Relevant documents matrix: {relevant_documents}")
print(f"Relevant documents: {relevant_document_indices}")

Relevant documents matrix: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Relevant documents: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


## Task C-8: We want to test the consistency between title of each document and the abstract text. Write a script that computes for each title, the FuzzyWuzzy score between the title and the corresponding abstract. Save the result in an array Z. We assume that the matching is accepted if the Fuzzy-wuzzy score is greater than 80%. Comment on the results.

In [12]:
%pip install fuzzywuzzy[speedup]

Note: you may need to restart the kernel to use updated packages.


In [13]:
from fuzzywuzzy import fuzz


class InformationRetrievalConsistency:
    def __init__(self, metadata: list[dict[str, str]]) -> None:
        self.metadata = metadata
        self.scores = []

    def compute_fuzzy_scores(self) -> list[float]:
        """
        Compute the FuzzyWuzzy score between the title and abstract of each document.
        """
        self.scores = []
        for article in self.metadata:
            # Compute the FuzzyWuzzy score (Levenshtein distance-based similarity ratio)
            score = fuzz.ratio(article['title'], article['abstract'])
            self.scores.append(score)

        return self.scores

    def check_consistency(self, threshold: int = 80) -> tuple[list[bool], float]:
        """
        Check consistency between title and abstract based on a threshold fuzzy score.

        Args:
            threshold (int): Minimum score to accept the match (default is 80%).

        Returns:
            tuple[list[bool], float]: List of booleans indicating if the match is accepted for each document
                                      and the percentage of accepted documents.
        """
        if len(self.scores) == 0:
            self.scores = self.compute_fuzzy_scores()

        accepted_documents = [score > threshold for score in self.scores]
        acceptance_rate = sum(accepted_documents) / len(self.scores) * 100

        return accepted_documents, acceptance_rate

consistency_checker = InformationRetrievalConsistency(metadata)
Z = consistency_checker.compute_fuzzy_scores()
accepted_documents, acceptance_rate = consistency_checker.check_consistency()

print(f"Fuzzy scores Z: {Z}")
print(f"Accepted documents matrix: {accepted_documents}")
relevant_document_indices = [i for i, val in enumerate(accepted_documents) if val == 1]
print(f"Relevant documents: {relevant_document_indices}")
print(f"Acceptance rate: {acceptance_rate:.2f}%")


Fuzzy scores Z: [10, 10, 12, 16, 18, 9, 8, 12, 5, 14, 12, 8, 13, 24, 15, 10, 15, 3, 12, 15, 15, 20, 26, 16, 11, 14, 13, 13, 8, 20]
Accepted documents matrix: [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
Relevant documents: []
Acceptance rate: 0.00%


**My comment on the task**: Well, the result is quite expected, since the title is quite short - around 10 words, compare to the abstract, which can be 1 paragraph long to 1 A4 page, with probably in range of 250 words. That's 10 - 50x size. So it can be expected that Title cannot get to 80% Fuzzywuzzy score, because by default, FuzzyWuzzy `ratio` score use Levenshtein distance.

There are also different Fuzzywuzzy methods, such as `partial_ratio`, `token_sort_ratio` and `token_set_ratio`. In cases like this, it might be well suited to use `token_sort_ratio` and decrease the threshold, because this will help checking for similarity between 2 strings. Aisde from that, it might be better to use other sematic similarity methods with text understanding capability in order to do the task like this, because it's better to capture the underlying meaning of the text, rather than focusing on exact word matches.