In [1]:
import spacy as sp
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Bag-of-Words (BOW)

Creating the corpus.

In [2]:
# Load the spreadsheet that holds the abstracts.
df = pd.read_excel('dataset.xlsx')

# Keep only the abstracts that are real strings; anything else in the column
# (e.g. empty cells, which pandas reads as NaN) is skipped.
# NOTE: because rows can be skipped, a position in `corpus` does not
# necessarily match the row position of the same abstract in `df`.
corpus = [text for text in df['Abstract'] if isinstance(text, str)]

Creating the vectorizer (bag-of-words counts, tokenized with spaCy).

In [3]:
# Load the spaCy `en_core_web_sm` pipeline once; the tokenizer functions in
# this notebook all call this shared `nlp` object.
nlp = sp.load('en_core_web_sm')

def spacy_tokenizer(doc):
  """Split a raw document into spaCy token texts, dropping punctuation.

  Args:
    doc: The document text to tokenize.

  Returns:
    A list with the text of every non-punctuation token, in document order.
  """
  # Only tokenization is needed here: `t.text` and `t.is_punct` are set by the
  # tokenizer itself, so `make_doc` yields the same tokens without running the
  # remaining pipeline components (tagger, parser, NER, ...) on every document.
  return [t.text for t in nlp.make_doc(doc) if not t.is_punct]

# Bag-of-words vectorizer that delegates tokenization to `spacy_tokenizer`.
# `token_pattern=None` because the regex token pattern is not used once a
# custom tokenizer is supplied.
# NOTE(review): `vectorizer` is rebound to a TfidfVectorizer (and
# `spacy_tokenizer` is redefined) in the TF-IDF section further down, so
# re-running the BOW `fit_transform` cell after that section, without
# re-running this cell first, would silently use the TF-IDF vectorizer.
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)

Creating the BOW.

In [4]:
# Learn the vocabulary from the corpus and encode it in one step; `bow` has one
# row per document in `corpus`.
bow = vectorizer.fit_transform(corpus)

Calculating the cosine similarity between the documents.

In [5]:
# Called with a single matrix, `cosine_similarity` compares every row of `bow`
# with every other row, giving a square document-by-document similarity matrix.
cossine_sim = cosine_similarity(bow)

## TF-IDF

Creating the vectorizer.

In [6]:
# Pipeline components whose output the tokenizer below never reads; they are
# skipped when processing each document.
unwanted_pipes = ["ner", "parser"]

def spacy_tokenizer(doc):
  """Return the lemmas of the purely alphabetic tokens in `doc`.

  Args:
    doc: The document text to tokenize.

  Returns:
    A list of lemma strings, one per alphabetic token, in document order.
    Punctuation, whitespace, numbers and mixed alphanumeric tokens are dropped.
  """
  # `disable=` skips the unwanted components for this call only, instead of
  # entering and leaving a `disable_pipes` context (which temporarily mutates
  # the shared `nlp` object) once per document.
  # `is_alpha` is the only filter needed: a token made solely of alphabetic
  # characters can be neither punctuation nor whitespace.
  return [t.lemma_ for t in nlp(doc, disable=unwanted_pipes) if t.is_alpha]

# TF-IDF vectorizer using the lemmatizing `spacy_tokenizer` defined in this
# cell; `token_pattern=None` because the regex token pattern is not used once a
# custom tokenizer is supplied.
# NOTE(review): this rebinds `vectorizer`, replacing the CountVectorizer
# created in the BOW section.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)
# Fit on the corpus and encode it; `features` has one row per document and is
# the matrix the queries below are compared against.
features = vectorizer.fit_transform(corpus)

Searching the corpus with free-text queries.

In [7]:
def top_k(arr, k):
  """Return the indices of the `k` largest entries of `arr`, largest first.

  Args:
    arr: Sequence or array of scores.
    k: Number of indices to return.

  Returns:
    A NumPy array of indices into `arr`, ordered from the highest score
    downwards. When `k` is larger than the number of entries, every index is
    returned.
  """
  # argsort is ascending, so reverse it to get a best-first ranking and then
  # keep the leading `k` positions.
  descending = np.argsort(arr)[::-1]
  return descending[:k]

In [8]:
# Encode the query with the vocabulary and IDF weights already fitted on the
# corpus (`transform`, not `fit_transform`).
query = ["biopiracy countries species"]
query_tfidf = vectorizer.transform(query)

# One similarity score per corpus document; the single-column result is
# collapsed to a 1-D array.
cosine_similarities = cosine_similarity(features, query_tfidf).ravel()

# Rank the documents and display the abstract of the best match.
top_related_indices = top_k(cosine_similarities, 5)
best_match_index = top_related_indices[0]
corpus[best_match_index]

'Between 1960 and 1982, the National Cancer Institute (NCI) screened over 180,000 microbial-derived, some 16,000 marine organism-derived, and over 114,000 plant-derived extracts. A number of clinically effective chemotherapeutic agents were developed, mainly through collaborative efforts with the public and private sectors. These agents include paclitaxel, camptothecin derivatives, various anthracyclines, bleomycin, actinomycin and mitomycin. Since 1986, collections of plants and marine invertebrates have been performed in over 25 tropical and subtropical countries worldwide through contracts with botanical and marine biological organizations, working in close collaboration with source country organizations and subject to agreements with the source country authorities. Over 120,000 extracts are stored at low temperatures in the NCI Natural Products Repository and are made available to the scientific community for testing in screens related to all human diseases, subject to the signing 

In [9]:
# Encode the query with the vocabulary and IDF weights already fitted on the
# corpus (`transform`, not `fit_transform`).
query = ["Brazil drugs"]
query_tfidf = vectorizer.transform(query)

# One similarity score per corpus document; the single-column result is
# collapsed to a 1-D array.
cosine_similarities = cosine_similarity(features, query_tfidf).ravel()

# Rank the documents and display the abstract of the best match.
top_related_indices = top_k(cosine_similarities, 5)
best_match_index = top_related_indices[0]
corpus[best_match_index]

'The development of our society has been based on the use of biodiversity, especially for medicines and nutrition. Brazil is the nation with the largest biodiversity in the world accounting for more than 15% of all living species. The devastation of biodiversity in Brazil is critical and may not only cause the loss of species and genes that encode enzymes involved in the complex metabolism of organisms, but also the loss of a rich chemical diversity, which is a potential source for bioeconomy based on natural products and new synthetic derivatives. Bioeconomy focus on the use of bio-based products, instead of fossil-based ones and could address some of the important challenges faced by society. Considering the chemical and biological diversity of Brazil, this review highlights the Brazilian natural products that were successfully used to develop new products and the value of secondary metabolites from Brazilian biodiversity with potential application for new products and technologies. 