In [1]:
import re
# Install pdfminer.six in the Colab runtime; it provides the extract_text import on the next line.
!pip install pdfminer.six
from pdfminer.high_level import extract_text
# Data handling, plotting, and scikit-learn text-similarity utilities.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

# Use tensorflow version 2. This is for Colab.
%tensorflow_version 2.x

# TensorFlow and TensorFlow Hub (hub is used below to load the ELMo module).
import tensorflow as tf
print (tf.__version__)
import tensorflow_hub as hub
from sklearn import preprocessing

from IPython.display import HTML
import logging
logging.getLogger('tensorflow').disabled = True  # Optional: disables the 'tensorflow' logger to keep cell output quiet.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
2.12.0


In [2]:
def _truncate_before(text, marker):
    """Return ``text`` up to (not including) the first ``marker``; unchanged if absent."""
    cut = text.find(marker)
    # str.find returns -1 when the marker is missing; slicing with -1 would
    # silently drop the last character, so only slice on a real hit.
    return text if cut == -1 else text[:cut]


def extract_abstract_body(file_path):
    """Extract the abstract and the main text of a paper from a PDF file.

    Parameters
    ----------
    file_path : str
        Path of the PDF, passed straight to ``extract_text``.

    Returns
    -------
    tuple of (str, str)
        ``abstract``: the text following ``"ABSTRACT: "`` up to the first blank
        line, with newlines turned into spaces and anything from ``"KEYWORDS"``
        onwards removed. If no abstract is located, the placeholder string
        ``"Abstract not found"`` is returned instead.

        ``text``: the whole extracted text with newlines turned into spaces,
        cut just before ``"ASSOCIATED CONTENT"`` when that heading is present,
        otherwise just before the first ``"REFERENCE"``. When neither marker
        occurs the text is returned in full.
    """
    # Extract text from PDF
    raw_text = extract_text(file_path)

    # The abstract runs from the "ABSTRACT: " label to the first blank line;
    # DOTALL lets the lazy group span single line breaks inside the paragraph.
    abstract_pattern = re.compile(r'ABSTRACT: (.*?)\n\n', re.DOTALL)
    abstract_match = abstract_pattern.search(raw_text)

    abstract = abstract_match.group(1).strip() if abstract_match else "Abstract not found"
    abstract = _truncate_before(abstract.replace('\n', ' '), "KEYWORDS")

    text = raw_text.replace('\n', ' ')

    # "ASSOCIATED CONTENT" takes precedence over "REFERENCE" whenever it is
    # present, regardless of which of the two appears first in the text.
    if "ASSOCIATED CONTENT" in text:
        text = _truncate_before(text, "ASSOCIATED CONTENT")
    else:
        text = _truncate_before(text, "REFERENCE")

    return abstract, text

In [3]:
# PDFs that make up the search corpus; each one is parsed once below.
file_names = ["acspolymersau.2c00009.pdf", "acscentsci.9b00476.pdf", "ma302048y.pdf"]

# Parallel lists: abstracts[k] and texts[k] both belong to file_names[k].
abstracts, texts = [], []

for pdf_name in file_names:
    abstract, text = extract_abstract_body(pdf_name)
    abstracts.append(abstract)
    texts.append(text)

# Input your query

In [4]:
# Free-text search query; it is passed unchanged to every ranking call below.
inquery = "experimental work about block copolymer phase behavior "

# TF-IDF

In [5]:
def ranking_tf_idf(inquery, data, file_names, number_to_return):
    """Print the documents most similar to a query under TF-IDF weighting.

    A ``TfidfVectorizer`` is fitted on ``data``, the query is transformed with
    the same fitted vectorizer, and every document is scored by its cosine
    similarity with the query.

    Parameters
    ----------
    inquery : str
        Free-text search query.
    data : list of str
        One text per document (for example the abstracts or the full texts).
    file_names : list of str
        Labels to print; ``file_names[k]`` labels ``data[k]``.
    number_to_return : int or None
        Maximum number of ranked results to print. ``None`` prints every
        document; values outside ``[0, len(data)]`` are clamped to that range.

    Returns
    -------
    None
        Results are printed best match first. Nothing happens when ``data``
        is empty.
    """
    n = len(data)
    if n <= 0:
        return

    # Honour the requested result count (previously ignored: all n were shown).
    if number_to_return is None:
        top_k = n
    else:
        top_k = max(0, min(int(number_to_return), n))

    vectorizer = TfidfVectorizer()
    tf_idf = vectorizer.fit_transform(data)
    qtf_idf = vectorizer.transform([inquery])

    # One row per document, one column for the single query.
    res = cosine_similarity(tf_idf, qtf_idf)

    # argsort is ascending, so reverse it and keep the first top_k indices.
    # Slicing after the reversal also behaves correctly for top_k == 0,
    # unlike a negative slice such as [-0:], which would keep everything.
    ranked = res.ravel().argsort()[::-1][:top_k].tolist()

    for rank, doc_idx in enumerate(ranked, start=1):
        print(rank, file_names[doc_idx],
              " cosine similarity = ",
              res[doc_idx][0], '\n')

In [6]:
# Rank the papers against the query with TF-IDF built from the abstracts only (number_to_return=3).
ranking_tf_idf(inquery, abstracts, file_names, 3)

1 ma302048y.pdf  cosine similarity =  0.07326575733453383 

2 acspolymersau.2c00009.pdf  cosine similarity =  0.03869436906702286 

3 acscentsci.9b00476.pdf  cosine similarity =  0.017563242690388076 



In [7]:
# Same TF-IDF ranking, this time built from the truncated full texts (number_to_return=3).
ranking_tf_idf(inquery, texts, file_names, 3)

1 ma302048y.pdf  cosine similarity =  0.08011192161033635 

2 acspolymersau.2c00009.pdf  cosine similarity =  0.008109818546533996 

3 acscentsci.9b00476.pdf  cosine similarity =  0.006236318206144024 



# ELMo

In [10]:
# Loaded TF-Hub modules keyed by URL, so repeated ranking calls reuse one model.
_ELMO_MODEL_CACHE = {}


def _load_elmo_model(url):
    """Return the TF-Hub module for ``url``, calling ``hub.load`` only on first use."""
    if url not in _ELMO_MODEL_CACHE:
        _ELMO_MODEL_CACHE[url] = hub.load(url)
    return _ELMO_MODEL_CACHE[url]


def ranking_elmo(inquery, data, file_names, number_to_return):
    """Print the documents most similar to a query using ELMo embeddings.

    Each text in ``data`` and the query are embedded with the ``'default'``
    signature of the TF-Hub ELMo module, and documents are scored by cosine
    similarity between their embedding and the query embedding.

    Parameters
    ----------
    inquery : str
        Free-text search query.
    data : list of str
        One text per document (for example the abstracts or the full texts).
    file_names : list of str
        Labels to print; ``file_names[k]`` labels ``data[k]``.
    number_to_return : int or None
        Maximum number of ranked results to print. ``None`` prints every
        document; values outside ``[0, len(data)]`` are clamped to that range.

    Returns
    -------
    None
        Results are printed best match first. Nothing happens when ``data``
        is empty.
    """
    n = len(data)
    if n <= 0:
        return

    # Honour the requested result count (previously ignored: all n were shown).
    if number_to_return is None:
        top_k = n
    else:
        top_k = max(0, min(int(number_to_return), n))

    url = "https://tfhub.dev/google/elmo/3"  # get the model for the embeddings
    embed = _load_elmo_model(url)
    embed_fn = embed.signatures['default']

    elmo_embeddings = embed_fn(tf.convert_to_tensor(data))['default']
    elmo_search_vect = embed_fn(tf.convert_to_tensor([inquery]))['default']

    # Query is the single row, documents are the columns; flatten to 1-D scores.
    elmo_res = cosine_similarity(elmo_search_vect, elmo_embeddings).flatten()

    # argsort is ascending, so reverse it and keep the first top_k indices.
    ranked = elmo_res.argsort()[::-1][:top_k].tolist()

    for rank, doc_idx in enumerate(ranked, start=1):
        print(rank,
              file_names[doc_idx],
              "cosine similarity = ",
              elmo_res[doc_idx], '\n')



In [11]:
# Rank the papers against the query with ELMo embeddings of the abstracts only (number_to_return=3).
ranking_elmo(inquery, abstracts, file_names, 3)

1 acscentsci.9b00476.pdf cosine similarity =  0.644523 

2 acspolymersau.2c00009.pdf cosine similarity =  0.62242067 

3 ma302048y.pdf cosine similarity =  0.55268824 



In [None]:
# Same ELMo ranking, this time embedding the truncated full texts (number_to_return=3).
ranking_elmo(inquery, texts, file_names, 3)