In [2]:
! pip install PyPDF2 chromadb nltk -q



In [3]:
import os
import PyPDF2
import requests
import re
import json
import secrets
import numpy as np
import string
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora used further down: "words" backs the vocabulary
# lookups and "wordnet" backs the lemmatizer.
for corpus_name in ("words", "wordnet"):
    nltk.download(corpus_name)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\nilso\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nilso\AppData\Roaming\nltk_data...


In [4]:
# Chroma client backed by DuckDB + Parquet; its data is stored under "/entries".
chroma_settings = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="/entries",
)
client = chromadb.Client(chroma_settings)

Using embedded DuckDB with persistence: data will be stored in: /entries


In [5]:
def find_chars_idxs(sentence, chars):
  """Return every index of `sentence` whose element equals `chars`.

  The comparison is made element by element, so for a string `sentence`
  a `chars` value longer than one character never matches.
  """
  matches = []
  for position, letter in enumerate(sentence):
    if letter == chars:
      matches.append(position)
  return matches

In [6]:
def exchange_chars(sentence, idxs):
  """Return `sentence` with the character at each index in `idxs` replaced
  by the marker "[NOT_END_OF_SENTENCE]".
  """
  placeholder = "[NOT_END_OF_SENTENCE]"
  chars = [*sentence]
  for position in idxs:
    chars[position] = placeholder
  return "".join(chars)

In [7]:
def check_whether_period(sentence, period_idxs):
  """Mask the periods in `sentence` that do not end a sentence.

  A period is masked (via `exchange_chars`) when it belongs to " i.e.",
  " e.g." (either of their periods), "et al.", "Prof.", or sits between two
  numeric characters as in "3.14". A period at the very last index of
  `sentence` is never masked.

  Args:
    sentence: Text to scan.
    period_idxs: Indices of the "." characters in `sentence` to inspect.

  Returns:
    The value of `exchange_chars(sentence, <indices to mask>)`.
  """
  last_idx = len(sentence) - 1

  def preceded_by(idx, text):
    # A negative slice start would be read as "count from the end of the
    # string", so only compare when `text` fits in front of `idx`.
    start = idx - len(text)
    return start >= 0 and sentence[start:idx] == text

  def is_abbreviation_period(idx):
    upcoming = sentence[idx:idx + 3]
    return (
      (preceded_by(idx, " i") and upcoming == ".e.")      # first period of " i.e."
      or preceded_by(idx, "i.e")                          # second period of "i.e."
      or (preceded_by(idx, " e") and upcoming == ".g.")   # first period of " e.g."
      or preceded_by(idx, "e.g")                          # second period of "e.g."
      or preceded_by(idx, "et al")
      or preceded_by(idx, "Prof")
    )

  def is_decimal_point(idx):
    # idx > 0 keeps sentence[idx - 1] from wrapping round to the last
    # character when the period is the first character of the text.
    return idx > 0 and sentence[idx - 1].isnumeric() and sentence[idx + 1].isnumeric()

  periods_to_exchange = [
    period for period in period_idxs
    if period != last_idx
    and (is_abbreviation_period(period) or is_decimal_point(period))
  ]

  return exchange_chars(sentence, periods_to_exchange)

In [16]:
def get_pdf_text(pdf_path, timeout=60):
    """Download a PDF, extract its text and split the text into sentences.

    The download is stored temporarily under ``pdfs/`` and is always removed
    again, even when writing or parsing fails.

    Args:
        pdf_path: URL of the PDF. Its last path segment is used as the name
            of the temporary file.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the notebook indefinitely.

    Returns:
        A tuple ``(pdf_text, final_sentences)``: the text of all pages joined
        by spaces with newlines replaced by spaces, and the pieces obtained
        by splitting that text on ``.``, ``!`` and ``?``. Periods that
        ``check_whether_period`` masks are kept inside their piece.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    os.makedirs("pdfs", exist_ok=True)

    # Fall back to a fixed name when the URL ends with "/" (empty last segment).
    filename = os.path.join("pdfs", pdf_path.split("/")[-1] or "download.pdf")

    response = requests.get(pdf_path, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()

    try:
        with open(filename, "wb") as f:
            f.write(response.content)

        # The reader needs the file to stay open while pages are extracted.
        with open(filename, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` guards against a page whose extraction yields no string.
            total_text_list = [page.extract_text() or "" for page in pdf_reader.pages]
    finally:
        if os.path.exists(filename):
            os.remove(filename)

    pdf_text = " ".join(total_text_list).replace("\n", " ")

    # Mask periods that do not end a sentence, split, then restore them.
    modified_sentence = check_whether_period(pdf_text, find_chars_idxs(pdf_text, "."))
    pdf_sentences = re.split('[.!?]', modified_sentence)
    final_sentences = [mod_sentence.replace("[NOT_END_OF_SENTENCE]", ".") for mod_sentence in pdf_sentences]

    return pdf_text, final_sentences

### Test the above code with one example

In [17]:
text, sentences = get_pdf_text("http://arxiv.org/pdf/1412.6980")
# Preview only the first few sentences; echoing the whole list floods the cell output.
sentences[:10]

['Published as a conference paper at ICLR 2015 ADAM : A M ETHOD FOR STOCHASTIC OPTIMIZATION Diederik P',
 ' Kingma* University of Amsterdam, OpenAI dpkingma@openai',
 'comJimmy Lei Ba∗ University of Toronto jimmy@psi',
 'utoronto',
 'ca ABSTRACT We introduce Adam , an algorithm for ﬁrst-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order mo- ments',
 ' The method is straightforward to implement, is computationally efﬁcient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters',
 ' The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients',
 ' The hyper-parameters have intuitive interpre- tations and typically require little tuning',
 ' Some connections to related algorithms, on which Adam was inspired, are discussed',
 ' We also analyze the theoretical con- ve

# Edit sentence list further

In [2]:
def _english_vocabulary():
  """Return the NLTK `words` corpus as a set, built on first use and then reused."""
  vocabulary = getattr(_english_vocabulary, "_cache", None)
  if vocabulary is None:
    # Loading the corpus and scanning it as a list on every lookup is very
    # slow; a set built once makes each membership test cheap.
    vocabulary = set(words.words())
    _english_vocabulary._cache = vocabulary
  return vocabulary


def check_sentence_for_sticked_words(sentence):
  """Split tokens that look like two dictionary words stuck together.

  The sentence is split on single spaces. Each token is lower-cased and
  lemmatized; if the result is not in the NLTK word list, every cut position
  is tried and the token is replaced by "<left> <right>" when both halves
  (after lemmatizing) are in the word list. Tokens that are known words are
  left exactly as written.

  Limitation: only recognises two words stuck together, not more. A replaced
  token is built from the lower-cased, lemmatized form of the original.

  Args:
    sentence: Text to clean.

  Returns:
    The tokens joined by single spaces again.
  """
  vocabulary = _english_vocabulary()
  split_sentence = sentence.split(" ")

  for idx, token in enumerate(split_sentence):
    word = lemmatizer.lemmatize(token.lower())
    if word in vocabulary:
      continue  # known word: keep the token untouched

    for cut in range(len(word)):
      word1, word2 = word[:cut], word[cut:]
      if (lemmatizer.lemmatize(word1) in vocabulary
          and lemmatizer.lemmatize(word2) in vocabulary):
        # No break on purpose: the last valid cut position is the one kept.
        split_sentence[idx] = f"{word1} {word2}"

  return " ".join(split_sentence)

In [5]:
# Parallel accumulators: position k of each list describes the same sentence.
sentence_list, metadata_list, id_list = [], [], []

In [6]:
def enter_sentences_json(paper, uid, idx, verbose=True):
    """Extract the sentences of one paper and append them to the global lists.

    The paper's PDF is fetched with `get_pdf_text`, every non-blank sentence
    is cleaned with `check_sentence_for_sticked_words`, and the results are
    appended to `sentence_list`, `metadata_list` and `id_list`. The lists are
    only extended once the whole paper has been processed, so a failure
    never leaves a partially stored paper behind.

    Args:
        paper: Record with the keys "url", "title", "authors", "year" and
            "journal".
        uid: Identifier of the paper; sentence ids are "<uid>_<n>" with n
            counting the stored sentences from 0.
        idx: Position of the paper in the run, used only in progress output.
        verbose: When True, print one progress line per stored sentence.

    Returns:
        None. On failure a message with the reason is printed and nothing is
        appended.
    """
    try:
        # read in pdf text and split into sentences
        _, sentences = get_pdf_text(paper["url"])

        # Identical for every sentence of the paper, so build it once.
        paper_metadata = {
            "title": paper["title"],
            "authors": json.dumps(paper["authors"]),
            "publication_year": paper["year"],
            "journal_name": paper["journal"],
            "place": "MISSING",
        }

        new_sentences = []
        for sentence in sentences:
            if len(sentence) > 0 and not sentence.isspace():
                new_sentences.append(check_sentence_for_sticked_words(sentence))
                if verbose:
                    print(f"Paper: {idx} | ID sentence: {len(new_sentences) - 1}")
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
        title = paper.get("title", "<unknown>") if isinstance(paper, dict) else "<unknown>"
        print(f"Paper: {title} not readable ({type(exc).__name__}: {exc})")
        return

    sentence_list.extend(new_sentences)
    # Each sentence gets its own copy so later edits to one entry stay local.
    metadata_list.extend(dict(paper_metadata) for _ in new_sentences)
    id_list.extend(f"{uid}_{n}" for n in range(len(new_sentences)))

In [7]:
# Read the paper records. The encoding is given explicitly because open()
# otherwise uses the platform default, which is not UTF-8 on every system.
with open("papers.json", "r", encoding="utf-8") as f:
    paper_data = json.load(f)

len_papers = len(paper_data)
len_papers

1145

In [9]:
N_PAPERS = 30  # how many papers from the start of paper_data to process in this run
UID_ALPHABET = string.ascii_uppercase + string.digits

papers_to_process = paper_data[:N_PAPERS]
for idx, paper in enumerate(papers_to_process):
    # generate a random 6-character id per paper
    uid = ''.join(secrets.choice(UID_ALPHABET) for _ in range(6))

    # save sentences
    enter_sentences_json(paper, uid, idx)
    # report progress against the number of papers actually processed
    print(f"{idx} / {len(papers_to_process)}")

Paper: 0 | ID sentence: 0
Paper: 0 | ID sentence: 1
Paper: 0 | ID sentence: 2
Paper: 0 | ID sentence: 3
Paper: 0 | ID sentence: 4
Paper: 0 | ID sentence: 5
Paper: 0 | ID sentence: 6
Paper: 0 | ID sentence: 7
Paper: 0 | ID sentence: 8
Paper: 0 | ID sentence: 9
Paper: 0 | ID sentence: 10
Paper: 0 | ID sentence: 11
Paper: 0 | ID sentence: 12
Paper: 0 | ID sentence: 13
Paper: 0 | ID sentence: 14
Paper: 0 | ID sentence: 15
Paper: 0 | ID sentence: 16
Paper: 0 | ID sentence: 17
Paper: 0 | ID sentence: 18
Paper: 0 | ID sentence: 19
Paper: 0 | ID sentence: 20
Paper: 0 | ID sentence: 21
Paper: 0 | ID sentence: 22
Paper: 0 | ID sentence: 23
Paper: 0 | ID sentence: 24
Paper: 0 | ID sentence: 25
Paper: 0 | ID sentence: 26
Paper: 0 | ID sentence: 27
Paper: 0 | ID sentence: 28
Paper: 0 | ID sentence: 29
Paper: 0 | ID sentence: 30
Paper: 0 | ID sentence: 31
Paper: 0 | ID sentence: 32
Paper: 0 | ID sentence: 33
Paper: 0 | ID sentence: 34
Paper: 0 | ID sentence: 35
Paper: 0 | ID sentence: 36
Paper: 0 | 

In [10]:
# The three lists are filled in lockstep, so the counts should be identical.
list_lengths = {
    "Length of IDs": len(id_list),
    "Length of sentences": len(sentence_list),
    "Length of metadatas": len(metadata_list),
}
for description, length in list_lengths.items():
    print(description, length)

Length of IDs 17163
Length of sentences 17163
Length of metadatas 17163


### Save the lists as JSON files

In [11]:
# Write each list to its own JSON file (2-space indentation).
json_exports = {
    "sentences.json": sentence_list,
    "metadatas.json": metadata_list,
    "sentence_ids.json": id_list,
}
for export_path, payload in json_exports.items():
    with open(export_path, "w") as f:
        json.dump(payload, f, indent=2)

# Create Collection & query it

In [None]:
# Reuse the "paper_sentences" collection when it already exists, otherwise create it.
paper_sentences_col = client.get_or_create_collection(name="paper_sentences")

In [None]:
# Store every sentence with its metadata and id; the three lists are aligned by position.
paper_sentences_col.add(
    documents=sentence_list,
    metadatas=metadata_list,
    ids=id_list,
)

## Query collection

In [None]:
# Ask the collection for the 4 best matches for the query text.
query_text = "impact of globalization"
results = paper_sentences_col.query(query_texts=query_text, n_results=4)

In [None]:
# Ids have the form "<paper uid>_<sentence number>"; work from the 4th hit of the query.
hit_id_parts = results["ids"][0][3].split("_")
paper_uid = hit_id_parts[0]
found_idx = int(hit_id_parts[1])

# Context window: one sentence before the hit, the hit itself, five after it.
next_idxs = [found_idx + offset for offset in range(1, 6)]
# Sentence numbers start at 0, so drop the predecessor when the hit is the first sentence.
before_idx = [i for i in (found_idx - 1, found_idx) if i >= 0]
query_idx_list = before_idx + next_idxs

queryable_idxs = [f"{paper_uid}_{i}" for i in query_idx_list]
queryable_idxs

In [None]:
# Fetch the stored entries for the context ids and show only their text.
context_records = paper_sentences_col.get(ids=queryable_idxs)
context_records['documents']

In [None]:
# Stitch the context sentences back together, restoring the period that the split removed.
context_sentences = paper_sentences_col.get(ids=queryable_idxs)['documents']
paragraph = ". ".join(context_sentences)
paragraph

In [None]:
# Collapse runs of spaces into a single space. Replacing them with an empty
# string would glue the neighbouring words (and sentences) together.
paragraph = re.sub(r" {2,}", " ", paragraph)
paragraph