In [2]:
import spacy
import pytextrank
from googleapiclient.discovery import build
import re
import os

from spacy.tokens import Span
from dotenv import load_dotenv

### Fact Extraction

In [3]:
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    """Factory registered in spaCy's ``misc`` registry as ``plural_scrubber``.

    Returns:
        A callable that takes a ``Span`` and returns its lemma text
        (``span.lemma_``).
    """
    def lemma_of(span: Span) -> str:
        """Return the lemmatized text of ``span``."""
        return span.lemma_

    return lemma_of

# How many of the ranked phrases to take before de-duplication
TOP_N_PHRASES = 5

# Load summary input for fact extraction. The encoding is pinned so the read
# does not depend on the platform's default locale (assumes text.txt is UTF-8).
with open("text.txt", encoding="utf-8") as text:
    overall = text.read()

# Load a spaCy model
nlp = spacy.load("en_core_web_lg")

# Exclude stopwords that could be generated due to completion prompt
nlp.Defaults.stop_words |= {"transcript", "passage", "extract", "term", "video"}

# Add TopicRank component to pipeline: every stopword is excluded when tagged
# as a NOUN, and phrases are normalised by the registered "plural_scrubber"
nlp.add_pipe("topicrank", config={
                        "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
                        "scrubber": {"@misc": "plural_scrubber"}})

# Perform fact extraction on the overall summary
doc2 = nlp(overall)

# Collect the distinct texts of the first TOP_N_PHRASES ranked phrases.
# Duplicate texts collapse, so the set may hold fewer than TOP_N_PHRASES
# entries, and being a set it does not preserve the rank order.
phrases = {phrase.text for phrase in doc2._.phrases[:TOP_N_PHRASES]}

# for phrase in doc2._.phrases[:5]:
#     print(f"{phrase.text}, {phrase.rank}\n")


In [5]:
# Show the de-duplicated phrase texts produced by the extraction step
print(phrases)
    
# nlp.analyze_pipes(pretty=True)

# print([w.lemma_ for w in doc2])

{'man', 'machine', 'game', 'electronic computer'}


### Information Retrieval

In [6]:
# Query a Google Custom Search Engine for one wikipedia.org result per phrase
load_dotenv()  # take environment variables from .env
google_key = os.getenv("G_KEY")
google_cx = os.getenv("G_CX")

service = build("customsearch", "v1", developerKey=google_key)

results = []
for term in phrases:
    # make search through search engine api (single result, wikipedia.org only)
    data = service.cse().list(
        q=term,
        cx=google_cx,
        num=1,
        siteSearch="wikipedia.org",
        siteSearchFilter="i",
    ).execute()

    # A search with no matches has no usable "items" entry; skip the term
    # instead of failing with a KeyError/IndexError on data['items'][0].
    items = data.get("items")
    if not items:
        print(f"No search result for {term!r}; skipping")
        continue

    top_hit = items[0]
    link = top_hit['link']

    # add resultant page title, link and icon to dict; the icon URL is the
    # link's first three "/"-separated parts (scheme and host) + /favicon.ico
    resources = {
        'title': top_hit['title'],
        'url': link,
        'icon': "/".join(link.split("/")[:3]) + "/favicon.ico",
    }
    results.append(resources)

print(results)

NameError: name 'os' is not defined