In [18]:
import os
import spacy
import requests
import pandas as pd

cwd = os.getcwd()

In [19]:
# load spacy language model
nlp = spacy.load('en_core_web_lg')

In [3]:
# load publicly available Jeopardy dataset
jeopardy = pd.read_csv(cwd+'/data/jeopardy_questions.csv')
jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams
...,...,...,...,...,...,...,...
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,$2000,This Puccini opera turns on the solution to 3 ...,Turandot
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",$2000,In North America this term is properly applied...,a titmouse
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,$2000,"In Penny Lane, where this ""Hellraiser"" grew up...",Clive Barker
216928,4999,2006-05-11,Double Jeopardy!,QUOTATIONS,$2000,"From Ft. Sill, Okla. he made the plea, Arizona...",Geronimo


In [4]:
jeopardy.columns = map(lambda x: x.lower().strip(), jeopardy.columns)
jeopardy = jeopardy[0:1000] # reduce data size for this example

In [6]:
# Apply language model to each row
jeopardy["question_tokens"] = jeopardy["question"].apply(lambda x: nlp(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jeopardy["question_tokens"] = jeopardy["question"].apply(lambda x: nlp(x))


In [7]:
# Sanity check: view first question
example_question = jeopardy.question[0]
example_question_tokens = jeopardy.question_tokens[0]
print("The first questions is:")
print(example_question)

The first questions is:
For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory


In [8]:
# Print individual tokens of first question
print("The tokens from the first question are:")
for token in example_question_tokens:
    print(token)

The tokens from the first question are:
For
the
last
8
years
of
his
life
,
Galileo
was
under
house
arrest
for
espousing
this
man
's
theory


In [9]:
# Part-of-speech tags for tokens in the first question
print("Here are the Part-of-speech tags for each token in the first question:")
for token in example_question_tokens:
    print(token.text, token.pos_, spacy.explain(token.pos_))

Here are the Part-of-speech tags for each token in the first question:
For ADP adposition
the DET determiner
last ADJ adjective
8 NUM numeral
years NOUN noun
of ADP adposition
his PRON pronoun
life NOUN noun
, PUNCT punctuation
Galileo PROPN proper noun
was AUX auxiliary
under ADP adposition
house NOUN noun
arrest NOUN noun
for ADP adposition
espousing VERB verb
this DET determiner
man NOUN noun
's PART particle
theory NOUN noun


In [11]:
# Dependency Parsing tags for tokens in the first question
for token in example_question_tokens:
    print(token.text, token.dep_, spacy.explain(token.dep_))

For prep prepositional modifier
the det determiner
last amod adjectival modifier
8 nummod numeric modifier
years pobj object of preposition
of prep prepositional modifier
his poss possession modifier
life pobj object of preposition
, punct punctuation
Galileo nsubj nominal subject
was ROOT root
under prep prepositional modifier
house compound compound
arrest pobj object of preposition
for prep prepositional modifier
espousing pcomp complement of preposition
this det determiner
man poss possession modifier
's case case marking
theory dobj direct object


In [12]:
# Visualize the dependency parsing tags for first question
# For, years, of, life, was, under, arrest, espousing, man, theory important
# 'years' most important token by number of prepositional phrases mapping to these two
spacy.displacy.render(example_question_tokens, style='dep',
                jupyter=True, options={'distance': 120})

In [13]:
# Chunking for first jeopardy sentence
for chunk in nlp(example_question_tokens).noun_chunks:
      print(chunk.text)

the last 8 years
his life
Galileo
house arrest
this man's theory


In [14]:
lemmatization_df = pd.DataFrame(data=[], \
  columns=["original","lemmatized"])

for i, token in enumerate(example_question_tokens):
    lemmatization_df.loc[i,"original"] = token.text
    lemmatization_df.loc[i,"lemmatized"] = token.lemma_

lemmatization_df

Unnamed: 0,original,lemmatized
0,For,for
1,the,the
2,last,last
3,8,8
4,years,year
5,of,of
6,his,his
7,life,life
8,",",","
9,Galileo,Galileo


In [16]:
# NER results - print the entity's text, start pos, end pos, NER label
# Galileo is a PRODUCT??
print("Text | Start | End | Label")
doc = nlp(example_question_tokens)
for token in doc.ents:
    print(token.text, '|', token.start_char, '|', token.end_char, '|', token.label_)

Text | Start | End | Label
the last 8 years | 4 | 20 | DATE
Galileo | 34 | 41 | PRODUCT


In [17]:
# Visualize NER results
spacy.displacy.render(doc, style='ent', jupyter=True, options={'distance': 120})

In [22]:
# Named Entity Linking (NEL) resolves a textual entity to a unique identifier in a knowledge base.
# Thinking of it as semantic search.
# using Google Knowledge Graph API
def named_entity_linking(query, entityType):
    if entityType=="PERSON":
        google = f"https://enterpriseknowledgegraph.googleapis.com/v1/projects/\
        nel_project/locations/nel_project_location/cloudKnowledgeGraphEntities:Search?query={query}&limit=LIMIT"
        resp = requests.get(google)
        print(resp)
        url = resp.json()['itemListElement'][0]['result']\
         ['detailedDescription']['url']
        description = resp.json()['itemListElement'][0]['result']\
         ['detailedDescription']['articleBody']
        return url, description
    else:
        return "no_match", "no_match"
    
example_sentence = "George Washington was an American political leader, \
military general, statesman, and Founding Father who served as the \
first president of the United States from 1789 to 1797.\n"

doc = nlp(example_sentence)

# Wikipedia descriptions and URLs for entities
for token in doc.ents:
    url, description = named_entity_linking(token.text, token.label_)
    print(token.text, token.label_, url, description)

<Response [401]>


KeyError: 'itemListElement'