Task 1

In [142]:
import requests
import numpy as np
import pandas as pd
import itertools
import string
from collections import Counter
from datetime import datetime
from urllib.parse import unquote
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

WIKIDATA_API_ENDPOINT = "https://www.wikidata.org/w/api.php"
WIKIPEDIA_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"

HEADERS = {"User-Agent": "uni_coursework/2.0; nat.1.roongjirarat@kcl.ac.uk"}

PARAMS_QUERY_SEARCH = {
    "action":"query",
    "format":"json",
    "formatversion":"latest",
    "list":"search",
    "srsearch": "haswbstatement:P166=Q185667",
    "srlimit":"max"
}

PARAMS_GETCONTENT = {
    "action": "query",
    "format": "json",
    "titles": "",
    "prop": "extracts",
    "exlimit": "max"
}

PARAMS_WBGETENTITIES_LABELS = {
    "action": "wbgetentities",
    "format": "json",
    "ids": "",
    "sites": "",
    "props": "labels",
    "languages": "en",
    "sitefilter": "enwiki",
    "utf8": 1,
    "ascii": 1,
    "formatversion": "latest"
}

PARAMS_WBGETENTITIES_SITES = {
    "action": "wbgetentities",
    "format": "json",
    "ids": "",
    "sites": "",
    "props": "sitelinks/urls",
    "languages": "en",
    "sitefilter": "enwiki",
    "utf8": 1,
    "ascii": 1,
    "formatversion": "latest"
}

PARAMS_WBGETENTITIES_CLAIMS = {
    "action": "wbgetentities",
    "format": "json",
    "ids": "",
    "props": "claims",
    "languages": "en",
    "sitefilter": "",
    "formatversion": "latest"
}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nat_rng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nat_rng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nat_rng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nat_rng/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def get_turing_award_recipients():
    acm_award_entities = []
    search_response = requests.get(WIKIDATA_API_ENDPOINT, headers=HEADERS, params=PARAMS_QUERY_SEARCH)
    data = search_response.json()
    for result in data['query']['search']:
        acm_award_entities.append(result['title'])
    return acm_award_entities

print(get_turing_award_recipients())

Task 2

In [None]:
def get_wikipedia_content(entity_id):
    PARAMS_WBGETENTITIES_SITES["ids"] = entity_id
    wbgetentities_response = requests.get(WIKIDATA_API_ENDPOINT, headers=HEADERS, params=PARAMS_WBGETENTITIES_SITES)
    wbgetentities_data = wbgetentities_response.json()
    recipient_name = wbgetentities_data["entities"][entity_id]["sitelinks"]["enwiki"]["url"].split("https://en.wikipedia.org/wiki/")[1]

    PARAMS_GETCONTENT["titles"] = unquote(recipient_name)
    extracts_response = requests.get(WIKIPEDIA_API_ENDPOINT, headers=HEADERS, params=PARAMS_GETCONTENT)
    extracts_data = extracts_response.json()
    html_content = next(iter(extracts_data["query"]["pages"].values()))["extract"]
    content = BeautifulSoup(html_content, 'html.parser')
    if content.find("p", {"class":"mw-empty-elt"}):
        content.find("p", {"class":"mw-empty-elt"}).decompose()
    return str(content)

print(get_wikipedia_content("Q7143512"))

Task 3

In [None]:
wiki_data_dict = {"gender": "P21", "birth_date": "P569", "birth_city": "P19", 
                  "birth_country": "P17", "employer": "P108", "educated_at": "P69"}

def get_wikidata_label(entity_id):
    PARAMS_WBGETENTITIES_LABELS["ids"] = entity_id
    wbgetentities_response = requests.get(WIKIDATA_API_ENDPOINT, headers=HEADERS, params=PARAMS_WBGETENTITIES_LABELS)
    wbgetentities_data = wbgetentities_response.json()
    labels = next(iter(wbgetentities_data["entities"].values()))["labels"]
    value = labels["en"]["value"]
    return value

def get_wikidata_claims(entity_id):
    PARAMS_WBGETENTITIES_CLAIMS["ids"] = entity_id
    wbgetentities_response = requests.get(WIKIDATA_API_ENDPOINT, headers=HEADERS, params=PARAMS_WBGETENTITIES_CLAIMS)
    wbgetentities_data = wbgetentities_response.json()
    claims = next(iter(wbgetentities_data["entities"].values()))["claims"]
    return claims

def get_dict_values(entity_id):
    claims = get_wikidata_claims(entity_id)
    try:
        name = get_wikidata_label(entity_id).split(" (")[0]
    except KeyError:
        name = None
    try:
        intro = get_wikipedia_content(entity_id).split("<h2>")[0]
    except KeyError:
        intro = None   
    try: 
        gender = get_wikidata_label(claims[wiki_data_dict["gender"]][0]["mainsnak"]["datavalue"]["value"]["id"])
    except KeyError:
        gender = None
    try:
        birth_date = datetime.strptime(claims[wiki_data_dict["birth_date"]][0]["mainsnak"]["datavalue"]["value"]["time"], "+%Y-%m-%dT%XZ").strftime("%d %B %Y")
    except ValueError:
        birth_date = datetime.strptime(claims[wiki_data_dict["birth_date"]][0]["mainsnak"]["datavalue"]["value"]["time"], "+%Y-00-00T%XZ").strftime("%Y")
    except KeyError:
        birth_date = None
    try:
        birth_city = get_wikidata_label(claims[wiki_data_dict["birth_city"]][0]["mainsnak"]["datavalue"]["value"]["id"])
        birth_city_claims = get_wikidata_claims(claims[wiki_data_dict["birth_city"]][0]["mainsnak"]["datavalue"]["value"]["id"])
        birth_country = get_wikidata_label(birth_city_claims[wiki_data_dict["birth_country"]][0]["mainsnak"]["datavalue"]["value"]["id"])
        birth_place = "{}, {}".format(birth_city, birth_country)
    except KeyError:
        birth_place = None
    try:
        employer_list = []
        for i in range(len(claims[wiki_data_dict["employer"]])):
            employer = get_wikidata_label(claims[wiki_data_dict["employer"]][i]["mainsnak"]["datavalue"]["value"]["id"])
            employer_list.append(employer)
        employer = employer_list
    except KeyError:
        employer = None
    try:
        education_list = []
        for i in range(len(claims[wiki_data_dict["educated_at"]])):
            education = get_wikidata_label(claims[wiki_data_dict["educated_at"]][i]["mainsnak"]["datavalue"]["value"]["id"])
            education_list.append(education)
        education = education_list
    except KeyError:
        education = None
    return name, intro, gender, birth_date, birth_place, employer, education


award_winners = {"name": [], "intro": [], "gender": [], "birth_date": [], 
                 "birth_place": [], "employer": [], "educated_at": []}
acm_award_winners = get_turing_award_recipients()
for entity_id in acm_award_winners:
    name, intro, gender, birth_date, birth_place, employer, education = get_dict_values(entity_id)
    award_winners["name"].append(name)
    award_winners["intro"].append(intro)
    award_winners["gender"].append(gender)
    award_winners["birth_date"].append(birth_date)
    award_winners["birth_place"].append(birth_place)
    award_winners["employer"].append(employer)
    award_winners["educated_at"].append(education)

Task 4

In [None]:
for name in sorted(award_winners["name"]):
    print(name)

Task 5

In [128]:
award_winners_intro = pd.DataFrame(columns=["winner_name", "count_words", "count_sentences", "count_paragraphs", "common_words"])
award_winners_intro["winner_name"] = award_winners["name"]

def get_intro_stats(intro): 
    intro_html = BeautifulSoup(intro, "html.parser")
    intro_text = intro_html.get_text(" ")
    count_words = sum([word.strip(string.punctuation).isalnum() for word in intro_text.split()])
    count_sentences = len(sent_tokenize(intro_text))
    count_paragraphs = len(intro_html.find_all("p"))
    word_freqdist= nltk.FreqDist([word for word in word_tokenize(intro_text) if word not in list(string.punctuation)]).most_common(10)
    common_words = [word[0] for word in word_freqdist]
    return count_words, count_sentences, count_paragraphs, common_words

intro_stats = {"word_count": [], "sentence_count": [], "paragraph_count": [], "common_words": []}
for intro in award_winners["intro"]:
    count_word, count_sentences, count_paragraphs, common_words = get_intro_stats(intro)
    intro_stats["word_count"].append(count_word)
    intro_stats["sentence_count"].append(count_sentences)
    intro_stats["paragraph_count"].append(count_paragraphs)
    intro_stats["common_words"].append(common_words)

award_winners_intro["count_words"] = intro_stats["word_count"]
award_winners_intro["count_sentences"] = intro_stats["sentence_count"]
award_winners_intro["count_paragraphs"] = intro_stats["paragraph_count"]
award_winners_intro["common_words"] = intro_stats["common_words"]

# award_winners_intro.head()

Task 5

In [139]:
def process_common_words(intro):
    intro_text = BeautifulSoup(intro, "html.parser").get_text(" ")
    word_filter = set(stopwords.words('english') + list(string.punctuation) + ["``", "''", "–"])
    tokenized_intro = word_tokenize(intro_text)
    word_freqdist = nltk.FreqDist([word for word in tokenized_intro if word not in word_filter]).most_common(10)
    common_words = [word[0] for word in word_freqdist]
    return common_words

common_words_after_preprocessing = [process_common_words(intro) for intro in award_winners["intro"]]
award_winners_intro["common_words_after_preprocessing"] = common_words_after_preprocessing

print(award_winners_intro.head(10))

       winner_name  count_words  count_sentences  count_paragraphs  \
0  Tim Berners-Lee          348               17                 4   
1    Yoshua Bengio           90                4                 2   
2  Geoffrey Hinton          176                8                 3   
3     Donald Knuth          182                8                 3   
4  Richard M. Karp           91                3                 2   
5    Robert Tarjan           59                3                 1   
6        Vint Cerf           62                2                 1   
7      Judea Pearl          154                5                 2   
8    Herbert Simon          175                7                 2   
9    Marvin Minsky           51                2                 2   

                                        common_words  \
0    [the, of, and, Web, He, a, is, as, World, Wide]   
1  [and, the, of, Bengio, for, is, a, work, deep,...   
2    [the, and, of, for, in, Hinton, a, his, to, is]   
3  [t

3.2 Sub Activity

Task 3

In [None]:
long_intro_text = " ".join([BeautifulSoup(intro, "html.parser").get_text(" ") for intro in award_winners["intro"]])

word_filter = set(stopwords.words('english') + list(string.punctuation) + ["``", "''", "–"])
tokenized_intro = word_tokenize(long_intro_text)
intro_words = [word for word in tokenized_intro if word not in word_filter]

print("Number of unique words before stemming with Porter Stemmer: {}".format(len(list(nltk.FreqDist(intro_words)))))

porter_stemmer = PorterStemmer()
intro_words = [porter_stemmer.stem(word) for word in intro_words]

print("Number of unique words after stemming with Porter Stemmer: {}".format(len(list(nltk.FreqDist(intro_words)))))

Task 4

In [None]:
long_intro_text = " ".join([BeautifulSoup(intro, "html.parser").get_text(" ") for intro in award_winners["intro"]])

word_filter = set(stopwords.words('english') + list(string.punctuation) + ["``", "''", "–"])
tokenized_intro = word_tokenize(long_intro_text)
intro_words = [word for word in tokenized_intro if word not in word_filter]

print("Number of unique words before stemming with Snowball Stemmer: {}".format(len(list(nltk.FreqDist(intro_words)))))

snow_stemmer = SnowballStemmer(language='english')
intro_words = [snow_stemmer.stem(word) for word in intro_words]

print("Number of unique words after stemming with Snowball Stemmer: {}".format(len(list(nltk.FreqDist(intro_words)))))

Task 5

In [111]:
long_intro_text = " ".join([BeautifulSoup(intro, "html.parser").get_text(" ") for intro in award_winners["intro"]])

word_filter = set(stopwords.words('english') + list(string.punctuation) + ["``", "''", "–"])
tokenized_intro = word_tokenize(long_intro_text)
intro_words = [word for word in tokenized_intro if word not in word_filter]

print("Number of unique words before lemmatization with Word Net Lemmatizer: {}".format(len(list(nltk.FreqDist(intro_words)))))

wordnet_lemmatizer = WordNetLemmatizer()
intro_words = [wordnet_lemmatizer.lemmatize(word) for word in intro_words]

print("Number of unique words after lemmatization with Word Net Lemmatizer: {}".format(len(list(nltk.FreqDist(intro_words)))))

Number of unique words before lemmatization with Word Net Lemmatizer: 1766
Number of unique words after lemmatization with Word Net Lemmatizer: 1715


Task 6

In [144]:
award_winners_intro = award_winners_intro.assign(synonyms = np.nan, antonyms = np.nan)
common_words_after_preprocessing = award_winners_intro["common_words_after_preprocessing"]

def get_synonyms_antonyms(common_words):
    synonyms = []
    antonyms = []
    for word in common_words:
        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                synonyms.append(l.name())
                if l.antonyms():
                    antonyms.append(l.antonyms()[0].name())
    return synonyms, antonyms

for common_words in common_words_after_preprocessing:
    synonyms, antonyms = get_synonyms_antonyms(common_words_after_preprocessing[i])
    award_winners_intro["synonyms"][i] = synonyms
    award_winners_intro["antonyms"][i] = antonyms

print(award_winners_intro.head(10))

       winner_name  count_words  count_sentences  count_paragraphs  \
0  Tim Berners-Lee          348               17                 4   
1    Yoshua Bengio           90                4                 2   
2  Geoffrey Hinton          176                8                 3   
3     Donald Knuth          182                8                 3   
4  Richard M. Karp           91                3                 2   
5    Robert Tarjan           59                3                 1   
6        Vint Cerf           62                2                 1   
7      Judea Pearl          154                5                 2   
8    Herbert Simon          175                7                 2   
9    Marvin Minsky           51                2                 2   

                                        common_words  \
0    [the, of, and, Web, He, a, is, as, World, Wide]   
1  [and, the, of, Bengio, for, is, a, work, deep,...   
2    [the, and, of, for, in, Hinton, a, his, to, is]   
3  [t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  award_winners_intro["synonyms"][i] = synonyms
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  award_winners_intro["antonyms"][i] = antonyms
