## Extract type facts from a Wikipedia file


### === Purpose ===

The goal of this lab is to write an algorithm that extracts the type of an entity by using Wikipedia articles.  
The algorithm's input contains both the title and the content of the Wikipedia article. The title names the entity whose type we want to extract.

For example, we want to extract the type of the entity Leicester from its corresponding Wikipedia article. The input is:

    title: Leicester

    content: Leicester is a small city in England
    
and the goal is to return:

    Leicester TAB city


### === Provided Data ===

We provide:

1. a preprocessed version of the Simple Wikipedia (`wikipedia-first.txt`), in which each article is a title line followed by its content, as in the example above
2. a template for your code, `type_extraction.py`
3. a gold standard sample (`gold-standard-sample.tsv`).

### === Task ===

Complete the `extract_type()` function so that it extracts the type of the article entity from the content.
For example, for a content of "Leicester is a beautiful English city in the UK", it should return "city".
Exclude terms that are too abstract ("member of...", "way of..."), and try to extract exactly the noun.
You can also skip articles (e.g. `return None`) if you are not sure or if the text does not contain any type.
In order to ensure a fair evaluation, do not use any non-standard Python libraries.

Input:

April  
April is the fourth month of the year with 30 days.

Output:
April TAB month


### === Development and Testing ===

We provide a certain number of gold samples for validating your model.
Finally, we calculate an F-beta score using the following equation:

`F1 = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)`

with `beta = 0.5`, which puts more weight on precision than on recall.


### === Submission ===

1. Take your code, any necessary resources to run the code, and the output of your code on the test dataset (no need to put the other datasets!)
2. ZIP these files in a file called `firstName_lastName.zip`
3. submit it here before the deadline announced during the lab:


https://www.dropbox.com/request/kNTeo7yPFd6teTwTi61y
### === Contact ===

If you have any additional questions, you can send an email to: zacchary.sadeddine@telecom-paris.fr


In [None]:
"""
Don't modify this code.
"""

import sys
import re

class Page:
    """
    Value object holding the title and the content of one wiki page.

    Two pages compare equal (and hash alike) when both their title and
    their content are equal.
    """
    __author__ = "Jonathan Lajus"

    def __init__(self, title, content):
        # On Python 2 the raw byte strings are decoded to unicode text;
        # on Python 3 the values are stored untouched.
        legacy_python = sys.version_info[0] < 3
        self.title = title.decode("utf-8") if legacy_python else title
        self.content = content.decode("utf-8") if legacy_python else content

    def __eq__(self, other):
        # Anything that is not a Page (or subclass of this page's class)
        # is never equal.
        if not isinstance(other, self.__class__):
            return False
        return self.title == other.title and self.content == other.content

    def __ne__(self, other):
        is_equal = self.__eq__(other)
        return not is_equal

    def __hash__(self):
        # Hash over the same fields that __eq__ compares.
        identity = (self.title, self.content)
        return hash(identity)

    def __str__(self):
        # Python 2 needs the title re-encoded before concatenating with a
        # byte-string literal.
        if sys.version_info[0] < 3:
            shown_title = self.title.encode("utf-8")
        else:
            shown_title = self.title
        return 'Wikipedia page: "' + shown_title + '"'

    def __repr__(self):
        return self.__str__()

    def _to_tuple(self):
        """Return the page as a ``(title, content)`` pair."""
        return self.title, self.content


class Parsy:
    """
    Parse a Wikipedia file and iterate over its pages as ``Page`` objects.

    The file is read as UTF-8 text. Each article is a title line followed
    by zero or more content lines; articles are separated by one or more
    blank lines. Content lines are stripped and joined with single spaces.
    """
    __author__ = "Jonathan Lajus"

    def __init__(self, wikipediaFile):
        """
        :param wikipediaFile: path of the preprocessed Wikipedia text file
        """
        self.file = wikipediaFile

    def __iter__(self):
        """
        Yield one ``Page`` per article found in the file.

        :return: a generator of ``Page`` objects, in file order
        """
        title, content_lines = None, []
        with open(self.file, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line closes the article in progress, if any.
                    # Extra blank lines between articles are plain
                    # separators and must not start a page with an empty
                    # title.
                    if title is not None:
                        yield Page(title, " ".join(content_lines))
                        title, content_lines = None, []
                elif title is None:
                    # First non-blank line of an article is its title.
                    title = line
                else:
                    content_lines.append(line)
        # The last article is still pending when the file does not end
        # with a blank line; emit it instead of silently dropping it.
        if title is not None:
            yield Page(title, " ".join(content_lines))


def _read_type_file(path, bad_line_prefix, bad_line_suffix):
    """
    Load a ``title TAB type`` file into a dict mapping title -> type.

    The type is lower-cased and stripped of surrounding whitespace, so the
    line terminator (present on every line except possibly the last one)
    never takes part in the comparison. Lines without exactly one tab are
    reported and skipped; a repeated title is reported and the later line
    wins.

    :param path: path of the tab-separated file (read as UTF-8)
    :param bad_line_prefix: text printed before a malformed line
    :param bad_line_suffix: text printed after a malformed line
    :return: dict of title -> normalized type
    """
    types = dict()
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            temp = line.split("\t")
            if len(temp) != 2:
                print(bad_line_prefix, line, bad_line_suffix)
            else:
                if temp[0] in types:
                    print(temp[0], " has two solutions")
                types[temp[0]] = temp[1].strip().lower()
    return types


def eval_f1(gold_file, pred_file):
    """
    Compare predicted types against the gold standard and print the scores.

    Prints every wrong or missing answer, then precision, recall and the
    F-beta score with ``beta = 0.5`` (all as percentages). Nothing is
    returned.

    :param gold_file: path of the gold standard ``title TAB type`` file
    :param pred_file: path of the predictions ``title TAB type`` file
    """
    goldstandard = _read_type_file(
        gold_file, "The line:", "has an incorrect number of tabs")
    student = _read_type_file(
        pred_file, "The line: '", "' has an incorrect number of tabs")

    true_pos = 0
    false_pos = 0
    false_neg = 0

    # Predictions for titles that are absent from the gold standard are
    # ignored: they count neither for nor against the score.
    for key in student:
        if key in goldstandard:
            if student[key] == goldstandard[key]:
                true_pos += 1
            else:
                false_pos += 1
                print("You got", key, "wrong. Expected output: ", goldstandard[key], ",given:", student[key])

    for key in goldstandard:
        if key not in student:
            false_neg += 1
            print("No solution was given for", key)

    if true_pos + false_pos != 0:
        precision = float(true_pos) / (true_pos + false_pos) * 100.0
    else:
        precision = 0.0

    # The recall denominator includes false_pos: true_pos + false_neg +
    # false_pos is the number of gold entries, whether they were answered
    # correctly, answered wrongly, or not answered at all.
    if true_pos + false_neg != 0:
        recall = float(true_pos) / (true_pos + false_neg + false_pos) * 100.0
    else:
        recall = 0.0

    beta = 0.5

    if precision + recall != 0.0:
        f05 = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
    else:
        f05 = 0.0

    grade = f05

    print("Comment :=>>", "Precision:", precision, "%")
    print("Comment :=>>", "Recall:", recall, "%")
    print("Simulated Grade (F0.5) :=>>", grade, "%")


In [None]:
# Input: the simplified wiki page document.
wiki_file = "wikipedia-first.txt"
# Validation data: a handful of gold samples.
gold_file = "gold-standard-sample.tsv"
# Output: the predictions generated by the model.
# This is the file that has to be submitted.
result_file = "results.tsv"

In [None]:
# Stop-word vocabularies. Each list is written as whitespace-separated text
# and split into words; repeated entries are harmless because the lists are
# merged into a set below.
common_adverbs = (
    "very really just quite so too also well up only even still always "
    "never ever almost often usually sometimes frequently generally particularly "
    "especially instead actually probably obviously clearly essentially extremely "
    "exactly certainly definitely absolutely literally completely totally partially "
    "mostly largely greatly mainly nearly barely hardly merely simply relatively "
    "specifically naturally apparently effectively fundamentally adequately strongly "
    "significantly substantially primarily originally previously initially finally "
    "recently currently immediately directly eventually ultimately"
).split()

common_pronouns = (
    "I you he she it we they me him her us them my your his its "
    "our their this these those"
).split()

common_adjectives = (
    "good new first last long great little own other old right big high "
    "small large next early young important few public bad same able different "
    "local social general various current national individual special certain main "
    "major available known better low human early necessary large significant "
    "similar common particular full likely clear possible real popular hard "
    "economic single original entire recent personal open specific several strong "
    "simple final various standard necessary close serious difficult known physical "
    "successful available independent traditional professional related required federal "
    "financial legal international major primary common individual national particular"
).split()

# Union of the three vocabularies: every word to strip from article text.
words_to_remove = {*common_adverbs, *common_pronouns, *common_adjectives}

In [None]:
def clean_content(content):
    """
    Delete every word listed in ``words_to_remove`` from the given text.

    Matching is case-insensitive and restricted to whole words. Only the
    word itself is removed; the whitespace around it stays in place.

    :param content: the text to filter
    :return: the text without the listed words
    """
    alternatives = '|'.join(map(re.escape, words_to_remove))
    stop_word_pattern = re.compile(rf'\b({alternatives})\b', re.IGNORECASE)
    return stop_word_pattern.sub('', content)

In [None]:
# Patterns applied to the stop-word-filtered text that follows "<title> ".
# They do not depend on the title, so they are compiled once rather than
# once per page.

# "<anything> <word> that ..." -> the word right before "that" (group 2).
_TYPE_BEFORE_THAT = re.compile(r"(.*?) ([a-zA-Z]+) that")

# Fallback: the first run of at least four lower-case letters that follows a
# space (group 3).
# NOTE(review): because of alternation precedence, the alternatives listed
# before the last one ("<copula> first", "second", "hard", "[a-z]+al",
# "[a-z]+est") match without setting group 3, so text starting with one of
# them produces no type at all. Kept as is to preserve existing results.
_TYPE_AFTER_TITLE = re.compile(
    r"((is a|is an|is the|are the|are|was a|was the|was an|were|were the|are a|were an|were a|are an) first"
    r"|second|hard|[a-z]+al|[a-z]+est|.*? ([a-z][a-z][a-z][a-z]+))")


def extract_type(wiki_page):
    """
    Extract the type of the entity described by a wiki page.

    The title is located in the raw content and only the text after it is
    filtered with ``clean_content``. Filtering the whole content would also
    strip stop words out of the title occurrence itself (e.g. "New York"),
    after which the title could never be found.

    :param wiki_page: an object with a ``title`` and a ``content`` attribute
        (the content being the first sentence of the wiki page)
    :return: the extracted type as a string, or None when the title followed
        by a space does not occur in the content or no pattern yields a type
    """
    title = wiki_page.title
    content = wiki_page.content

    # Both patterns can absorb arbitrary text after the title, so the first
    # "<title> " occurrence decides the outcome.
    title_start = content.find(title + " ")
    if title_start < 0:
        return None
    tail = clean_content(content[title_start + len(title) + 1:])

    # Prefer the noun directly in front of "that".
    m = _TYPE_BEFORE_THAT.match(tail)
    if m:
        return m.group(2)

    m = _TYPE_AFTER_TITLE.match(tail)
    if m:
        # None when one of the non-capturing alternatives matched.
        return m.group(3)
    return None

In [None]:
def run():
    """
    Run the whole pipeline.

    Extract a type for every page of the wiki file and write one
    ``title TAB type`` line per page that produced a type, then evaluate
    the written results against the gold samples.

    :return:
    """
    with open(result_file, 'w', encoding="utf-8") as output:
        for page in Parsy(wiki_file):
            typ = extract_type(page)
            if not typ:
                # Pages without an extracted type are left out of the results.
                continue
            output.write("\t".join((page.title, typ)) + "\n")

    # Score the freshly written predictions on the gold samples.
    eval_f1(gold_file, result_file)


# Execute the pipeline: write the predictions file, then print the scores.
run()