<h2>Imports</h2>

In [17]:
# imports go here
import db
import inflect
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [23]:
# Fetch the "punkt" tokenizer data (presumably what nltk.word_tokenize in the
# Preprocessing section relies on).
# NOTE(review): Preprocess_Document also uses stopwords.words("english"),
# WordNetLemmatizer and nltk.pos_tag, but no download for those resources is
# visible in this notebook - confirm they are installed or fetched elsewhere.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\windows\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

<h2>Data Retrieval</h2>

In [18]:
# input : DatabaseHandler
# output : DataFrame
def GetDF(dh:db.DatabaseHandler, selector: str, eventID: int, splitBySentences: bool = False):
    """Fetch the joined record data for one ID, optionally one sentence per row.

    Parameters
    ----------
    dh : db.DatabaseHandler
        Handler whose ``get_recordDataJoinedDF`` method supplies the frame.
    selector : str
        Forwarded unchanged as ``selector=`` to the handler
        (the commented example below passes "event_id").
    eventID : int
        Forwarded unchanged as ``ID=`` to the handler.
    splitBySentences : bool, default False
        When True, every ``answer`` is split on '.' and each non-empty
        fragment becomes its own row; the other columns are repeated.

    Returns
    -------
    DataFrame
        The handler's frame as-is when ``splitBySentences`` is False.
        Otherwise a new frame with one row per sentence. Rows are renumbered
        0..n-1 by the explode step and the empty fragments are dropped
        afterwards, so the resulting index can contain gaps.
    """
    df = dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
    if not splitBySentences:
        return df

    # assign()/explode() build new frames, so the frame handed back by the
    # handler is never modified in place.
    sentences = df.assign(answer=df["answer"].str.split(".")).explode(
        "answer", ignore_index=True
    )

    # Splitting on '.' leaves the blank that followed each period glued to the
    # next sentence, and turns a trailing ". " into a whitespace-only fragment.
    # Strip first so both cases are handled by the empty-string filter below.
    sentences["answer"] = sentences["answer"].str.strip()

    # Missing answers (NaN) compare unequal to "" and are therefore kept.
    empty_rows = sentences.index[sentences["answer"] == ""]
    return sentences.drop(index=empty_rows)


# dh = db.DatabaseHandler("testdb.db")  # db connection
# df = GetDF(dh, "event_id", 19, True)
# df


Unnamed: 0,id,event_title,speaker,question,answer
0,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A mistake isn't necessarily evil
1,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A sin is surely evil
3,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A sin is wicked in the eyes of god
4,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A mistake isn't always wicked in the eyes of God
6,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Human error is a sin
7,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Everything improper is sin
9,208,Of Sin and Death,PY,What is the difference between a mistake and a...,If a man defies a God then he sins
10,208,Of Sin and Death,PY,What is the difference between a mistake and a...,"A mistake can be done by accident, but a sin ..."
12,209,Of Sin and Death,LIV,What is the difference between a mistake and a...,Maybe a sin is a mistake done with evil intent
14,210,Of Sin and Death,TMS,What is the difference between a mistake and a...,Sin is commited


<h2>Preprocessing</h2>

In [24]:
# input : sentence/document (string); parameters
# output : a list of word tokens (list<string>)
def Preprocess_Document(doc:str, isLemma:bool=False, isStopWords:bool=False, isInflect:bool=False, isNumberFiltered:bool=True):
    """Convert a sentence/document into a list of normalised word tokens.

    Pipeline, in order: lowercase the text, delete every character of
    ``string.punctuation`` (plus the ASCII digits when ``isNumberFiltered``),
    tokenise with ``nltk.word_tokenize``, then for each token optionally
    spell out digits, lemmatise, and filter stopwords.

    Parameters
    ----------
    doc : str
        Text to process.
    isLemma : bool, default False
        Lemmatise each token with ``WordNetLemmatizer``, using the POS tag
        returned by ``nltk.pos_tag`` mapped through ``get_wordnet_pos``.
    isStopWords : bool, default False
        Drop tokens found in NLTK's English stopword list. Tokens made only
        of digits are dropped as well when this flag is on.
    isInflect : bool, default False
        Replace all-digit tokens with their spelled-out form
        (``inflect.engine().number_to_words``). With ``isNumberFiltered``
        left on, the ASCII digits are already stripped before tokenising,
        so there is normally nothing left to convert.
    isNumberFiltered : bool, default True
        Also delete the characters "1234567890" together with punctuation.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Build each helper only when its flag is on. In particular the stopword
    # corpus is no longer read on every call, so the default flags do not
    # require that corpus to be installed.
    inflector = inflect.engine() if isInflect else None
    lemmatizer = WordNetLemmatizer() if isLemma else None
    stopwordSet = set(stopwords.words("english")) if isStopWords else None

    # Characters that are deleted outright (not replaced by a space), so
    # "isn't" becomes "isnt".
    removed_chars = string.punctuation
    if isNumberFiltered:
        removed_chars += "1234567890"

    # Case fold, then delete the unwanted characters in a single pass.
    cleaned = doc.lower().translate(str.maketrans("", "", removed_chars))

    tokens = []
    for token in nltk.word_tokenize(cleaned):
        if isInflect and token.isdigit():
            token = inflector.number_to_words(token)

        if isLemma:
            # NOTE(review): each token is tagged as a one-word sentence, so the
            # tagger gets no surrounding context; kept to preserve the output.
            word, tag = nltk.pos_tag([token])[0]
            token = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))

        # The stopword test runs on the (possibly lemmatised) token.
        if isStopWords and (token in stopwordSet or token.isdigit()):
            continue

        tokens.append(token)
    return tokens

def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' gives the
    adjective constant, 'V' the verb constant, 'R' the adverb constant.
    Any other tag is treated as a noun.
    """
    prefix_table = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("R", wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Nothing matched: fall back to noun.
    return wordnet.NOUN
    

['a', 'mistake', 'isnt', 'necessarily', 'evil']
