<h2>IMPORTS</h2>

In [79]:
# imports go here
import numpy as np
import db
import inflect
import string
import nltk
import gensim
import gensim.downloader as api
from nltk.test.gensim_fixt import setup_module
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer

<h2> Init (don't forget to do this when making the class, and only once.)

In [75]:
setup_module()
model = api.load("word2vec-google-news-300")

# see here : https://radimrehurek.com/gensim/downloader.html for saving.

<h2>Data Retrieval</h2>

In [26]:
# input : DatabaseHandler
# output : DataFrame
def GetDF(dh:db.DatabaseHandler, selector: str, eventID: int, splitBySentences: bool = False):
    df = dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
    if splitBySentences:
        # df.set_index('id', inplace=True)
        df['answer'] = df['answer'].str.split('.')
        df = df.explode("answer", True)
        df.drop(df[df["answer"] == ""].index, inplace=True)
        df.reset_index(drop=True,inplace=True)

    return df


dh = db.DatabaseHandler("testdb.db")  # db connection
df = GetDF(dh, "event_id", 19, True)
df


Unnamed: 0,id,event_title,speaker,question,answer
0,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A mistake isn't necessarily evil
1,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A sin is surely evil
2,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A sin is wicked in the eyes of god
3,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A mistake isn't always wicked in the eyes of God
4,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Human error is a sin
5,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Everything improper is sin
6,208,Of Sin and Death,PY,What is the difference between a mistake and a...,If a man defies a God then he sins
7,208,Of Sin and Death,PY,What is the difference between a mistake and a...,"A mistake can be done by accident, but a sin ..."
8,209,Of Sin and Death,LIV,What is the difference between a mistake and a...,Maybe a sin is a mistake done with evil intent
9,210,Of Sin and Death,TMS,What is the difference between a mistake and a...,Sin is commited


<h2>Preprocessing</h2>

In [43]:
# input : sentence/document (string); parameters
# output : a list of word tokens (list<string>)
def PreprocessDocument(doc:str, isLemma:bool=False, isStopWords:bool=False, isInflect:bool=False, isNumberFiltered:bool=True):
    inflector = inflect.engine()
    stopwordSet = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    punctuations = string.punctuation
    # if numbers are filtered, add that to the punctuation string
    if isNumberFiltered:
        punctuations += "1234567890"

    # case fold
    doc = doc.lower()

    # remove puncs
    doc = "".join([char for char in doc if char not in punctuations])

    # tokenize it.
    token_list = nltk.word_tokenize(doc)

    for i in range(len(token_list)):
        # if inflect
        if isInflect:
            if token_list[i].isdigit():
                token_list[i] = inflector.number_to_words(token_list[i])

        # if lemma
        if isLemma:
            tagged_word = nltk.pos_tag([token_list[i]])
            wordnet_pos = getWordnetPos(tagged_word[0][1])
            token_list[i] = lemmatizer.lemmatize(tagged_word[0][0], pos=wordnet_pos)
        
        # if stopword
        if isStopWords:
            if token_list[i] in stopwordSet or token_list[i].isdigit():
                token_list[i] = "#" # mark as #
        
    # remove the marked strings
    token_list = [token for token in token_list if token != "#"]
    return token_list

def getWordnetPos(tag):
    """Map POS tag to WordNet POS tag"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # solves as noun by default.
    

In [44]:
mySentence = df.loc[47]["answer"]
myTokenizedSentence = PreprocessDocument(mySentence, isStopWords=True, isLemma=True)

myTokenizedSentence

['heard',
 'jesus',
 'already',
 'accepted',
 'nonetheless',
 'commit',
 'suicide',
 'sinner',
 'still']

<h2>Get word set, whatever for

In [48]:
# input : list<str>
# output : set<str> --> no duplicates, 
# def GetWordSet(df:db.pd.DataFrame):
#     bigtext = ""

    
    

<h2> TF-IDF function

In [83]:
# input : list<str>
# output : Dataframe, matrix
def GetTFIDF(doclist:list, isPreprocessed=True):
    if not isPreprocessed:
        doclist = [PreprocessDocument(doc, isLemma=True, isStopWords=True) for doc in doclist]
    else:
        # just tokenize the thing
        doclist = [nltk.word_tokenize(doc) for doc in doclist]
        
    flat_doclist = [' '.join(doc) for doc in doclist] # turn into one big corpus
    vectorizer = TfidfVectorizer()
    matrix  = vectorizer.fit_transform(flat_doclist)
    tfidf_keys = vectorizer.get_feature_names_out()
    df_tfidf = db.pd.DataFrame(matrix.toarray(), columns=tfidf_keys)

    return df_tfidf, matrix


In [84]:
my_tfidf, my_matrix = GetTFIDF(df["answer"], isPreprocessed=True)
my_matrix

["A mistake is n't necessarily evil", 'A sin is surely evil', 'A sin is wicked in the eyes of god', "A mistake is n't always wicked in the eyes of God", 'Human error is a sin', 'Everything improper is sin', 'If a man defies a God then he sins', 'A mistake can be done by accident , but a sin is always commited', 'Maybe a sin is a mistake done with evil intent', 'Sin is commited', 'Mistake is incidental', 'Sin is maybe knowing that your actions have consequences and doing it anyway', "What then about a child that does n't know any better and is just following human nature ?", 'If a man marries a woman then abuses her , and the woman is to leave the man , would the woman have sinned ?', 'A sin may not be a mistake', 'As an example , war', 'It depends on which theology to decide whether a war is sinful or not', 'If a man does wicked things while not being sane , he also sins', "Is it still a sin if the sinner does n't know that he sins ?", 'Yes , perhaps a sin but a forgivable one ?', 'I r

<48x209 sparse matrix of type '<class 'numpy.float64'>'
	with 462 stored elements in Compressed Sparse Row format>

<h2> Word- and Sentence-Embedding

In [87]:
# input : list<str> : tokens of one document/sentence
# output : list<(str, list<int>[300])> : list of word-vector pair for each word available on the model
def WordEmbed(document: list, model):
    word_embed_pairs = []
    for word in document:
        if word in model:
            word_embed_pairs.append((word, model[word]))
    return word_embed_pairs

# input : list<(str, list<float>[300])>, str : word-vector pair list and preferred agg method.
# output : list<float>[300] : 300-d vector that represents an aggregated value of the input words


def SentenceEmbedUnweightedFunction(word_embed_pair_list: list, aggregateMethod: str = "avg"):
    wvs = []
    for pair in word_embed_pair_list:
        wvs.append(pair[1])
    if aggregateMethod == "avg":
        return np.mean(wvs, axis=0)
    else:
        return np.sum(wvs, axis=0)

# input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs and preferred agg method
# output : list<(str, list<int>[300])> : list containing sentence-vector pairs.


def SentenceEmbedUnweighted(word_embedded_docs: list, aggregateMethod: str = "avg"):
    sentence_embedded_docs = []
    for i in range(len(word_embedded_docs)):
        sentence_embedded_docs.append(SentenceEmbedUnweightedFunction(
            word_embedded_docs[i], aggregateMethod))
    return sentence_embedded_docs


'''
input :
list<list<(str, list<float>[300])>> : word-vector pair list
matrix : tf-idf matrix for the corresponding doc
int : the row we want
str : preferred agg method
'''
# output : list<float>[300] : 300-d vector that represents an aggregated value of the input words


def SentenceEmbedWeightedFunction(word_embed_pair_list: list, tfidf_matrix, index: int, aggregateMethod: str = "avg"):
    weighted_wvs = []
    # multiplies each word with its TF-IDF value in the corresponding row. Is 0 if word isn't found somehow.
    for pair in word_embed_pair_list:
        tfidf_weight = 0
        if pair[0] in tfidf_matrix:
            tfidf_weight = tfidf_matrix[pair[0]][index]
        weighted_wvs.append(pair[1] * tfidf_weight)
    # turn into array for fast aggregating
    weighted_wvs = np.array(weighted_wvs)
    if aggregateMethod == "avg":
        sentence_vector = np.mean(weighted_wvs, axis=0)
    else:
        sentence_vector = np.sum(weighted_wvs, axis=0)
    return sentence_vector

# input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs, TF-IDF matrix of the corpus, and preferred agg method
# output : list<(str, list<float>[300])> : list containing sentence-vector pairs.


def SentenceEmbedWeighted(word_embedded_docs: list, tfidf_matrix, aggregateMethod="avg"):
    sentence_embedded_docs = []
    for i in range(len(word_embedded_docs)):
        sentence_embedded_docs.append(SentenceEmbedWeightedFunction(
            word_embedded_docs[i], tfidf_matrix, i, aggregateMethod))
    return sentence_embedded_docs


In [89]:
docs = [PreprocessDocument(doc, isLemma=True, isStopWords=True) for doc in df["answer"]]
word_embedded_docs = []
for doc in docs:
    word_embedded_docs.append(WordEmbed(doc, model))

# sentence_embed("bababui", tfidf_matrix=my_tfidf, index=1)
doc_embeds = SentenceEmbedWeighted(word_embedded_docs, my_tfidf, "avg")
df["Document Embed"] = doc_embeds

Unnamed: 0,id,event_title,speaker,question,answer,Document Embed
0,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A mistake isn't necessarily evil,"[0.02912639, -0.032216568, 0.039019763, 0.0968..."
1,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A sin is surely evil,"[0.08294585, 0.026838994, 0.017556252, 0.12052..."
2,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A sin is wicked in the eyes of god,"[0.068886854, 0.0071485955, 0.026462458, 0.025..."
3,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A mistake isn't always wicked in the eyes of God,"[0.046191312, 0.01100249, 0.026769131, 0.03811..."
4,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Human error is a sin,"[0.038369473, -0.011711814, 0.0190797, 0.08243..."
5,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Everything improper is sin,"[0.043300632, -0.011632566, -0.022081481, -0.0..."
6,208,Of Sin and Death,PY,What is the difference between a mistake and a...,If a man defies a God then he sins,"[0.09101647, 0.005016355, 0.04910426, -0.01154..."
7,208,Of Sin and Death,PY,What is the difference between a mistake and a...,"A mistake can be done by accident, but a sin ...","[0.031938307, -0.0070673116, 0.020687412, -6.0..."
8,209,Of Sin and Death,LIV,What is the difference between a mistake and a...,Maybe a sin is a mistake done with evil intent,"[0.05111699, 0.011224811, 0.04982972, 0.038553..."
9,210,Of Sin and Death,TMS,What is the difference between a mistake and a...,Sin is commited,"[0.07623151, -0.01437326, 0.02278428, 0.034921..."


<h2>LDA Approach

<h2>Anomaly detection : DBSCAN

<h2>Anomaly detection : LOF

<h2>Anomaly detection : Isolation Forest (sklearn)

<h2>Final Function