<h2>IMPORTS</h2>

In [2]:
# imports go here
import numpy as np
import db
import inflect
import string
import nltk
import gensim
import contractions
import matplotlib.pyplot as plt
import gensim.downloader as api
from gensim import corpora, models
from nltk.test.gensim_fixt import setup_module
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

<h2> Init (don't forget to do this when making the class, and only once.)

In [3]:
setup_module()
model = api.load("glove-wiki-gigaword-300")

# see here : https://radimrehurek.com/gensim/downloader.html for saving.

<h2>Data Retrieval</h2>

In [4]:
# input : DatabaseHandler
# output : DataFrame
def GetDF(dh:db.DatabaseHandler, selector: str, eventID: int, splitBySentences: bool = False):
    df = dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
    if splitBySentences:
        # df.set_index('id', inplace=True)
        df['answer'] = df['answer'].str.split('.')
        df = df.explode("answer", True)
        df.drop(df[df["answer"] == ""].index, inplace=True)
        df.reset_index(drop=True,inplace=True)
    return df


dh = db.DatabaseHandler("testdb.db")  # db connection
df = GetDF(dh, "event_id", 20, False)
df

Unnamed: 0,id,event_title,speaker,question,answer
0,230,Of Choice and Life,KSMG,Pro life or pro choice?,I am pro choice. I feel bad for the baby. Peop...
1,231,Of Choice and Life,JJ,Pro life or pro choice?,"Logically, both make sense. Conflicted between..."
2,232,Of Choice and Life,JER,Pro life or pro choice?,"I'm pro life. Because in my belief, if a fetus..."
3,233,Of Choice and Life,YOT,Pro life or pro choice?,"Prochoice. If she was a victim of rape, etc, s..."
4,234,Of Choice and Life,GRE,Pro life or pro choice?,Prolife for religious reasons. Being religious...
5,235,Of Choice and Life,RIC,Pro life or pro choice?,Prochoice. Unless the baby is normal.
6,236,Of Choice and Life,GRE,Do you think abortion should be legal?,Legal - not really legal - legal for special c...
7,237,Of Choice and Life,MAR,Do you think abortion should be legal?,Agree with other solutions besides abortion - ...
8,238,Of Choice and Life,RIC,Do you think abortion should be legal?,Should be legal with criteria. Agree with Indo...
9,239,Of Choice and Life,YOR,Do you think abortion should be legal?,What counts as a person? Is fetus a person?


<h2>Preprocessing</h2>

In [5]:
# input : sentence/document (string); parameters
# output : a list of word tokens (list<string>)
def PreprocessDocument(doc:str, isLemma:bool=False, isStopWords:bool=False, isInflect:bool=False, isNumberFiltered:bool=True):
    inflector = inflect.engine()
    stopwordSet = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    punctuations = string.punctuation
    # if numbers are filtered, add that to the punctuation string
    if isNumberFiltered:
        punctuations += "1234567890"

    # case fold
    doc = doc.lower()

    # remove puncs
    doc = "".join([char for char in doc if char not in punctuations])

    # tokenize it.
    token_list = nltk.word_tokenize(doc)

    for i in range(len(token_list)):
        # if inflect
        if isInflect:
            if token_list[i].isdigit():
                token_list[i] = inflector.number_to_words(token_list[i])

        # if lemma
        if isLemma:
            tagged_word = nltk.pos_tag([token_list[i]])
            wordnet_pos = getWordnetPos(tagged_word[0][1])
            token_list[i] = lemmatizer.lemmatize(tagged_word[0][0], pos=wordnet_pos)
        
        # if stopword
        if isStopWords:
            if token_list[i] in stopwordSet or token_list[i].isdigit():
                token_list[i] = "#" # mark as #
        
    # remove the marked strings
    token_list = [token for token in token_list if token != "#"]

    if token_list:
        return token_list
    return [""]

def getWordnetPos(tag):
    """Map POS tag to WordNet POS tag"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # solves as noun by default.
    

In [6]:
mySentence = df.loc[10]["answer"]
myTokenizedSentence = PreprocessDocument(mySentence, isStopWords=True, isLemma=True)
myTokenizedSentence

['christian', 'still', 'fall', 'christian', 'rule']

<h2>Get word set, whatever for

In [7]:
def get_word_set(df):
    bigtext = ""
    # join in lower case
    for i in range(len(df)):
        bigtext += " {}".format(df[i].lower())
    bigtext = contractions.fix(bigtext) # remove contractions
    bigtext = "".join([char for char in bigtext if char not in string.punctuation]) # remove punctuations
    big_text_tokens = PreprocessDocument(bigtext, isLemma=True) # put in blender like dick
    return set(big_text_tokens) # return as set

myWordSet = get_word_set(df["answer"])
print(len(myWordSet))

288


<h2> TF-IDF function

In [23]:
# input : list<str>
# output : Dataframe, matrix
def GetTFIDF(doclist:list, isPreprocessed=True):
    if not isPreprocessed:
        doclist = [PreprocessDocument(doc, isLemma=True, isStopWords=True) for doc in doclist]
    else:
        # just tokenize the thing
        doclist = [nltk.word_tokenize(doc) for doc in doclist]
        
    flat_doclist = [' '.join(doc) for doc in doclist] # turn into one big corpus
    vectorizer = TfidfVectorizer()
    matrix  = vectorizer.fit_transform(flat_doclist)
    tfidf_keys = vectorizer.get_feature_names_out()
    df_tfidf = db.pd.DataFrame(matrix.toarray(), columns=tfidf_keys)

    return df_tfidf, matrix


def GetTFIDF_Gensim(doclist:list, isPreprocessed=True):
    if not isPreprocessed:
        doclist = [PreprocessDocument(doc, isLemma=True, isStopWords=True, isInflect=True) for doc in doclist]
    else:
        doclist = [nltk.word_tokenize(doc) for doc in doclist]
    dictionary = corpora.Dictionary(doclist)

    return dictionary



In [24]:
my_tfidf, my_matrix = GetTFIDF(df["answer"], isPreprocessed=True)
my_tfidf

goofy = GetTFIDF_Gensim(df["answer"], False)
len(goofy)

233

<h2> Word- and Sentence-Embedding

In [13]:
# input : list<str> : tokens of one document/sentence
# output : list<(str, list<int>[300])> : list of word-vector pair for each word available on the model
def WordEmbed(document: list, model):
    word_embed_pairs = []
    for word in document:
        if word in model:
            word_embed_pairs.append((word, model[word]))
    return word_embed_pairs

# input : list<(str, list<float>[300])>, str : word-vector pair list and preferred agg method.
# output : list<float>[300] : 300-d vector that represents an aggregated value of the input words


def SentenceEmbedUnweightedFunction(word_embed_pair_list: list, aggregateMethod: str = "avg"):
    wvs = []
    for pair in word_embed_pair_list:
        wvs.append(pair[1])
    if aggregateMethod == "avg":
        return np.mean(wvs, axis=0)
    else:
        return np.sum(wvs, axis=0)

# input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs and preferred agg method
# output : list<(str, list<int>[300])> : list containing sentence-vector pairs.


def SentenceEmbedUnweighted(word_embedded_docs: list, aggregateMethod: str = "avg"):
    sentence_embedded_docs = []
    for i in range(len(word_embedded_docs)):
        sentence_embedded_docs.append(SentenceEmbedUnweightedFunction(
            word_embedded_docs[i], aggregateMethod))
    return sentence_embedded_docs


'''
input :
list<list<(str, list<float>[300])>> : word-vector pair list
matrix : tf-idf matrix for the corresponding doc
int : the row we want
str : preferred agg method
'''
# output : list<float>[300] : 300-d vector that represents an aggregated value of the input words


def SentenceEmbedWeightedFunction(word_embed_pair_list: list, tfidf_matrix, index: int, aggregateMethod: str = "avg"):
    weighted_wvs = []
    # multiplies each word with its TF-IDF value in the corresponding row. Is 0 if word isn't found somehow.
    for pair in word_embed_pair_list:
        tfidf_weight = 0
        if pair[0] in tfidf_matrix:
            tfidf_weight = tfidf_matrix[pair[0]][index]
        weighted_wvs.append(pair[1] * tfidf_weight)
    # turn into array for fast aggregating
    weighted_wvs = np.array(weighted_wvs)
    if aggregateMethod == "avg":
        sentence_vector = np.mean(weighted_wvs, axis=0)
    else:
        sentence_vector = np.sum(weighted_wvs, axis=0)
    return sentence_vector

# input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs, TF-IDF matrix of the corpus, and preferred agg method
# output : list<(str, list<float>[300])> : list containing sentence-vector pairs.


def SentenceEmbedWeighted(word_embedded_docs: list, tfidf_matrix, aggregateMethod="avg"):
    sentence_embedded_docs = []
    for i in range(len(word_embedded_docs)):
        sentence_embedded_docs.append(SentenceEmbedWeightedFunction(
            word_embedded_docs[i], tfidf_matrix, i, aggregateMethod))
    return sentence_embedded_docs


<h3> A list of docs that will be used for everything would still be necessary, after all 👇

In [16]:
docs = [PreprocessDocument(doc, isLemma=True, isStopWords=True) for doc in df["answer"]]
word_embedded_docs = []
for doc in docs:
    word_embedded_docs.append(WordEmbed(doc, model))

# sentence_embed("bababui", tfidf_matrix=my_tfidf, index=1)
doc_embeds = SentenceEmbedWeighted(word_embedded_docs, my_tfidf, "avg")
df["Document Embed"] = doc_embeds
df.dropna(inplace=True)
df.reset_index(inplace=True) # don't forget to add this after every row-altering operation.
df

Unnamed: 0,index,id,event_title,speaker,question,answer,Document Embed
0,0,230,Of Choice and Life,KSMG,Pro life or pro choice?,I am pro choice. I feel bad for the baby. Peop...,"[-0.022362433, 0.01713121, 0.01336585, -0.0077..."
1,1,231,Of Choice and Life,JJ,Pro life or pro choice?,"Logically, both make sense. Conflicted between...","[-0.007254202, 0.009839708, -0.00799253, -0.01..."
2,2,232,Of Choice and Life,JER,Pro life or pro choice?,"I'm pro life. Because in my belief, if a fetus...","[-0.014276878, 0.0022278614, -0.010473907, 0.0..."
3,3,233,Of Choice and Life,YOT,Pro life or pro choice?,"Prochoice. If she was a victim of rape, etc, s...","[-0.022501977, 0.024691759, 0.005002156, 0.017..."
4,4,234,Of Choice and Life,GRE,Pro life or pro choice?,Prolife for religious reasons. Being religious...,"[-0.0040696473, -0.017911343, -0.02254271, 0.0..."
5,5,235,Of Choice and Life,RIC,Pro life or pro choice?,Prochoice. Unless the baby is normal.,"[-0.098942615, 0.024454754, -0.070881665, 0.05..."
6,6,236,Of Choice and Life,GRE,Do you think abortion should be legal?,Legal - not really legal - legal for special c...,"[0.004587771, -0.0013291183, 0.0049727913, 0.0..."
7,7,237,Of Choice and Life,MAR,Do you think abortion should be legal?,Agree with other solutions besides abortion - ...,"[-0.01381315, 0.01531402, 0.0022275664, -0.004..."
8,8,238,Of Choice and Life,RIC,Do you think abortion should be legal?,Should be legal with criteria. Agree with Indo...,"[-0.009043147, 0.021918792, -0.030498233, -0.0..."
9,9,239,Of Choice and Life,YOR,Do you think abortion should be legal?,What counts as a person? Is fetus a person?,"[-0.22057891, 0.014038535, -0.20972323, -0.011..."


<h2>LDA Approach

In [71]:
'''
input:
- doclist : list<list<str>> --> list of tokenized sentences/docs
- num_topics : number of inferred topics.
'''
'''
output:
- docFeatureList : list<list<float>> --> topic distribution for each sentence/doc
'''
def GetLDADistribution(doclist: list, topics: int = 5):
    new_corpus = []
    for i in range(len(docs)):
        doc = [(j, my_matrix[i, j]) for j in my_matrix[i].indices]
        new_corpus.append(doc)
    gensim_dict = corpora.Dictionary.from_corpus(new_corpus)
    lda_model = gensim.models.LdaModel(new_corpus, num_topics=10, id2word=gensim_dict)
    goofy_ahh_doc_topic_distributions = lda_model[new_corpus]
    docFeatureList = []
    for doc_topic_dist in goofy_ahh_doc_topic_distributions:
        featureList = [0.0 for i in range(0, 10)]
        for topic_dist in doc_topic_dist:
            featureList[topic_dist[0]] = topic_dist[1]
        docFeatureList.append(featureList)
    return docFeatureList

# myTopicDist = GetLDADistribution(docs, 10)
# for topicDist in myTopicDist:
#     print(topicDist)

[0.015178566, 0.015176928, 0.015178298, 0.01517892, 0.015176928, 0.015178965, 0.015177403, 0.86339366, 0.015181374, 0.015178913]
[0.8715595, 0.014267717, 0.014270598, 0.014271023, 0.014267717, 0.0142730465, 0.014269685, 0.014276249, 0.014271883, 0.014272554]
[0.011866485, 0.011865227, 0.011866371, 0.011866245, 0.011865227, 0.011867544, 0.011865903, 0.011869738, 0.011866222, 0.8932011]
[0.019781291, 0.01978034, 0.821959, 0.019786734, 0.01978034, 0.019783786, 0.019781467, 0.019782048, 0.0197826, 0.019782448]
[0.014406338, 0.014404694, 0.014406862, 0.014408974, 0.014404694, 0.01441362, 0.8703269, 0.014411853, 0.014407115, 0.014409003]
[0.030097462, 0.03009658, 0.03009934, 0.03010569, 0.03009658, 0.030103652, 0.030097155, 0.030099232, 0.72910494, 0.030099368]
[0.015671797, 0.015671214, 0.015672458, 0.015671609, 0.015671214, 0.8589464, 0.015671996, 0.01567902, 0.015671827, 0.015672453]
[0.018617284, 0.018616846, 0.01861919, 0.018619861, 0.018616846, 0.01862093, 0.01861727, 0.832436, 0.01861

<h2>Anomaly detection : DBSCAN

In [17]:
'''
inputs:
- vectors : list<list<float>> --> list of features corresponding to each doc/sentence
- epsilon : float --> the radius within which points are considered connected.
- min : int --> minimum amount of connected points for a point to be considered a core point of a cluster.
'''
'''
output:
clusters : list<int> --> a list of integers to assign each data point to a cluster. -1 means outlier.
'''
def GetDBSCANClusters(vectors, epsilon:float, min:int):
    dbscan = DBSCAN(eps=epsilon, min_samples=min)
    clusters = dbscan.fit_predict(vectors)
    # plt.title("to the depths of depravity {} and the cusp of blasphemy {}.".format(epsilon, min))
    # plt.scatter(vectors[:, 0], vectors[:, 1], c=clusters)
    # plt.show()
    print(clusters)
    return clusters



In [18]:
GetDBSCANClusters(list(df["Document Embed"]), 1, 2)


[ 0  0  0  0  0 -1  0  0  0 -1 -1  0  0  0  0  0  0  0  0  0]


array([ 0,  0,  0,  0,  0, -1,  0,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int64)

<h2>Shrink and draw with TSNE

In [19]:
# returns a tsne shrinkage also...
def plot_documents(df:db.pd.DataFrame, isPrint=False):
    labels = np.arange(0, df.index.stop, 1)
    values = list(df["Document Embed"]) # don't forget to list it first, then np array it later.

    # train model
    tsne_model = TSNE(perplexity=20, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(np.array(values))

    # plot
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
    
    if isPrint:
        plt.figure(figsize=(20, 20)) 
        for i in range(len(x)):
            plt.scatter(x[i],y[i])
            plt.annotate(labels[i],
                        xy=(x[i], y[i]),
                        xytext=(5, 2),
                        textcoords='offset points',
                        ha='right',
                        va='bottom')
        plt.show()
    # use the thing to find new clusters.
    return new_values

In [77]:
# clusters = GetDBSCANClusters(list(df["Document Embed"]), 0.6, 5)
# df["Cluster Assignment"] = clusters
# dfSupposedOutliers = df.loc[df["Cluster Assignment"] == -1]
# dfSupposedGoods = df.loc[df["Cluster Assignment"] != -1]
# dfSupposedGoods.reset_index(inplace=True)
# dfSupposedOutliers.reset_index(inplace=True)

# for i in range(len(dfSupposedOutliers.index)):
#     print(dfSupposedOutliers.loc[i]["question"], " | ", dfSupposedOutliers.loc[i]["answer"])

# print("#" * 80)
# for i in range(len(dfSupposedGoods.index)):
#     print(dfSupposedGoods.loc[i]["question"], " | ", dfSupposedGoods.loc[i]["answer"])

'''
inputs :
- clusters : list<int> --> a list of clusters assigned to each doc/sentence
- df : DataFrame --> the dataframe in question
'''
'''
outputs:
- dfOutliers : DataFrame --> the dataframe whose answers have been marked as outliers.
- dfGoods : DataFrame --> the dataframe whose answers have not been marked as outliers.
'''
def ReturnClusters(clusters:list, df:db.pd.DataFrame):
    df["Cluster Assignment"] = clusters
    dfGoods = df.loc[df["Cluster Assignment"] != -1]
    dfOutliers = df.loc[df["Cluster Assignment"] == -1]
    return dfOutliers, dfGoods

# outliers, goods = ReturnClusters(clusters, df)


Unnamed: 0,index,id,event_title,speaker,question,answer,Document Embed,Cluster Assignment
0,0,230,Of Choice and Life,KSMG,Pro life or pro choice?,I am pro choice. I feel bad for the baby. Peop...,"[-0.022362433, 0.01713121, 0.01336585, -0.0077...",0
1,1,231,Of Choice and Life,JJ,Pro life or pro choice?,"Logically, both make sense. Conflicted between...","[-0.007254202, 0.009839708, -0.00799253, -0.01...",0
2,2,232,Of Choice and Life,JER,Pro life or pro choice?,"I'm pro life. Because in my belief, if a fetus...","[-0.014276878, 0.0022278614, -0.010473907, 0.0...",0
3,3,233,Of Choice and Life,YOT,Pro life or pro choice?,"Prochoice. If she was a victim of rape, etc, s...","[-0.022501977, 0.024691759, 0.005002156, 0.017...",0
4,4,234,Of Choice and Life,GRE,Pro life or pro choice?,Prolife for religious reasons. Being religious...,"[-0.0040696473, -0.017911343, -0.02254271, 0.0...",0
6,6,236,Of Choice and Life,GRE,Do you think abortion should be legal?,Legal - not really legal - legal for special c...,"[0.004587771, -0.0013291183, 0.0049727913, 0.0...",0
7,7,237,Of Choice and Life,MAR,Do you think abortion should be legal?,Agree with other solutions besides abortion - ...,"[-0.01381315, 0.01531402, 0.0022275664, -0.004...",0
8,8,238,Of Choice and Life,RIC,Do you think abortion should be legal?,Should be legal with criteria. Agree with Indo...,"[-0.009043147, 0.021918792, -0.030498233, -0.0...",0
11,11,241,Of Choice and Life,YOT,Do you think abortion should be legal?,"If it has a heartbeat, can it be called life? ...","[0.006337575, 0.01460853, -0.017543007, 0.0111...",0
12,12,242,Of Choice and Life,JJ,Do you think abortion should be legal?,when something grows doesnt it count as life a...,"[-0.052422117, 0.030819645, -0.032700323, -0.0...",0


<h2>Anomaly detection : LOF

In [204]:
def GetLDADistribution(doclist: list, topics: int = 5, use_tfidf: bool = True):
    new_corpus = []
    
    if use_tfidf:
        for i in range(len(doclist)):
            doc = [(j, my_matrix[i, j]) for j in my_matrix[i].indices]
            new_corpus.append(doc)
            gensim_dict = corpora.Dictionary.from_corpus(new_corpus)        
    else:
        gensim_dict = corpora.Dictionary(doclist)
        new_corpus = [gensim_dict.doc2bow(doc) for doc in doclist]
        
    lda_model = gensim.models.LdaModel(new_corpus, num_topics=topics, id2word=gensim_dict)
    goofy_ahh_doc_topic_distributions = lda_model[new_corpus]
    
    docFeatureList = []
    for doc_topic_dist in goofy_ahh_doc_topic_distributions:
        featureList = [0.0 for i in range(0, topics)]
        for topic_dist in doc_topic_dist:
            featureList[topic_dist[0]] = topic_dist[1]
        docFeatureList.append(featureList)
    
    return docFeatureList

In [205]:
thing = GetLDADistribution(docs, 2, False)
for item in thing:
    print(item)

[0.9594759, 0.040524174]
[0.97850245, 0.021497536]
[0.28016606, 0.7198339]
[0.043442782, 0.9565572]
[0.9731329, 0.026867067]
[0.82650346, 0.17349651]
[0.023954507, 0.9760455]
[0.92979133, 0.07020863]
[0.34196287, 0.6580371]
[0.123561874, 0.87643814]
[0.09156124, 0.90843874]
[0.030009115, 0.9699909]
[0.052324902, 0.94767505]
[0.015818512, 0.98418146]
[0.906932, 0.09306796]
[0.03421068, 0.96578926]
[0.02900701, 0.970993]
[0.018301684, 0.98169833]
[0.97891814, 0.021081883]
[0.9852539, 0.014746079]


<h2>Anomaly detection : Isolation Forest (sklearn)

<h2>Final Class

In [224]:
class AnomalyDetector():
    def __init__(self, dbName: str = "",  dh=None, model=None) -> None:
        if dh is None:
            self.dh = db.DatabaseHandler(dbName=dbName)
        else:
            self.dh = dh
        if model is None:
            self.model = api.load("glove-wiki-gigaword-300")
        else:
            self.model = model
    # def __init__(self, eventID: int, isSplit: bool) -> None:
    #     self.dh = db.DatabaseHandler("test.db")
    #     self.df = self.GetDF(self.dh, "eventID", eventID, isSplit)
    #     self.model = api.load("glove-wiki-gigaword-300")
    #     pass

    '''
    inputs :
    - dh : DatabaseHandler --> to retrieve data from database
    - eventID : int --> we're doing this by event, so straight to the eventID
    - selector : str --> pretty much formality.
    - splitBySentences : bool --> Split each doc into sentences or not. Defaults to no.
    '''
    '''
    outputs:
    None, just setting
    '''

    def SetDF(self, dh: db.DatabaseHandler, eventID: int, selector: str = "event_id", splitBySentences: bool = False):
        self.df = self.dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
        if splitBySentences:
            # df.set_index('id', inplace=True)
            self.df['answer'] = self.df['answer'].str.split('.')
            self.df = self.df.explode("answer", True)
            self.df.drop(self.df[self.df["answer"] == ""].index, inplace=True)
            self.df.reset_index(drop=True, inplace=True)

    '''
    inputs :
    - dh : DatabaseHandler --> to retrieve data from database
    - eventID : int --> we're doing this by event, so straight to the eventID
    - selector : str --> pretty much formality.
    - splitBySentences : bool --> Split each doc into sentences or not. Defaults to no.
    '''
    '''
    outputs:
    - df : DataFrame --> dataframe containing the thing we're gonna be using.
    '''

    def GetDF(self, dh: db.DatabaseHandler, eventID: int, selector: str = "event_id", splitBySentences: bool = False):
        df = dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
        if splitBySentences:
            # df.set_index('id', inplace=True)
            df['answer'] = df['answer'].str.split('.')
            df = df.explode("answer", True)
            df.drop(df[df["answer"] == ""].index, inplace=True)
            df.reset_index(drop=True, inplace=True)
        return df

    '''
    inputs:
    - doc : str --> a string representing a sentence/document.
    - isLemma : bool --> use lemmatizer or not? Defaults to not.
    - isStopWords : bool --> use stopwords or not? Defaults to not.
    - isInflect : bool --> use inflections (you're --> you are) or not? Defaults to not.
    - isNumberFiltered :  bool --> delete numbers in the string? Defaults to yes. 
    '''
    '''
    output : list<str> --> a list of word tokens (list<string>)
    '''

    def PreprocessDocument(self, doc: str, isLemma: bool = False, isStopWords: bool = False, isInflect: bool = False, isNumberFiltered: bool = True):
        inflector = inflect.engine()
        stopwordSet = set(stopwords.words("english"))
        lemmatizer = WordNetLemmatizer()
        punctuations = string.punctuation
        # if numbers are filtered, add that to the punctuation string
        if isNumberFiltered:
            punctuations += "1234567890"

        # case fold
        doc = doc.lower()

        # remove puncs
        doc = "".join([char for char in doc if char not in punctuations])

        # tokenize it.
        token_list = nltk.word_tokenize(doc)

        for i in range(len(token_list)):
            # if inflect
            if isInflect:
                if token_list[i].isdigit():
                    token_list[i] = inflector.number_to_words(token_list[i])

            # if lemma
            if isLemma:
                tagged_word = nltk.pos_tag([token_list[i]])
                wordnet_pos = getWordnetPos(tagged_word[0][1])
                token_list[i] = lemmatizer.lemmatize(
                    tagged_word[0][0], pos=wordnet_pos)

            # if stopword
            if isStopWords:
                if token_list[i] in stopwordSet or token_list[i].isdigit():
                    token_list[i] = "#"  # mark as #

        # remove the marked strings
        token_list = [token for token in token_list if token != "#"]

        if token_list:
            return token_list
        return [""]

    '''
    inputs:
    - tag : str --> the tag obtained from POS tagging.
    '''
    '''
    outputs:
    - str --> Wordnet POS tag.
    '''
    def getWordnetPos(tag):
        """Map POS tag to WordNet POS tag"""
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # solves as noun by default.

    '''
    inputs:
    - doclist : list<str> --> list of doc/sentences.
    - isProcessed : bool --> has it already been preprocessed? Defaults to True.
    '''
    '''
    outputs:
    - df_tfidf : Dataframe --> the TFIDF matrix in df form. 
    - matrix : matrix --> the TFIDF matrix purely. mainly for LDA purposes.
    '''

    def GetTFIDF(self, doclist: list, isPreprocessed=True):
        if not isPreprocessed:
            doclist = [PreprocessDocument(
                doc, isLemma=True, isStopWords=True) for doc in doclist]
        # else:
        #     # just tokenize the thing
        #     doclist = [nltk.word_tokenize(doc) for doc in doclist]
        # i think the thing has already been tokenized. That's the problem.
        flat_doclist = [' '.join(doc)
                        for doc in doclist]  # turn into one big corpus
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(flat_doclist)
        tfidf_keys = vectorizer.get_feature_names_out()
        df_tfidf = db.pd.DataFrame(matrix.toarray(), columns=tfidf_keys)

        return df_tfidf, matrix

    # input : list<str> : tokens of one document/sentence
    # output : list<(str, list<int>[300])> : list of word-vector pair for each word available on the model
    def WordEmbed(self, document: list, model):
        word_embed_pairs = []
        for word in document:
            if word in model:
                word_embed_pairs.append((word, model[word]))
        return word_embed_pairs

    # input : list<(str, list<float>[300])>, str : word-vector pair list and preferred agg method.
    # output : list<float>[300] : 300-d vector that represents an aggregated value of the input words
    def SentenceEmbedUnweightedFunction(self, word_embed_pair_list: list, aggregateMethod: str = "avg"):
        wvs = []
        for pair in word_embed_pair_list:
            wvs.append(pair[1])
        if aggregateMethod == "avg":
            return np.mean(wvs, axis=0)
        else:
            return np.sum(wvs, axis=0)

    # input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs and preferred agg method
    # output : list<(str, list<int>[300])> : list containing sentence-vector pairs.
    def SentenceEmbedUnweighted(self, word_embedded_docs: list, aggregateMethod: str = "avg"):
        sentence_embedded_docs = []
        for i in range(len(word_embedded_docs)):
            sentence_embedded_docs.append(SentenceEmbedUnweightedFunction(
                word_embedded_docs[i], aggregateMethod))
        return sentence_embedded_docs

    '''
    input :
    list<list<(str, list<float>[300])>> : word-vector pair list
    matrix : tf-idf matrix for the corresponding doc
    int : the row we want
    str : preferred agg method
    '''
    # output : list<float>[300] : 300-d vector that represents an aggregated value of the input words

    def SentenceEmbedWeightedFunction(self, word_embed_pair_list: list, tfidf_matrix, index: int, aggregateMethod: str = "avg"):
        weighted_wvs = []
        # multiplies each word with its TF-IDF value in the corresponding row. Is 0 if word isn't found somehow.
        for pair in word_embed_pair_list:
            tfidf_weight = 0
            if pair[0] in tfidf_matrix:
                tfidf_weight = tfidf_matrix[pair[0]][index]
            weighted_wvs.append(pair[1] * tfidf_weight)
        # turn into array for fast aggregating
        weighted_wvs = np.array(weighted_wvs)
        if aggregateMethod == "avg":
            sentence_vector = np.mean(weighted_wvs, axis=0)
        else:
            sentence_vector = np.sum(weighted_wvs, axis=0)
        return sentence_vector

    # input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs, TF-IDF matrix of the corpus, and preferred agg method
    # output : list<(str, list<float>[300])> : list containing sentence-vector pairs.
    def SentenceEmbedWeighted(self, word_embedded_docs: list, tfidf_matrix, aggregateMethod="avg"):
        sentence_embedded_docs = []
        for i in range(len(word_embedded_docs)):
            sentence_embedded_docs.append(SentenceEmbedWeightedFunction(
                word_embedded_docs[i], tfidf_matrix, i, aggregateMethod))
        return sentence_embedded_docs

    '''
    input:
    - doclist : list<list<str>> --> list of tokenized sentences/docs
    - topics : int --> number of inferred topics.
    - use_tfidf : bool --> use TFIDF or not? defaults to yes.
    '''
    '''
    output:
    - docFeatureList : list<list<float>> --> topic distribution for each sentence/doc
    '''

    def GetLDADistribution(self, doclist: list, topics: int = 5, use_tfidf: bool = True):
        new_corpus = []
        
        if use_tfidf:
            for i in range(len(doclist)):
                doc = [(j, self.tfidf_matrix[i, j]) for j in self.tfidf_matrix[i].indices]
                new_corpus.append(doc)
                gensim_dict = corpora.Dictionary.from_corpus(new_corpus)        
        else:
            gensim_dict = corpora.Dictionary(doclist)
            new_corpus = [gensim_dict.doc2bow(doc) for doc in doclist]
            
        lda_model = gensim.models.LdaModel(new_corpus, num_topics=topics, id2word=gensim_dict)
        goofy_ahh_doc_topic_distributions = lda_model[new_corpus]
        
        docFeatureList = []
        for doc_topic_dist in goofy_ahh_doc_topic_distributions:
            featureList = [0.0 for i in range(0, topics)]
            for topic_dist in doc_topic_dist:
                featureList[topic_dist[0]] = topic_dist[1]
            docFeatureList.append(featureList)
        
        return docFeatureList

    '''
    inputs:
    - vectors : list<list<float>> --> list of features corresponding to each doc/sentence
    - epsilon : float --> the radius within which points are considered connected.
    - min : int --> minimum amount of connected points for a point to be considered a core point of a cluster.
    '''
    '''
    output:
    clusters : list<int> --> a list of integers to assign each data point to a cluster. -1 means outlier.
    '''

    def GetDBSCANClusters(self, vectors, epsilon: float, min: int):
        dbscan = DBSCAN(eps=epsilon, min_samples=min)
        clusters = dbscan.fit_predict(vectors)
        # plt.title("to the depths of depravity {} and the cusp of blasphemy {}.".format(epsilon, min))
        # plt.scatter(vectors[:, 0], vectors[:, 1], c=clusters)
        # plt.show()
        # print(clusters)
        return clusters

    '''
    inputs :
    - clusters : list<int> --> a list of clusters assigned to each doc/sentence
    - df : DataFrame --> the dataframe in question
    '''
    '''
    outputs:
    - dfOutliers : DataFrame --> the dataframe whose answers have been marked as outliers.
    - dfGoods : DataFrame --> the dataframe whose answers have not been marked as outliers.
    '''

    def ReturnClusters(self, clusters: list, df: db.pd.DataFrame):
        df["Cluster Assignment"] = clusters
        dfGoods = df.loc[df["Cluster Assignment"] != -1]
        dfOutliers = df.loc[df["Cluster Assignment"] == -1]
        return dfOutliers, dfGoods

    def GetAnomalies_DBSCAN_Embedding(self, isWeighted: bool = True, aggregateMethod: str = "avg", epsilon: float = 0.01, minsamp: int = 2):
        # df and model are obtained by invoking a separate function, and it is assumed to be already available when invoking this function.

        # preprocess each doc/sentence
        self.preprocessedDocs = [self.PreprocessDocument(
            doc, isLemma=True, isStopWords=True) for doc in self.df["answer"]]

        # extract feature with embedding
        self.wordEmbeddedDocs = [self.WordEmbed(
            doc, self.model) for doc in self.preprocessedDocs]

        # if weighted, prepare TF-IDF and embed sentences with weight.
        if isWeighted:
            self.tfidf_df, self.tfidf_matrix = self.GetTFIDF(
                self.preprocessedDocs)
            self.doc_embeds = self.SentenceEmbedWeighted(
                self.wordEmbeddedDocs, self.tfidf_df, aggregateMethod)
        else:
            self.doc_embeds = self.SentenceEmbedUnweighted(
                self.wordEmbeddedDocs, aggregateMethod)

        # append embedding to each document
        if self.doc_embeds:
            self.df["Document Embed"] = self.doc_embeds

        # apply DBSCAN
        self.clusters = self.GetDBSCANClusters(
            list(self.df["Document Embed"]), epsilon, minsamp)

        # return the dfs
        return self.ReturnClusters(self.clusters, self.df)

    def GetAnomalies_DBSCAN_LDA(self, isWeighted: bool = True, topics: int = 5, epsilon: float = 0.01, minsamp: int = 5):
        # df and model are obtained by invoking a separate function, and it is assumed to be already available when invoking this function.

        # preprocess each doc/sentence
        self.preprocessedDocs = [self.PreprocessDocument(
            doc, isLemma=True, isStopWords=True) for doc in self.df["answer"]]

        # extract feature with embedding
        self.wordEmbeddedDocs = [self.WordEmbed(
            doc, self.model) for doc in self.preprocessedDocs]

        # if weighted, prepare tf-idf matrix.
        if isWeighted:
            self.tfidf_df, self.tfidf_matrix = self.GetTFIDF(
                self.preprocessedDocs)
        
        # use the in-house options for weighted or not.
        self.doc_embeds = self.GetLDADistribution(
            self.preprocessedDocs, topics=topics, use_tfidf=isWeighted)

        # append embedding to each document
        if self.doc_embeds:
            self.df["Document Embed"] = self.doc_embeds

        # apply DBSCAN
        self.clusters = self.GetDBSCANClusters(
            list(self.df["Document Embed"]), epsilon, minsamp)

        # return the dfs
        return self.ReturnClusters(self.clusters, self.df)

    def GetAnomalies(self, method: str, model, isWeighted: bool = True, aggregateMethod: str = "avg", epsilon=0.01, minsamp=2, topics=5):
        # initialize
        # extract the dataset
        self.df = self.GetDF()


In [225]:
goofyDH = ad.dh
goofyModel = ad.model

ad = AnomalyDetector(dh=goofyDH, model=goofyModel)

In [226]:
ad.SetDF(ad.dh, 19, "event_id", splitBySentences=True)

In [227]:
myOutliers, myGoods = ad.GetAnomalies_DBSCAN_LDA(False, topics=5, epsilon=0.6, minsamp=2)

In [229]:
myGoods

Unnamed: 0,id,event_title,speaker,question,answer,Document Embed,Cluster Assignment
0,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A mistake isn't necessarily evil,"[0.040007103, 0.040260684, 0.5996429, 0.040007...",0
1,205,Of Sin and Death,KNC,What is the difference between a mistake and a...,A sin is surely evil,"[0.050733026, 0.05167625, 0.050830543, 0.05037...",1
2,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A sin is wicked in the eyes of god,"[0.04065759, 0.04073258, 0.83709264, 0.0403079...",0
3,206,Of Sin and Death,RIC,What is the difference between a mistake and a...,A mistake isn't always wicked in the eyes of God,"[0.02875328, 0.028849144, 0.029649774, 0.02880...",1
4,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Human error is a sin,"[0.051153332, 0.05164998, 0.051087644, 0.05037...",1
5,207,Of Sin and Death,GRE,What is the difference between a mistake and a...,Everything improper is sin,"[0.050962675, 0.052271932, 0.050853908, 0.7952...",2
6,208,Of Sin and Death,PY,What is the difference between a mistake and a...,If a man defies a God then he sins,"[0.04075263, 0.041490786, 0.0412334, 0.835983,...",2
7,208,Of Sin and Death,PY,What is the difference between a mistake and a...,"A mistake can be done by accident, but a sin ...","[0.033426974, 0.8655161, 0.033683017, 0.033384...",3
8,209,Of Sin and Death,LIV,What is the difference between a mistake and a...,Maybe a sin is a mistake done with evil intent,"[0.033608258, 0.034077697, 0.033972435, 0.0334...",1
9,210,Of Sin and Death,TMS,What is the difference between a mistake and a...,Sin is commited,"[0.06717221, 0.73141354, 0.06747436, 0.0669311...",3
