In [24]:
# imports go here
import numpy as np
import db
import pandas as pd
import inflect
import string
import nltk
import gensim
import contractions
import matplotlib.pyplot as plt
import gensim.downloader as api
from gensim import corpora, models
from nltk.test.gensim_fixt import setup_module
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest

# this is where everything we've experimented on will be implemented.


class AnomalyDetector():
    def __init__(self, dbName: str = "",  dh=None, model=None, modelName="glove-wiki-gigaword-300") -> None:
        if dh is None:
            self.dh = db.DatabaseHandler(dbName=dbName)
        else:
            self.dh = dh
        if model is None:
            if modelName != "":
                self.model = api.load(modelName)
        else:
            self.model = model

    '''
    inputs :
    - dh : DatabaseHandler --> to retrieve data from database
    - eventID : int --> we're doing this by event, so straight to the eventID
    - selector : str --> pretty much formality.
    - splitBySentences : bool --> Split each doc into sentences or not. Defaults to no.
    '''
    '''
    outputs:
    None, just setting
    '''

    def SetDFFromDB(self, dh: db.DatabaseHandler, eventID: int, selector: str = "event_id", splitBySentences: bool = False):
        self.df = self.dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
        if splitBySentences:
            # df.set_index('id', inplace=True)
            self.df['answer'] = self.df['answer'].str.split('.')
            self.df = self.df.explode("answer", True)
            self.df.drop(self.df[self.df["answer"] == ""].index, inplace=True)
            self.df.reset_index(drop=True, inplace=True)

    # ditto above, but takes a pre-made DF instead.
    def SetDF(self, df:db.pd.DataFrame, splitBySentences:bool=False):
        self.df = df
        if splitBySentences:
            # df.set_index('id', inplace=True)
            self.df['answer'] = self.df['answer'].str.split('.')
            self.df = self.df.explode("answer", True)
            self.df.drop(self.df[self.df["answer"] == ""].index, inplace=True)
            self.df.reset_index(drop=True, inplace=True)

    def SetModel(self, modelName:str="glove-wiki-gigaword-300"):
        """Replace self.model with the model that gensim's downloader returns for `modelName`."""
        loaded_model = api.load(modelName)
        self.model = loaded_model

    '''
    inputs :
    - dh : DatabaseHandler --> to retrieve data from database
    - eventID : int --> we're doing this by event, so straight to the eventID
    - selector : str --> pretty much formality.
    - splitBySentences : bool --> Split each doc into sentences or not. Defaults to no.
    '''
    '''
    outputs:
    - df : DataFrame --> dataframe containing the thing we're gonna be using.
    '''

    def GetDF(self, dh: db.DatabaseHandler, eventID: int, selector: str = "event_id", splitBySentences: bool = False):
        """Fetch the joined records of one event and return them (self is not touched).

        inputs:
        - dh : DatabaseHandler --> handler whose get_recordDataJoinedDF is queried.
        - eventID : int --> ID passed to get_recordDataJoinedDF.
        - selector : str --> selector passed to get_recordDataJoinedDF.
        - splitBySentences : bool --> when True, every answer is split on '.' and
          each fragment becomes its own row (the other columns are repeated).

        outputs:
        - df : DataFrame --> the (optionally sentence-split) records.
        """
        frame = dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
        if not splitBySentences:
            return frame

        frame = frame.assign(answer=frame['answer'].str.split('.'))
        frame = frame.explode("answer", ignore_index=True)
        # Drop empty and whitespace-only fragments (e.g. the " " left after a
        # trailing ". "); missing answers (NaN) are kept.
        has_text = frame["answer"].str.strip() != ""
        return frame[has_text].reset_index(drop=True)

    '''
    inputs:
    - doc : str --> a string representing a sentence/document.
    - isLemma : bool --> use lemmatizer or not? Defaults to not.
    - isStopWords : bool --> use stopwords or not? Defaults to not.
    - isInflect : bool --> spell out digit tokens as words (e.g. "2" --> "two") or not? Defaults to not. Has no effect while isNumberFiltered is on, because digits are stripped first.
    - isNumberFiltered :  bool --> delete numbers in the string? Defaults to yes. 
    '''
    '''
    output : list<str> --> a list of word tokens (list<string>)
    '''

    def PreprocessDocument(self, doc: str, isLemma: bool = False, isStopWords: bool = False, isInflect: bool = False, isNumberFiltered: bool = True):
        """Turn one document/sentence into a list of cleaned word tokens.

        Steps: lower-case, strip punctuation (and digits when isNumberFiltered),
        tokenize with nltk, then optionally spell out digit tokens, lemmatize,
        and drop stopwords / remaining digit tokens.

        inputs:
        - doc : str --> the text. Anything that is not a str (None, NaN, ...) is
          treated as an empty document.
        - isLemma : bool --> lemmatize each token using its POS tag.
        - isStopWords : bool --> drop English stopwords and all-digit tokens.
        - isInflect : bool --> replace all-digit tokens by their spelled-out form.
        - isNumberFiltered : bool --> strip digits together with the punctuation.

        outputs:
        - list<str> --> the tokens, or [""] when nothing is left.
        """
        # Missing answers reach this method as None/NaN; they have no tokens.
        if not isinstance(doc, str):
            return [""]

        # Build only the helpers this call needs; each one is costly to create.
        inflector = inflect.engine() if isInflect else None
        lemmatizer = WordNetLemmatizer() if isLemma else None
        stopwordSet = set(stopwords.words("english")) if isStopWords else set()

        unwanted = string.punctuation
        if isNumberFiltered:
            unwanted += string.digits

        # case fold and delete every unwanted character in a single pass
        doc = doc.lower().translate(str.maketrans("", "", unwanted))

        token_list = []
        for token in nltk.word_tokenize(doc):
            if isInflect and token.isdigit():
                token = inflector.number_to_words(token)

            if isLemma:
                # the token is tagged on its own, without sentence context
                word, tag = nltk.pos_tag([token])[0]
                token = lemmatizer.lemmatize(word, pos=self.getWordnetPos(tag))

            # checked after lemmatizing, so the lemma is what gets filtered
            if isStopWords and (token in stopwordSet or token.isdigit()):
                continue

            token_list.append(token)

        return token_list if token_list else [""]

    '''
    inputs:
    - tag : str --> the tag obtained from POS tagging.
    '''
    '''
    outputs:
    - str --> Wordnet POS tag.
    '''

    def getWordnetPos(self, tag):
        """Map a POS tag (as returned by nltk.pos_tag) to a WordNet POS constant."""
        # Only the first letter of the tag decides the WordNet category.
        by_initial = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'R': wordnet.ADV,
        }
        # every other tag, including an empty one, falls back to noun
        return by_initial.get(tag[:1], wordnet.NOUN)

    '''
    inputs:
    - doclist : list<str> --> list of doc/sentences.
    - isPreprocessed : bool --> has doclist already been preprocessed (tokenized)? Defaults to True.
    '''
    '''
    outputs:
    - df_tfidf : Dataframe --> the TFIDF matrix in df form. 
    - matrix : matrix --> the TFIDF matrix purely. mainly for LDA purposes.
    '''

    def GetTFIDF(self, doclist: list, isPreprocessed=True):
        """Compute the TF-IDF representation of a list of documents.

        inputs:
        - doclist : list --> token lists (isPreprocessed=True) or raw strings
          (isPreprocessed=False, in which case they are run through
          PreprocessDocument with lemmatizing and stopword removal).

        outputs:
        - df_tfidf : DataFrame --> dense TF-IDF values, one column per vocabulary term.
        - matrix : the matrix returned by TfidfVectorizer.fit_transform.
        """
        if isPreprocessed:
            token_docs = doclist
        else:
            token_docs = []
            for raw_doc in doclist:
                token_docs.append(self.PreprocessDocument(
                    raw_doc, isLemma=True, isStopWords=True))

        # the vectorizer wants plain strings, so glue every token list back together
        joined_docs = [' '.join(tokens) for tokens in token_docs]

        tfidf = TfidfVectorizer()
        matrix = tfidf.fit_transform(joined_docs)
        vocabulary = tfidf.get_feature_names_out()
        dense_values = matrix.toarray()
        df_tfidf = db.pd.DataFrame(dense_values, columns=vocabulary)

        return df_tfidf, matrix

    # input : list<str> : tokens of one document/sentence
    # output : list<(str, list<int>[300])> : list of word-vector pair for each word available on the model
    def WordEmbed(self, document: list, model):
        """Pair every token the model knows with its vector; unknown tokens are skipped.

        `model` only needs to support `token in model` and `model[token]`.
        """
        return [(token, model[token]) for token in document if token in model]

    # input : list<(str, list<float>[300])>, str : word-vector pair list and preferred agg method.
    # output : list<float>[300] : 300-d vector that represents an aggregated value of the input words
    def SentenceEmbedUnweightedFunction(self, word_embed_pair_list: list, aggregateMethod: str = "avg"):
        """Collapse the word vectors of one document into a single vector.

        inputs:
        - word_embed_pair_list : list<(str, vector)> --> word-vector pairs of one document.
        - aggregateMethod : str --> "avg" for the element-wise mean, anything
          else for the element-wise sum.

        outputs:
        - the aggregated vector, or NaN when the document has no embedded words.
        """
        # A document without any known word has no meaningful vector. Return NaN
        # for BOTH aggregation modes: "sum" used to yield a scalar 0.0 here and
        # "avg" emitted a RuntimeWarning on the way to NaN.
        if not word_embed_pair_list:
            return np.nan

        vectors = np.array([pair[1] for pair in word_embed_pair_list])
        if aggregateMethod == "avg":
            return vectors.mean(axis=0)
        return vectors.sum(axis=0)

    # input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs and preferred agg method
    # output : list<(str, list<int>[300])> : list containing sentence-vector pairs.
    def SentenceEmbedUnweighted(self, word_embedded_docs: list, aggregateMethod: str = "avg"):
        sentence_embedded_docs = []
        for i in range(len(word_embedded_docs)):
            sentence_embedded_docs.append(self.SentenceEmbedUnweightedFunction(
                word_embedded_docs[i], aggregateMethod))
        return sentence_embedded_docs

    '''
    input :
    list<list<(str, list<float>[300])>> : word-vector pair list
    matrix : tf-idf matrix for the corresponding doc
    int : the row we want
    str : preferred agg method
    '''
    # output : list<float>[300] : 300-d vector that represents an aggregated value of the input words

    def SentenceEmbedWeightedFunction(self, word_embed_pair_list: list, tfidf_matrix, index: int, aggregateMethod: str = "avg"):
        """Collapse one document's word vectors into one vector, scaling each by its TF-IDF weight.

        inputs:
        - word_embed_pair_list : list<(str, vector)> --> word-vector pairs of one document.
        - tfidf_matrix : looked up as tfidf_matrix[word][index] and tested with
          `word in tfidf_matrix`. The callers in this class pass the TF-IDF
          DataFrame (one column per term), not the sparse matrix.
        - index : int --> row of the document inside tfidf_matrix.
        - aggregateMethod : str --> "avg" for the element-wise mean of the
          weighted vectors, anything else for their element-wise sum.

        outputs:
        - the aggregated vector, or NaN when the document has no embedded words.
        """
        # No known word means no vector. Return NaN for both modes ("sum" used
        # to yield a scalar 0.0, "avg" emitted a RuntimeWarning).
        if not word_embed_pair_list:
            return np.nan

        weighted_wvs = []
        for pair in word_embed_pair_list:
            word, vector = pair[0], pair[1]
            # words missing from the TF-IDF vocabulary contribute a zero vector
            tfidf_weight = tfidf_matrix[word][index] if word in tfidf_matrix else 0
            weighted_wvs.append(vector * tfidf_weight)

        weighted_wvs = np.array(weighted_wvs)
        if aggregateMethod == "avg":
            # NOTE: plain mean over the words, not divided by the sum of weights
            return weighted_wvs.mean(axis=0)
        return weighted_wvs.sum(axis=0)

    # input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs, TF-IDF matrix of the corpus, and preferred agg method
    # output : list<(str, list<float>[300])> : list containing sentence-vector pairs.
    def SentenceEmbedWeighted(self, word_embedded_docs: list, tfidf_matrix, aggregateMethod="avg"):
        sentence_embedded_docs = []
        for i in range(len(word_embedded_docs)):
            sentence_embedded_docs.append(self.SentenceEmbedWeightedFunction(
                word_embedded_docs[i], tfidf_matrix, i, aggregateMethod))
        return sentence_embedded_docs

    '''
    input:
    - doclist : list<list<str>> --> list of tokenized sentences/docs
    - topics : int --> number of inferred topics.
    - use_tfidf : bool --> use TFIDF or not? defaults to yes.
    '''
    '''
    output:
    - docFeatureList : list<list<float>> --> topic distribution for each sentence/doc
    '''

    def GetLDADistribution(self, doclist: list, topics: int = 5, use_tfidf: bool = True):
        """Train an LDA model on the documents and return each document's topic distribution.

        inputs:
        - doclist : list<list<str>> --> tokenized documents.
        - topics : int --> number of topics to infer.
        - use_tfidf : bool --> when True the corpus is built from
          self.tfidf_matrix (which must already be set, row i matching
          doclist[i]); otherwise from bag-of-words counts of doclist.

        outputs:
        - docFeatureList : list<list<float>> --> one list of `topics` values per
          document; topics the model does not report for a document stay 0.0.
        """
        # nothing to train on
        if not doclist:
            return []

        if use_tfidf:
            # one (term id, tf-idf weight) list per document
            new_corpus = [
                [(j, self.tfidf_matrix[i, j]) for j in self.tfidf_matrix[i].indices]
                for i in range(len(doclist))
            ]
            # Build the dictionary ONCE from the finished corpus; it used to be
            # rebuilt after every single document.
            gensim_dict = corpora.Dictionary.from_corpus(new_corpus)
        else:
            gensim_dict = corpora.Dictionary(doclist)
            new_corpus = [gensim_dict.doc2bow(doc) for doc in doclist]

        lda_model = gensim.models.LdaModel(
            new_corpus, num_topics=topics, id2word=gensim_dict)

        docFeatureList = []
        for doc_topic_dist in lda_model[new_corpus]:
            # the model yields (topic id, probability) pairs; fill a fixed-size row
            featureList = [0.0] * topics
            for topic_id, probability in doc_topic_dist:
                featureList[topic_id] = probability
            docFeatureList.append(featureList)

        return docFeatureList

    '''
    inputs:
    - vectors : list<list<float>> --> list of features corresponding to each doc/sentence
    - epsilon : float --> the radius within which points are considered connected.
    - min : int --> minimum amount of connected points for a point to be considered a core point of a cluster.
    '''
    '''
    output:
    clusters : list<int> --> a list of integers to assign each data point to a cluster. -1 means outlier.
    '''

    def GetDBSCANClusters(self, vectors, epsilon: float, min: int):
        """Cluster the feature vectors with DBSCAN and return one label per vector.

        `epsilon` is passed as DBSCAN's eps and `min` as its min_samples.
        (`min` shadows the builtin, but the name is part of the public signature.)
        """
        clusterer = DBSCAN(eps=epsilon, min_samples=min)
        return clusterer.fit_predict(vectors)

    '''
    inputs :
    - clusters : list<int> --> a list of clusters assigned to each doc/sentence
    - df : DataFrame --> the dataframe in question
    - isReturnSeparate : bool --> split return or not. Defaults to split (for some reason...)
    '''
    '''
    outputs:
    - dfOutliers : DataFrame --> the dataframe whose answers have been marked as outliers.
    - dfGoods : DataFrame --> the dataframe whose answers have not been marked as outliers.
    '''

    def ReturnClusters(self, clusters: list, df: pd.DataFrame, isReturnSeparate:bool=True):
        """Attach the cluster labels to the frame and hand back the result.

        inputs:
        - clusters : list<int> --> one label per row of df; -1 marks an outlier.
        - df : DataFrame --> receives a "Cluster Assignment" column (written in
          place, as before).
        - isReturnSeparate : bool --> split the result into outliers / non-outliers.

        outputs:
        - isReturnSeparate=True : (dfOutliers, dfGoods).
        - isReturnSeparate=False : a single frame with the old index moved into
          a regular column.
        """
        df["Cluster Assignment"] = clusters

        if not isReturnSeparate:
            # Non-inplace reset: the caller's frame keeps its index and does not
            # gain an extra "index"/"level_0" column on every call.
            return df.reset_index()

        is_outlier = df["Cluster Assignment"] == -1
        dfOutliers = df.loc[is_outlier]
        dfGoods = df.loc[~is_outlier]
        return dfOutliers, dfGoods

    '''
    inputs:
    - method : str --> LDA or Embedding.
    - isWeighted : bool --> use weights or not
    - nTopics : int --> for LDA.
    '''
    '''
    - outputs : none. This is an internal function
    '''
    def ExtractFeatures(self, method:str="LDA", isWeighted:bool=True, nTopics:int=5, aggregateMethod:str="avg"):
        if method == "LDA":
            # if weighted, prepare tf-idf matrix.
            if isWeighted:
                self.tfidf_df, self.tfidf_matrix = self.GetTFIDF(
                    self.preprocessedDocs)

            # use the in-house options for weighted or not.
            self.doc_embeds = self.GetLDADistribution(
                self.preprocessedDocs, topics=nTopics, use_tfidf=isWeighted)
        else:
            # if weighted, prepare TF-IDF and embed sentences with weight.
            if isWeighted:
                self.tfidf_df, self.tfidf_matrix = self.GetTFIDF(
                    self.preprocessedDocs)
                self.doc_embeds = self.SentenceEmbedWeighted(
                    self.wordEmbeddedDocs, self.tfidf_df, aggregateMethod)
            else:
                self.doc_embeds = self.SentenceEmbedUnweighted(
                    self.wordEmbeddedDocs, aggregateMethod)
                                                                                                         

    def GetAnomalies_DBSCAN_Embedding(self, isWeighted: bool = True, aggregateMethod: str = "avg", epsilon: float = 0.01, minsamp: int = 2, isReturnSeparate:bool=True):
        # df and model are obtained by invoking a separate function, and it is assumed to be already available when invoking this function.

        # preprocess each doc/sentence
        self.preprocessedDocs = [self.PreprocessDocument(
            doc, isLemma=True, isStopWords=True) for doc in self.df["answer"]]

        # extract feature with embedding
        self.wordEmbeddedDocs = [self.WordEmbed(
            doc, self.model) for doc in self.preprocessedDocs]

        # if weighted, prepare TF-IDF and embed sentences with weight.
        if isWeighted:
            self.tfidf_df, self.tfidf_matrix = self.GetTFIDF(
                self.preprocessedDocs)
            self.doc_embeds = self.SentenceEmbedWeighted(
                self.wordEmbeddedDocs, self.tfidf_df, aggregateMethod)
        else:
            self.doc_embeds = self.SentenceEmbedUnweighted(
                self.wordEmbeddedDocs, aggregateMethod)

        # append embedding to each document
        if self.doc_embeds:
            self.df["Document Embed"] = self.doc_embeds
            self.df = self.df.dropna(subset=["Document Embed"]) # preventing NaN in the simplest fucking way in know.

        # apply DBSCAN
        self.clusters = self.GetDBSCANClusters(
            list(self.df["Document Embed"]), epsilon, minsamp)

        # return the dfs
        return self.ReturnClusters(self.clusters, self.df, isReturnSeparate=isReturnSeparate)

    def GetAnomalies_DBSCAN_LDA(self, isWeighted: bool = True, topics: int = 5, epsilon: float = 0.01, minsamp: int = 5, isReturnSeparate:bool=True):
        # df and model are obtained by invoking a separate function, and it is assumed to be already available when invoking this function.

        # preprocess each doc/sentence
        self.preprocessedDocs = [self.PreprocessDocument(
            doc, isLemma=True, isStopWords=True) for doc in self.df["answer"]]

        # if weighted, prepare tf-idf matrix.
        if isWeighted:
            self.tfidf_df, self.tfidf_matrix = self.GetTFIDF(
                self.preprocessedDocs)

        # use the in-house options for weighted or not.
        self.doc_embeds = self.GetLDADistribution(
            self.preprocessedDocs, topics=topics, use_tfidf=isWeighted)

        # append embedding to each document
        if self.doc_embeds:
            self.df["Document Embed"] = self.doc_embeds
            self.df = self.df.dropna(subset=["Document Embed"]) # preventing NaN in the simplest fucking way in know.

        # apply DBSCAN
        self.clusters = self.GetDBSCANClusters(
            list(self.df["Document Embed"]), epsilon, minsamp)

        # return the dfs
        return self.ReturnClusters(self.clusters, self.df, isReturnSeparate=isReturnSeparate)

    def GetAnomalies(self, method: str, model, isWeighted: bool = True, aggregateMethod: str = "avg", epsilon=0.01, minsamp=2, topics=5):
        # initialize
        # extract the dataset
        self.df = self.GetDF()
    
    def GetAnomalies_IsolationForest_Embedding(self, isWeighted:bool=True, aggregateMethod:str="avg", isReturnSeparate:bool=True):
        # df and model are obtained by invoking a separate function, and it is assumed to be already available when invoking this function.

        # preprocess each doc/sentence
        self.preprocessedDocs = [self.PreprocessDocument(
            doc, isLemma=True, isStopWords=True) for doc in self.df["answer"]]

        # extract feature with embedding
        self.wordEmbeddedDocs = [self.WordEmbed(
            doc, self.model) for doc in self.preprocessedDocs]

        # if weighted, prepare TF-IDF and embed sentences with weight.
        if isWeighted:
            self.tfidf_df, self.tfidf_matrix = self.GetTFIDF(
                self.preprocessedDocs)
            self.doc_embeds = self.SentenceEmbedWeighted(
                self.wordEmbeddedDocs, self.tfidf_df, aggregateMethod)
        else:
            self.doc_embeds = self.SentenceEmbedUnweighted(
                self.wordEmbeddedDocs, aggregateMethod)

        # append embedding to each document
        if self.doc_embeds:
            self.df["Document Embed"] = self.doc_embeds
            self.df = self.df.dropna(subset=["Document Embed"]) # preventing NaN in the simplest fucking way in know.
        
        # apply Isolation Forest
        self.ifResults = self.GetIFResults(vector=list(self.df["Document Embed"]))

        return self.ReturnClusters(self.ifResults, self.df, isReturnSeparate=isReturnSeparate)
        

    def GetIFResults(self, vector, n_estimators: int = 100, contamination=0.1, random_state=None):
        """Label every feature vector as normal (0) or anomalous (-1) with an Isolation Forest.

        inputs:
        - vector : list of feature vectors, one per document.
        - n_estimators, contamination : forwarded to IsolationForest; the
          defaults are the values that used to be hard-coded.
        - random_state : forwarded to IsolationForest; pass an int for
          reproducible labels.

        outputs:
        - integer array with one label per input vector (0 = normal, -1 = anomaly).
        """
        isolationForest = IsolationForest(
            n_estimators=n_estimators, contamination=contamination, random_state=random_state)
        isolationForest.fit(vector)
        scores = isolationForest.decision_function(vector)

        # Negative scores are anomalies. np.where builds a fresh integer array,
        # so the labels are ints like DBSCAN's instead of the floats (-1.0 /
        # 0.0) that came from overwriting the score array in place.
        return np.where(scores >= 0, 0, -1)



In [25]:
# Create the database handler for "testdb.db" and hand it to the detector.
# Because no `model` is passed, the constructor loads the named embedding
# model through gensim's downloader.
dh = db.DatabaseHandler("testdb.db")
ad = AnomalyDetector(dh=dh, modelName="glove-wiki-gigaword-300")

In [30]:
# Load the records with event_id 18 into ad.df, one row per answer (no sentence splitting).
ad.SetDFFromDB(ad.dh, 18, splitBySentences=False)


In [23]:
# Display the detector's current working dataframe.
ad.df

Unnamed: 0,id,event_title,speaker,question,answer,Document Embed
0,1,Why Did God Create Us?,PY,How did we come to exist?,It is not by my hand that I was once again gi...,"[-0.037256964, -0.005106877, -0.014977815, -0...."
1,2,Why Did God Create Us?,GRE,How did we come to exist?,Even if a thousand men fall at your side and ...,"[-0.020165913, 0.013082339, -0.014995785, -0.0..."
2,3,Why Did God Create Us?,PY,Do you think you need to exist?,We don't really need to exist. Even if I don't...,"[-0.042173408, 0.023831157, -0.027844723, 0.02..."
3,4,Why Did God Create Us?,JJ,Do you think you need to exist?,"I don't think we need to exist. As Peter said,...","[-0.03246461, 0.012234184, -0.014134587, 0.040..."
4,5,Why Did God Create Us?,TMS,Do you think you need to exist?,I think we are a part of a bigger picture.,"[-0.003818011, 0.05617611, -0.049015313, -0.08..."
5,6,Why Did God Create Us?,TMS,Do you think you need to exist?,"Yes, because we're a part of a community where...","[-0.037814233, -0.048823945, -0.019741789, -0...."
6,7,Why Did God Create Us?,DJ,Do you think you need to exist?,"In general, humans have made many impacts. Th...","[0.038543474, 0.02189518, -0.03183997, -0.0730..."
7,8,Why Did God Create Us?,RIC,Do you think you need to exist?,"This got me thinking, if we question the purpo...","[-0.04423244, -0.00738029, -0.009012498, -0.03..."
8,9,Why Did God Create Us?,GRE,"From a bigger perspective, why did God create us?",It's a mystery. We won't know. Religion said s...,"[-0.023103386, -0.0318319, -0.05187923, -0.011..."
9,10,Why Did God Create Us?,TMS,"From a bigger perspective, why did God create us?",I think God himself needs an outlet to His lov...,"[-0.06028241, -0.038177412, -0.0744806, -0.008..."


In [32]:
# Run the Isolation-Forest detector on TF-IDF-weighted, averaged embeddings.
# With isReturnSeparate=True the result is the pair (outliers, non-outliers).
anomaly, good = ad.GetAnomalies_IsolationForest_Embedding(isWeighted=True, aggregateMethod="avg", isReturnSeparate=True)
# Show the rows flagged as anomalies (Cluster Assignment == -1).
anomaly

Unnamed: 0,index,id,event_title,speaker,question,answer,Document Embed,Cluster Assignment
28,29,9,Why Did God Create Us?,GRE,"From a bigger perspective, why did God create us?",It's a mystery,"[-0.17551, -0.082361, 0.35437, 0.057444, 0.627...",-1.0
30,31,9,Why Did God Create Us?,GRE,"From a bigger perspective, why did God create us?","Religion said so, but then it is religion, no...","[-0.067476034, -0.18543604, -0.1247678, 0.0038...",-1.0
35,36,12,Why Did God Create Us?,PAU,"From a bigger perspective, why did God create us?",How should I know,"[-0.21054, 0.1382, 0.035328, 0.03977, -0.10913...",-1.0
50,52,23,Why Did God Create Us?,GRE,"If there is a creator, How would the creator b...",A dictator is still a dictator,"[0.028574442, -0.33913907, 0.25452283, -0.5800...",-1.0
52,54,23,Why Did God Create Us?,GRE,"If there is a creator, How would the creator b...",This is how dictatorship works,"[0.17178635, 0.06104046, -0.026714057, -0.2459...",-1.0
56,58,25,Why Did God Create Us?,ANT,Why would a Holy God create a creation that ha...,I am unsure of what aspect it is,"[0.03384018, 0.05262956, -0.0057979524, -0.195...",-1.0
57,59,26,Why Did God Create Us?,YOR,Why would a Holy God create a creation that ha...,Holiness is a character,"[-0.23910463, -0.06778326, -0.1642397, -0.1269...",-1.0
60,62,27,Why Did God Create Us?,GRE,Why would a Holy God create a creation that ha...,This concept is too Christian,"[-0.05881997, -0.08591559, -0.19163653, 0.0403...",-1.0
