<h2>IMPORTS</h2>

In [223]:
# imports go here
import numpy as np
import db
import inflect
import string
import nltk
import gensim
import matplotlib.pyplot as plt
import gensim.downloader as api
from nltk.test.gensim_fixt import setup_module
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

<h2> Init (don't forget to do this when making the class, and only once.)

In [216]:
setup_module()
model = api.load("glove-wiki-gigaword-300")

# see here : https://radimrehurek.com/gensim/downloader.html for saving.

<h2>Data Retrieval</h2>

In [316]:
# input : DatabaseHandler
# output : DataFrame
def GetDF(dh:db.DatabaseHandler, selector: str, eventID: int, splitBySentences: bool = False):
    df = dh.get_recordDataJoinedDF(selector=selector, ID=eventID)
    if splitBySentences:
        # df.set_index('id', inplace=True)
        df['answer'] = df['answer'].str.split('.')
        df = df.explode("answer", True)
        df.drop(df[df["answer"] == ""].index, inplace=True)
        df.reset_index(drop=True,inplace=True)
    return df


dh = db.DatabaseHandler("testdb.db")  # db connection
df = GetDF(dh, "event_id", 20, False)
df

Unnamed: 0,id,event_title,speaker,question,answer
0,230,Of Choice and Life,KSMG,Pro life or pro choice?,I am pro choice. I feel bad for the baby. Peop...
1,231,Of Choice and Life,JJ,Pro life or pro choice?,"Logically, both make sense. Conflicted between..."
2,232,Of Choice and Life,JER,Pro life or pro choice?,"I'm pro life. Because in my belief, if a fetus..."
3,233,Of Choice and Life,YOT,Pro life or pro choice?,"Prochoice. If she was a victim of rape, etc, s..."
4,234,Of Choice and Life,GRE,Pro life or pro choice?,Prolife for religious reasons. Being religious...
5,235,Of Choice and Life,RIC,Pro life or pro choice?,Prochoice. Unless the baby is normal.
6,236,Of Choice and Life,GRE,Do you think abortion should be legal?,Legal - not really legal - legal for special c...
7,237,Of Choice and Life,MAR,Do you think abortion should be legal?,Agree with other solutions besides abortion - ...
8,238,Of Choice and Life,RIC,Do you think abortion should be legal?,Should be legal with criteria. Agree with Indo...
9,239,Of Choice and Life,YOR,Do you think abortion should be legal?,What counts as a person? Is fetus a person?


<h2>Preprocessing</h2>

In [170]:
# input : sentence/document (string); parameters
# output : a list of word tokens (list<string>)
def PreprocessDocument(doc:str, isLemma:bool=False, isStopWords:bool=False, isInflect:bool=False, isNumberFiltered:bool=True):
    inflector = inflect.engine()
    stopwordSet = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    punctuations = string.punctuation
    # if numbers are filtered, add that to the punctuation string
    if isNumberFiltered:
        punctuations += "1234567890"

    # case fold
    doc = doc.lower()

    # remove puncs
    doc = "".join([char for char in doc if char not in punctuations])

    # tokenize it.
    token_list = nltk.word_tokenize(doc)

    for i in range(len(token_list)):
        # if inflect
        if isInflect:
            if token_list[i].isdigit():
                token_list[i] = inflector.number_to_words(token_list[i])

        # if lemma
        if isLemma:
            tagged_word = nltk.pos_tag([token_list[i]])
            wordnet_pos = getWordnetPos(tagged_word[0][1])
            token_list[i] = lemmatizer.lemmatize(tagged_word[0][0], pos=wordnet_pos)
        
        # if stopword
        if isStopWords:
            if token_list[i] in stopwordSet or token_list[i].isdigit():
                token_list[i] = "#" # mark as #
        
    # remove the marked strings
    token_list = [token for token in token_list if token != "#"]

    if token_list:
        return token_list
    return [""]

def getWordnetPos(tag):
    """Map POS tag to WordNet POS tag"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # solves as noun by default.
    

In [253]:
mySentence = df.loc[25]["answer"]
myTokenizedSentence = PreprocessDocument(mySentence, isStopWords=True, isLemma=True)
myTokenizedSentence

['']

<h2>Get word set, whatever for

In [48]:
# input : list<str>
# output : set<str> --> no duplicates, 
# def GetWordSet(df:db.pd.DataFrame):
#     bigtext = ""
    

<h2> TF-IDF function

In [156]:
# input : list<str>
# output : Dataframe, matrix
def GetTFIDF(doclist:list, isPreprocessed=True):
    if not isPreprocessed:
        doclist = [PreprocessDocument(doc, isLemma=True, isStopWords=True) for doc in doclist]
    else:
        # just tokenize the thing
        doclist = [nltk.word_tokenize(doc) for doc in doclist]
        
    flat_doclist = [' '.join(doc) for doc in doclist] # turn into one big corpus
    vectorizer = TfidfVectorizer()
    matrix  = vectorizer.fit_transform(flat_doclist)
    tfidf_keys = vectorizer.get_feature_names_out()
    df_tfidf = db.pd.DataFrame(matrix.toarray(), columns=tfidf_keys)

    return df_tfidf, matrix


In [317]:
my_tfidf, my_matrix = GetTFIDF(df["answer"], isPreprocessed=True)
my_tfidf

Unnamed: 0,100,12,abilities,abort,aborted,abortion,abortions,about,accommodate,actually,...,with,world,worry,worse,worst,would,wrong,year,yet,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.198725,0.0,0.0,0.0,...,0.116523,0.174683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.138777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114713
2,0.0,0.0,0.0,0.129121,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07571,0.0,0.0,0.0,0.0,0.129121,0.0,0.0,0.0,0.187637
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.126945,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.138002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114073
7,0.0,0.0,0.0,0.0,0.0,0.17208,0.0,0.0,0.0,0.0,...,0.127209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.348811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h2> Word- and Sentence-Embedding

In [158]:
# input : list<str> : tokens of one document/sentence
# output : list<(str, list<int>[300])> : list of word-vector pair for each word available on the model
def WordEmbed(document: list, model):
    word_embed_pairs = []
    for word in document:
        if word in model:
            word_embed_pairs.append((word, model[word]))
    return word_embed_pairs

# input : list<(str, list<float>[300])>, str : word-vector pair list and preferred agg method.
# output : list<float>[300] : 300-d vector that represents an aggregated value of the input words


def SentenceEmbedUnweightedFunction(word_embed_pair_list: list, aggregateMethod: str = "avg"):
    wvs = []
    for pair in word_embed_pair_list:
        wvs.append(pair[1])
    if aggregateMethod == "avg":
        return np.mean(wvs, axis=0)
    else:
        return np.sum(wvs, axis=0)

# input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs and preferred agg method
# output : list<(str, list<int>[300])> : list containing sentence-vector pairs.


def SentenceEmbedUnweighted(word_embedded_docs: list, aggregateMethod: str = "avg"):
    sentence_embedded_docs = []
    for i in range(len(word_embedded_docs)):
        sentence_embedded_docs.append(SentenceEmbedUnweightedFunction(
            word_embedded_docs[i], aggregateMethod))
    return sentence_embedded_docs


'''
input :
list<list<(str, list<float>[300])>> : word-vector pair list
matrix : tf-idf matrix for the corresponding doc
int : the row we want
str : preferred agg method
'''
# output : list<float>[300] : 300-d vector that represents an aggregated value of the input words


def SentenceEmbedWeightedFunction(word_embed_pair_list: list, tfidf_matrix, index: int, aggregateMethod: str = "avg"):
    weighted_wvs = []
    # multiplies each word with its TF-IDF value in the corresponding row. Is 0 if word isn't found somehow.
    for pair in word_embed_pair_list:
        tfidf_weight = 0
        if pair[0] in tfidf_matrix:
            tfidf_weight = tfidf_matrix[pair[0]][index]
        weighted_wvs.append(pair[1] * tfidf_weight)
    # turn into array for fast aggregating
    weighted_wvs = np.array(weighted_wvs)
    if aggregateMethod == "avg":
        sentence_vector = np.mean(weighted_wvs, axis=0)
    else:
        sentence_vector = np.sum(weighted_wvs, axis=0)
    return sentence_vector

# input : list<list<(str, list<float>[300])>>, str : list containing word-vector pairs, TF-IDF matrix of the corpus, and preferred agg method
# output : list<(str, list<float>[300])> : list containing sentence-vector pairs.


def SentenceEmbedWeighted(word_embedded_docs: list, tfidf_matrix, aggregateMethod="avg"):
    sentence_embedded_docs = []
    for i in range(len(word_embedded_docs)):
        sentence_embedded_docs.append(SentenceEmbedWeightedFunction(
            word_embedded_docs[i], tfidf_matrix, i, aggregateMethod))
    return sentence_embedded_docs


In [318]:
docs = [PreprocessDocument(doc, isLemma=True, isStopWords=True) for doc in df["answer"]]
word_embedded_docs = []
for doc in docs:
    word_embedded_docs.append(WordEmbed(doc, model))

# sentence_embed("bababui", tfidf_matrix=my_tfidf, index=1)
doc_embeds = SentenceEmbedWeighted(word_embedded_docs, my_tfidf, "avg")
df["Document Embed"] = doc_embeds
df.dropna(inplace=True)
df.reset_index(inplace=True) # don't forget to add this after every row-altering operation.
df

Unnamed: 0,index,id,event_title,speaker,question,answer,Document Embed
0,0,230,Of Choice and Life,KSMG,Pro life or pro choice?,I am pro choice. I feel bad for the baby. Peop...,"[0.004874857, 0.008332232, 0.002196011, 0.0127..."
1,1,231,Of Choice and Life,JJ,Pro life or pro choice?,"Logically, both make sense. Conflicted between...","[0.0005951547, 0.008526833, 0.0021405662, 0.00..."
2,2,232,Of Choice and Life,JER,Pro life or pro choice?,"I'm pro life. Because in my belief, if a fetus...","[0.004796755, 0.0028883053, 0.001583014, 0.013..."
3,3,233,Of Choice and Life,YOT,Pro life or pro choice?,"Prochoice. If she was a victim of rape, etc, s...","[0.024734557, 0.0059358147, 0.008119139, 0.030..."
4,4,234,Of Choice and Life,GRE,Pro life or pro choice?,Prolife for religious reasons. Being religious...,"[0.018496333, -0.007036122, 0.011387986, 0.025..."
5,5,235,Of Choice and Life,RIC,Pro life or pro choice?,Prochoice. Unless the baby is normal.,"[-0.0024630986, 0.008384034, 0.0044532903, 0.0..."
6,6,236,Of Choice and Life,GRE,Do you think abortion should be legal?,Legal - not really legal - legal for special c...,"[-0.019265054, 0.017447688, -0.018532839, 0.00..."
7,7,237,Of Choice and Life,MAR,Do you think abortion should be legal?,Agree with other solutions besides abortion - ...,"[-0.0006021686, 0.008229874, 0.0176754, 0.0222..."
8,8,238,Of Choice and Life,RIC,Do you think abortion should be legal?,Should be legal with criteria. Agree with Indo...,"[-0.004423037, 0.011602763, 0.0057513397, 0.02..."
9,9,239,Of Choice and Life,YOR,Do you think abortion should be legal?,What counts as a person? Is fetus a person?,"[0.101447776, -0.09087756, 0.021135462, 0.0546..."


<h2>LDA Approach

In [147]:
def GetLDADistribution():
    pass

<h2>Anomaly detection : DBSCAN

In [180]:
def GetDBSCANClusters(vectors, epsilon:float, min:float):
    dbscan = DBSCAN(eps=epsilon, min_samples=min)
    clusters = dbscan.fit_predict(vectors)
    # plt.title("to the depths of depravity {} and the cusp of blasphemy {}.".format(epsilon, min))
    # plt.scatter(vectors[:, 0], vectors[:, 1], c=clusters)
    # plt.show()
    print(clusters)
    return clusters
    
    

In [319]:
GetDBSCANClusters(list(df["Document Embed"]), 1, 2)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

<h2>Shrink and draw with TSNE

In [182]:
from sklearn.manifold import TSNE

# returns a tsne shrinkage also...
def plot_documents(df:db.pd.DataFrame, isPrint=False):
    labels = np.arange(0, df.index.stop, 1)
    values = list(df["Document Embed"]) # don't forget to list it first, then np array it later.

    # train model
    tsne_model = TSNE(perplexity=20, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(np.array(values))

    # plot
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
    
    if isPrint:
        plt.figure(figsize=(20, 20)) 
        for i in range(len(x)):
            plt.scatter(x[i],y[i])
            plt.annotate(labels[i],
                        xy=(x[i], y[i]),
                        xytext=(5, 2),
                        textcoords='offset points',
                        ha='right',
                        va='bottom')
        plt.show()
    # use the thing to find new clusters.
    return new_values

In [322]:
clusters = GetDBSCANClusters(list(df["Document Embed"]), 0.3, 2)
df["Cluster Assignment"] = clusters
dfSupposedOutliers = df.loc[df["Cluster Assignment"] == -1]
dfSupposedGoods = df.loc[df["Cluster Assignment"] != -1]
dfSupposedGoods.reset_index(inplace=True)
dfSupposedOutliers.reset_index(inplace=True)

for i in range(len(dfSupposedOutliers.index)):
    print(dfSupposedOutliers.loc[i]["question"], " | ", dfSupposedOutliers.loc[i]["answer"])

print("#" * 80)
for i in range(len(dfSupposedGoods.index)):
    print(dfSupposedGoods.loc[i]["question"], " | ", dfSupposedGoods.loc[i]["answer"])


[ 0  0  0  0  0 -1  0  0  0 -1 -1  0  0  0  0  0  0  0  0  0]
Pro life or pro choice?  |  Prochoice. Unless the baby is normal.
Do you think abortion should be legal?  |  What counts as a person? Is fetus a person?
Do you think abortion should be legal?  |  Does not being a Christian still fall under Christian rules?
################################################################################
Pro life or pro choice?  |  I am pro choice. I feel bad for the baby. People have abortions for a reason. If it is forced, it will grow imperfectly. Born into the world in a state that is not optimal. With prochoice, her fate is more manageable.
Pro life or pro choice?  |  Logically, both make sense. Conflicted between the two because pro life - you remove their chance of being born, pro choice - there are events that make the baby if born not 100 percent healthy. Also if the parents are not ready, financially etc. Life is a life - no matter what form it is
Pro life or pro choice?  |  I'm pro 

<h2>Anomaly detection : LOF

<h2>Anomaly detection : Isolation Forest (sklearn)

<h2>Final Function