In [200]:
import pandas as pd
import numpy as np
from gensim.models import HdpModel
from gensim import corpora
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, ConfusionMatrixDisplay, accuracy_score, RocCurveDisplay

In [22]:
def converter(x):
    """Parse a stringified token list from a CSV cell into a list of strings.

    The surrounding square brackets are stripped from ``x``, the remainder is
    split on ``", "`` and single quotes are stripped from both ends of every
    piece, so ``"['a', 'b']"`` becomes ``['a', 'b']``.

    An empty cell or an empty list (``"[]"``) yields ``[]``. Splitting the
    empty string would otherwise return ``['']`` and inject a spurious
    empty-string token into the document.

    NOTE(review): a token that itself contains ``", "`` is still split apart
    -- confirm the stored data never contains such tokens.
    """
    inner = x.strip("[]")
    if not inner:
        return []
    return [piece.strip("'") for piece in inner.split(", ")]
# Read the dataset, using the first CSV column as the index; the two token
# columns are passed through `converter` while the file is parsed.
data = pd.read_csv(
    'data/data.csv',
    index_col=0,
    converters=dict.fromkeys(("Email", "Lemmatized"), converter),
)

In [188]:
def distarray(dist, numtopics):
    """Turn sparse per-document topic distributions into a dense matrix.

    dist      -- sequence with one entry per document, each an iterable of
                 (topic, prob) pairs
    numtopics -- number of columns in the result

    Returns a float array of shape (len(dist), numtopics) in which row r holds
    the probabilities listed for document r; topics a document does not list
    stay at 0.
    """
    dense = np.zeros((len(dist), numtopics))
    for row, pairs in enumerate(dist):
        for topic, prob in pairs:
            dense[row, topic] = prob
    return dense

# Shuffled 10-fold split of the rows; the seed is pinned only so that reruns
# produce the same folds.
kf = KFold(shuffle=True, n_splits=10, random_state=63)
# Materialise the (train, test) index pairs produced by the splitter as a list.
folds = [*kf.split(data)]

# non-lemmatized version
# For every fold: fit an HDP topic model on the training emails, represent each
# email by its dense topic distribution, train a random forest on those
# features and score it on the held-out emails with the Matthews correlation
# coefficient (MCC). The topic model, classifier and fold index with the
# highest MCC are kept in `model`, `classifier` and `bestsplit`.

# Start below any reachable MCC (range [-1, 1]) so the first fold always
# becomes the initial best. Starting at 0 would leave `model`, `classifier`
# and `bestsplit` undefined (or stale from an earlier kernel run) whenever no
# fold scores above 0.
bestmcc = float("-inf")
for i, (train, test) in enumerate(folds):
    # KFold yields positional row numbers, so select with .iloc; .loc would
    # look them up as index labels, which only coincides for a 0..n-1 index.
    traindata = data.iloc[train]
    testdata = data.iloc[test]

    # The vocabulary is built from the training emails only.
    dictionary = corpora.Dictionary(traindata.Email)
    traincorpus = list(map(dictionary.doc2bow, traindata.Email))
    testcorpus = list(map(dictionary.doc2bow, testdata.Email))

    # Seed the topic model as well; otherwise the fold scores (and therefore
    # the selected best fold) change from run to run despite the seeded
    # splitter and classifier.
    hdp = HdpModel(traincorpus, dictionary, random_state=63)
    traindist = list(hdp[traincorpus])
    testdist = list(hdp[testcorpus])

    # Dense feature matrices: one row per email, one column per topic.
    numtopics = len(hdp.get_topics())
    trainarray = distarray(traindist, numtopics)
    testarray = distarray(testdist, numtopics)

    rf = RandomForestClassifier(random_state=63)
    rf = rf.fit(trainarray, traindata.Spam)
    testresults = rf.predict(testarray)
    trueresults = list(testdata.Spam)

    mcc = matthews_corrcoef(trueresults, testresults)
    if mcc > bestmcc:
        bestmcc = mcc
        model = hdp
        classifier = rf
        bestsplit = i