In [130]:
import ast
import re

import gensim
import pandas as pd
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [148]:
# Compiled once: matches runs of non-word characters to strip out of each token.
_NON_WORD_RE = re.compile(r'\W+')
# Holds the English stop-word set after first use so it is not reloaded per document.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Tokenize a document into lowercase, punctuation-free, non-stop-word tokens.

    Args:
        text: The document as a string.

    Returns:
        A list of tokens in their original order. Each token has had every
        non-word character removed; tokens that become empty after that
        stripping, or that are English stop words, are dropped.
    """
    raw_tokens = word_tokenize(text.lower())
    stop_words = _english_stop_words()
    # Strip each token exactly once, then filter on the stripped form.
    stripped_tokens = (_NON_WORD_RE.sub('', token) for token in raw_tokens)
    return [token for token in stripped_tokens if token and token not in stop_words]

In [162]:
# Value names to analyse; each one is also used as the stem of a topics CSV file name.
values = [
    'building on past work',
    'ease of implementation',
    'fairness',
    'novelty',
    'performance',
    'reproducibility',
]

In [163]:
# For every value, load its topics CSV and print the number of distinct topics
# together with their UMass coherence score.
for value in values:
    print('Value:', value)
    df = pd.read_csv(f'parth code/cs224n-final/topics/{value}.csv')
    # Keep only rows that have a document. The explicit copy makes the column
    # assignment below operate on an independent frame rather than on a slice
    # of the frame that was just read.
    df = df[df.Document.notna()].copy()
    # Representation is expected to hold the text of a Python literal (a
    # sequence of topic words). ast.literal_eval parses literals only, whereas
    # eval() would execute any expression found in the file.
    df['Representation'] = df['Representation'].map(ast.literal_eval)

    # Build the gensim dictionary from the CountVectorizer vocabulary.
    vectorizer = CountVectorizer(stop_words='english')
    data_vectorized = vectorizer.fit_transform(df.Document)
    corpus = gensim.matutils.Sparse2Corpus(data_vectorized, documents_columns=False)
    # vocabulary_ maps word -> column index; gensim wants index -> word.
    id2word = {word_id: word for word, word_id in vectorizer.vocabulary_.items()}
    dictionary = gensim.corpora.Dictionary.from_corpus(corpus, id2word=id2word)

    # NOTE(review): the texts are tokenized by preprocess_text while the
    # dictionary vocabulary comes from CountVectorizer; the two tokenizers and
    # stop-word lists differ - confirm this mismatch is intended.
    data_processed = df.Document.map(preprocess_text)
    # Each distinct Representation value is treated as one topic.
    topics = df['Representation'].drop_duplicates()
    print('Num Topics:', len(topics))
    coherence_model = CoherenceModel(topics=topics, texts=data_processed, dictionary=dictionary, coherence='u_mass')
    coherence = coherence_model.get_coherence()
    print('Coherence:', coherence)

Value: building on past work
Num Topics: 11
Coherence: -2.5962737034813648
Value: ease of implementation
Num Topics: 1
Coherence: -7.825203541329992
Value: fairness
Num Topics: 1
Coherence: -14.289849745465883
Value: novelty
Num Topics: 1
Coherence: -1.3689293867310974
Value: performance
Num Topics: 10
Coherence: -3.259315154759512
Value: reproducibility
Num Topics: 4
Coherence: -3.64009805505403
