In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
import gensim
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import json
import os
from collections import defaultdict

In [19]:
# Load the annotated ACL sentences and discard rows whose `response` is missing,
# then show the resulting frame as the cell output.
df = pd.read_csv('cleaned_acl_gemini.csv').dropna(subset=['response'])
df

Unnamed: 0,acl_id,value,response
0,W12-2037,performance,Experimental results using Wikipedia as the co...
6,P07-1058,performance,The current performance level only stresses th...
8,C14-1013,building on past work,"We extract the terms from Twitter, and from th..."
11,C14-1013,novelty,We evaluate our system with the local search t...
12,C14-1013,performance,"For precision at rank three, the improvement f..."
...,...,...,...
5749,W19-1410,reproducibility,We use the same POS tag sets for all language ...
5750,W16-4804,building on past work,State-of-the-art approaches to related languag...
5753,W16-4804,novelty,This paper describes the GW/LT3 contribution t...
5754,W16-4804,performance,GW/LT3 ranked first in the out-of-domain evalu...


In [20]:
# Matches every run of non-word characters; compiled once and reused for each token.
_NON_WORD_RE = re.compile(r'\W+')

# English stop words, loaded lazily on the first call to `preprocess_text` and then
# reused. Lazy loading keeps any NLTK lookup error at call time rather than at
# definition time, exactly as before.
_ENGLISH_STOP_WORDS = None


def preprocess_text(text):
    """Turn a raw text string into a list of cleaned, lower-cased content tokens.

    Steps:
      1. lower-case `text` and split it with NLTK's `word_tokenize`;
      2. strip every non-word character (regex ``\\W+``) from each token and drop
         tokens that become empty (pure punctuation);
      3. drop tokens found in NLTK's English stop-word list.

    The stop-word list is read once and cached; previously it was re-read and
    rebuilt into a set for every document, which dominated the runtime when the
    function was applied to a whole column.

    Args:
        text: the document to tokenize; must support ``.lower()``.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Strip punctuation once per token (the original ran the same substitution twice).
    stripped = (_NON_WORD_RE.sub('', token) for token in word_tokenize(text.lower()))
    # Stop words are checked *after* stripping, so e.g. "the," is removed as "the".
    return [token for token in stripped if token and token not in _ENGLISH_STOP_WORDS]

In [21]:
# For every distinct `value` label: fit LDA models over a range of topic counts,
# keep the one with the highest u_mass coherence, assign each document its dominant
# topic, and write the result to topic_data/<value>.csv.

# Candidate topic counts to try; identical for every value, so defined once
# instead of being rebuilt on each iteration.
num_topics_list = [2, 3, 4, 5, 6, 8, 10, 15, 20, 25, 30]

# `DataFrame.to_csv` does not create missing directories, so make sure the output
# folder exists before the (slow) model search starts.
os.makedirs('topic_data', exist_ok=True)

for v in df.value.unique():
    print(v)
    # Filter once and reuse the slice for both the texts and the paper ids.
    subset = df[df.value == v]
    data = subset.response
    acl_id = subset.acl_id
    print('Number of documents:', len(data))

    # Bag-of-words matrix -> gensim corpus. The gensim dictionary reuses the
    # vectorizer's word ids so that corpus ids and dictionary ids agree.
    vectorizer = CountVectorizer(stop_words='english', min_df=2)
    data_vectorized = vectorizer.fit_transform(data)
    corpus = gensim.matutils.Sparse2Corpus(data_vectorized, documents_columns=False)
    dictionary = corpora.Dictionary.from_corpus(
        corpus,
        id2word={idx: word for word, idx in vectorizer.vocabulary_.items()},
    )

    # NOTE(review): the training vocabulary comes from CountVectorizer, while the
    # texts used for coherence and topic assignment are tokenized by
    # `preprocess_text`. Tokens outside the vectorizer vocabulary are presumably
    # just ignored by `doc2bow` -- confirm this mismatch is intended.
    data_processed = data.apply(preprocess_text)

    # Model selection: keep the topic count with the highest (least negative) u_mass coherence.
    best_model = None
    best_num_topics = 0
    best_coherence = -float('inf')
    for n in num_topics_list:
        lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=n, passes=20, random_state=0)
        coherence = CoherenceModel(model=lda_model, texts=data_processed, dictionary=dictionary, coherence='u_mass').get_coherence()
        if coherence > best_coherence:
            best_model = lda_model
            best_num_topics = n
            best_coherence = coherence
    print('Best number of topics:', best_num_topics)
    print('Best topic coherence:', best_coherence)

    # Run inference ONCE per document and take topic id and probability from the
    # same result. Each `best_model[bow]` call starts from a new random
    # initialisation, so two separate calls (as before) are not guaranteed to
    # agree and could pair a topic id with a probability from another run.
    dominant = [
        max(best_model[dictionary.doc2bow(doc)], key=lambda x: x[1])
        for doc in data_processed
    ]
    topics = pd.Series([topic_id for topic_id, _ in dominant], index=data_processed.index)
    probs = pd.Series([prob for _, prob in dominant], index=data_processed.index)

    # `show_topics` returns only 10 topics by default; num_topics=-1 requests all
    # of them, so every assigned topic id has an entry here (otherwise models with
    # more than 10 topics produced NaN in the `topic_words` column).
    topic_words_dict = {
        i: [w for w, p in t]
        for i, t in best_model.show_topics(num_topics=-1, num_words=10, formatted=False)
    }
    topic_words = topics.map(topic_words_dict)

    # All series share the index of the filtered rows, so the columns align row by row.
    labeled_topics = pd.DataFrame({'acl_id': acl_id, 'topic': topics, 'topic_words': topic_words, 'probability': probs})
    labeled_topics.to_csv(f'topic_data/{v}.csv')

performance
Number of documents: 766
Best number of topics: 2
Best topic coherence: -1.3238947170084712
building on past work
Number of documents: 576
Best number of topics: 3
Best topic coherence: -1.3323251786671475
novelty
Number of documents: 573
Best number of topics: 2
Best topic coherence: -1.4978714306686434
ease of implementation
Number of documents: 50
Best number of topics: 30
Best topic coherence: -6.843512833280342
reproducibility
Number of documents: 139
Best number of topics: 2
Best topic coherence: -1.6876909811274983
fairness
Number of documents: 4
Best number of topics: 4
Best topic coherence: -5.518855957761511
