In [1]:
%%capture
!pip install contextualized-topic-models

## Import General Utility Libraries

In [2]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm
import nltk

In [3]:
from nltk.corpus import stopwords as stop_words

nltk.download('stopwords')
stopwords = list(stop_words.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Where to store the data file. If you want, you can adjust the path.

In [4]:
path_before_1990 = '/content/drive/My Drive/titles_before_1990.txt'
path_from_1990_to_2009 = '/content/drive/My Drive/titles_from_1990_to_2009.txt'
path_from_2010 = '/content/drive/My Drive/titles_from_2010.txt'

Execute the following cell only once to download the data and write it as a file to your google drive. Afterwards, skip this cell or comment it out.

In [5]:
from google.colab import drive
drive.mount('/content/drive')

# to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# num_titles = 500000  # the (max)number of titles to load


def load_gzip_file(url):
    """Download Gzip-file."""
    response = urllib.request.urlopen(url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    return decompressed_file

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, given as input file.

    Divide the papers into 3 time periods.

    Collect max max_num papers per time period.
    """
    pairs_before_1990 = []
    count_before_1990 = 0
    pairs_from_1990_to_2009 = []
    count_from_1990_to_2009 = 0
    pairs_from_2010 = []
    count_from_2010 = 0
    got_title = False
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if got_title:
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                # we also have the year and thus save the title-year pair
                year = int(year_result.group(1))
                if year < 1990:
                    pairs_before_1990.append((title, year))
                    count_before_1990 += 1
                elif year < 2010:
                    pairs_from_1990_to_2009.append((title, year))
                    count_from_1990_to_2009 += 1
                else:
                    pairs_from_2010.append((title, year))
                    count_from_2010 += 1
                got_title = False
        else:
            # we have no title and search for title
            result = re.search(r'<title>(.*)</title>', line_str)
            if result:
                title = result.group(1)
                if len(title.split(' ')) < 3:
                    # only include titles with at least four words
                    continue
                got_title = True

        if count_before_1990 >= max_num and count_from_1990_to_2009 >= max_num and count_from_2010 >= max_num:
            return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    with open(file_path, 'w') as fout:
        writer = csv.writer(fout)
        for pair in pairs:
            writer.writerow(pair)

in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
save_data(pairs_before_1990, path_before_1990)
save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
save_data(pairs_from_2010, path_from_2010)

Mounted at /content/drive


16407618it [00:31, 520496.90it/s]


In [6]:
pairs_before_1990[:5]

[('Object Model Capabilities For Distributed Object Management.', 1989),
 ('Distributed Object Management Technology.', 1988),
 ('Muffin: A Distributed Database Machine', 1979),
 ('Algebraical Optimization of FTA-Expressions', 1988),
 ('Wissensrepr&auml;sentation und Maschinelles Lernen', 1987)]

Mount your google drive (in case it is not yet mounted) so that the newly created files are available.

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# LDA

In [8]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

num_lda_topics = 10

In [9]:
# get the titles of the papers
def get_titles(data_path):
  with open(data_path) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]
  return titles

Let's perform some simple preprocessing:

In [10]:
# Here we also considered using stemming, 
# but some of the results produced by stemming do not make sense :-(
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

Now we turn the documents (or titles in this case) into a matrix feature representation.

In [11]:
def vectorize_titles(titles, num_features, max_df=0.95, min_df=0.01):
  tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=num_features, stop_words='english')
  tf = tf_vectorizer.fit_transform(titles)
  tf_feature_names = tf_vectorizer.get_feature_names_out()
  return tf, tf_feature_names

In [12]:
# print the result of topic modeling, 12 words per topic
def print_topics(lda, tf_feature_names):
  for topic_idx, topic in enumerate(lda.components_):
      print(f'Topic {topic_idx+1}:', end=' ')
      print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

### Before the 1990s:

In [13]:
NUM_OF_FEATURES = 10000
titles_before_1990 = get_titles(path_before_1990)
titles_1990_preprocessed = [preprocess_text(title) for title in titles_before_1990]

In [14]:
titles_1990_preprocessed[:10]

['object model capabilities for distributed object management',
 'distributed object management technology',
 'muffin a distributed database machine',
 'algebraical optimization of ftaexpressions',
 'wissensrepraumlsentation und maschinelles lernen',
 'an algebraic characterization of stuf',
 'zur systemarchitektur von lilog',
 'mengenorientierte auswertung von anfragen in der logikprogrammiersprache prolog',
 'definite resolution over constraint languages',
 'dokumentation der syntax der liloggrammatik']

In [15]:
tf_before_1990, feature_names_before_1990 = vectorize_titles(titles_1990_preprocessed, NUM_OF_FEATURES)
lda_before_1990 = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5,
                                            learning_method='online', random_state=42).fit(tf_before_1990)
print_topics(lda_before_1990, feature_names_before_1990)

Topic 1: note problem functions optimal method technical linear decision solution problems algorithm using
Topic 2: control new implementation digital optimal linear approach design theory using systems problems
Topic 3: software processing applications finite research parallel digital computer data design theory information
Topic 4: analysis application languages performance algorithms networks theory computer data design digital linear
Topic 5: programming simulation linear digital computer problems language languages approach parallel using networks
Topic 6: design algorithm data networks information approach digital performance using parallel computer linear
Topic 7: computer using theory linear problems algorithms parallel models digital performance decision design
Topic 8: language recognition sets time pattern solution linear using problems parallel problem approach
Topic 9: logic distributed programs parallel networks using computer functions approach design algorithms theory
T

Topics:
0. Graph/networks algorithms (seems to be mostly about algorithms that (maybe) operate on graphs/networks)
1. pattern recognition (and maybe robotics)
2. ...

### From 1990 to 2009:

Add your code for topic modelling the period from 1990 to 2009 here...

In [16]:
titles_1990_to_2009 = get_titles(path_from_1990_to_2009)
titles_1990_to_2009_preprocessed = [preprocess_text(title) for title in titles_1990_to_2009]

In [17]:
tf_1990_to_2009, feature_names_1990_to_2009 = vectorize_titles(titles_1990_to_2009_preprocessed,
                                                               NUM_OF_FEATURES)
lda_1990_to_2009 = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5,
                                            learning_method='online', random_state=42).fit(tf_1990_to_2009)
print_topics(lda_1990_to_2009, feature_names_1990_to_2009)

Topic 1: algorithm linear new problem algorithms optimal robust equations efficient detection optimization multiple
Topic 2: networks approach nonlinear network models problems neural wireless mobile evaluation scheduling robust
Topic 3: systems based distributed nonlinear linear robust control approach optimal adaptive evaluation multiple
Topic 4: control analysis methods software development computing robust nonlinear optimal linear adaptive systems
Topic 5: applications scheme web power efficient wireless mobile new robust control adaptive networks
Topic 6: model performance time graphs image parallel digital evaluation algorithms robust optimal linear
Topic 7: using method dynamic simulation equations nonlinear detection models problems multiple efficient new
Topic 8: adaptive application estimation modeling learning fuzzy theory recognition robust nonlinear control approach
Topic 9: design information management evaluation approach robust systems development new optimal applicatio

### From 2010 onwards:

Add your code for topic modelling the period from 2010 onwards here...

In [18]:
titles_from_2010 = get_titles(path_from_2010)
titles_from_2010_preprocessed = [preprocess_text(title) for title in titles_from_2010]

In [19]:
tf_from_2010, feature_names_from_2010 = vectorize_titles(titles_from_2010_preprocessed,
                                                         NUM_OF_FEATURES)
lda_from_2010 = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5,
                                          learning_method='online', random_state=42).fit(tf_from_2010)
print_topics(lda_from_2010, feature_names_from_2010)

Topic 1: detection design linear mobile time stochastic recognition computing images segmentation optimal using
Topic 2: control learning efficient machine deep optimal nonlinear tracking adaptive distributed fuzzy stochastic
Topic 3: networks model algorithm wireless novel sensor energy based distributed improved optimal neural
Topic 4: analysis optimization study distributed power hybrid multiple case smart feature performance energy
Topic 5: information application framework applications methods prediction social management scheduling cloud computing energy
Topic 6: data approach nonlinear image new classification tracking online improved research review fuzzy
Topic 7: systems evaluation sensing nonlinear linear performance stochastic distributed fuzzy optimal control tracking
Topic 8: method estimation robust scheme problem problems communication nonlinear optimal based new improved
Topic 9: network adaptive neural dynamic modeling sensor selection deep nonlinear wireless tracking 

# Combined Topic Models

Method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/).

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs).

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [20]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

NUM_CTM_TOPICS = 10

In [21]:
# remove stop words
def ctm_get_text(titles):
  sp = WhiteSpacePreprocessingStopwords(titles, stopwords_list=stopwords)
  preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()
  return preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices

In [22]:
# get the topic feature names with desired length
def ctm_get_topics(ctm_model, num_topics = 10, num_words = 12):
  for idx in range(num_topics):
    print(f'Topic {idx+1}:', end=' ')
    print(' '.join(ctm_model.get_topic_lists(num_words)[idx]))

### Before the 1990s:

In [23]:
# Here we give different names to different time period, to avoid possible mistakes
tp_1990 = TopicModelDataPreparation("all-mpnet-base-v2")

In [24]:
prep_docs_1990, unprep_corpus_1990, vocab_1990, retained_indices_1990 = ctm_get_text(titles_1990_preprocessed)

In [25]:
prep_docs_1990[:2]

['object model capabilities distributed object management',
 'distributed object management technology']

In [26]:
train_set_1990 = tp_1990.fit(text_for_contextual=unprep_corpus_1990, text_for_bow=prep_docs_1990)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/197 [00:00<?, ?it/s]

In [27]:
tp_1990.vocab[:10]

array(['abelian', 'absolute', 'abstract', 'abstraction', 'abstracts',
       'academic', 'acceptance', 'access', 'accuracy', 'acm'],
      dtype=object)

In [28]:
ctm_before_1990 = CombinedTM(bow_size=len(tp_1990.vocab), contextual_size=768, n_components=10, num_epochs=10)
ctm_before_1990.fit(train_set_1990)

Epoch: [10/10]	 Seen Samples: [393600/393860]	Train Loss: 33.31867883263565	Time: 0:00:04.285560: : 10it [00:45,  4.55s/it]
100%|██████████| 616/616 [00:02<00:00, 214.45it/s]


In [29]:
ctm_get_topics(ctm_before_1990)

Topic 1: logic theory theorem symbolic set association logics modal meeting proof calculus propositional
Topic 2: information computer system data management retrieval systems review science database chemical decision
Topic 3: recognition using analysis digital image pattern method approach application processing detection images
Topic 4: networks network simulation von der performance de zur und communication des local
Topic 5: computers introduction research future operations chess editor letter report technology guest history
Topic 6: language design languages programming software implementation program hardware machine memory environment development
Topic 7: control systems optimal model time linear stochastic adaptive estimation analysis nonlinear distributed
Topic 8: magnetic surfaces degrees risk sets focus conversion forecasting enumerable generalization geometric compact
Topic 9: algorithm problem note problems technical linear solution programming equations scheduling algorit

### From 1990 to 2009

In [30]:
tp_1990_to_2009 = TopicModelDataPreparation("all-mpnet-base-v2")

In [31]:
prep_docs_1990_to_2009, unprep_corpus_1990_to_2009, vocab_1990_to_2009, retained_indices_1990_to_2009 = ctm_get_text(titles_1990_to_2009_preprocessed)

In [32]:
prep_docs_1990_to_2009[:2]

['evaluation objectoriented developments',
 'incremental migration information systems']

In [33]:
train_set_1990_to_2009 = tp_1990_to_2009.fit(text_for_contextual=unprep_corpus_1990_to_2009,
                                text_for_bow=prep_docs_1990_to_2009)

Batches:   0%|          | 0/1631 [00:00<?, ?it/s]

In [34]:
tp_1990_to_2009.vocab[:10]

array(['ab', 'absolute', 'abstract', 'abstraction', 'ac', 'academic',
       'acceptance', 'access', 'accuracy', 'accurate'], dtype=object)

In [35]:
ctm_1990_to_2009 = CombinedTM(bow_size=len(tp_1990_to_2009.vocab), contextual_size=768, n_components=10, num_epochs=10)
ctm_1990_to_2009.fit(train_set_1990_to_2009)

Epoch: [10/10]	 Seen Samples: [3260800/3260830]	Train Loss: 38.012924295037955	Time: 0:00:33.719083: : 10it [05:36, 33.68s/it]
100%|██████████| 5096/5096 [00:21<00:00, 232.75it/s]


In [36]:
ctm_get_topics(ctm_1990_to_2009)

Topic 1: systems control linear robust nonlinear stability feedback adaptive optimal uncertain discretetime output
Topic 2: number graphs complete trees classes automata groups note graph theorem sets complexity
Topic 3: system design distributed software decision process framework applications support development language simulation
Topic 4: networks wireless mobile network routing access performance sensor protocol dynamic efficient service
Topic 5: information web review research technology electronic paper book case online internet use
Topic 6: using image based recognition classification images detection neural segmentation feature face algorithm
Topic 7: problems problem method equations solutions methods solution solving numerical optimization boundary differential
Topic 8: analysis data models study model molecular functional dynamics structure comparison brain fmri
Topic 9: power frequency channels cmos channel estimation low signal circuit high noise modulation
Topic 10: unde

### From 2010 onwards

In [37]:
tp_2010 = TopicModelDataPreparation("all-mpnet-base-v2")

In [38]:
prep_docs_2010, unprep_corpus_2010, vocab_2010, retained_indices_2010 = ctm_get_text(titles_from_2010_preprocessed)

In [39]:
prep_docs_2010[:2]

['attacks exploiting execution', 'der']

In [40]:
train_set_2010 = tp_2010.fit(text_for_contextual=unprep_corpus_2010,
                                text_for_bow=prep_docs_2010)



Batches:   0%|          | 0/4626 [00:00<?, ?it/s]

In [41]:
tp_2010.vocab[:10]

  and should_run_async(code)


array(['abstract', 'ac', 'academic', 'accelerated', 'accelerating',
       'acceleration', 'acceptance', 'access', 'accessibility',
       'accuracy'], dtype=object)

In [42]:
ctm_from_2010 = CombinedTM(bow_size=len(tp_2010.vocab), contextual_size=768, n_components=10, num_epochs=10)
ctm_from_2010.fit(train_set_2010)

  and should_run_async(code)
Epoch: [10/10]	 Seen Samples: [9250560/9250700]	Train Loss: 45.619418312211046	Time: 0:01:35.642144: : 10it [15:50, 95.09s/it]
100%|██████████| 14455/14455 [01:03<00:00, 227.57it/s]


In [43]:
ctm_get_topics(ctm_from_2010)

Topic 1: optimization multiobjective algorithm system hybrid problem power swarm scheduling particle electric planning
Topic 2: image images sparse segmentation fusion reconstruction feature based sensing remote color detection
Topic 3: number selfadaptive complexity spaces minimum graphs degree note weight metric bound multi
Topic 4: molecular magnetic field connectivity thermal temperature resonance measurements functional changes radiation simulations
Topic 5: nonlinear systems linear class equations control differential equation boundary solutions stability fractional
Topic 6: computing cloud internet smart applications things special security iot secure issue edge
Topic 7: learning deep neural network machine classification recognition convolutional detection prediction using graph
Topic 8: analysis fuzzy decision data model series models time making regression using application
Topic 9: online social technology media knowledge review information case research digital use perspect

  and should_run_async(code)
