In [None]:
!pip install contextualized-topic-models==2.3.0

Collecting contextualized-topic-models==2.3.0
  Downloading contextualized_topic_models-2.3.0-py2.py3-none-any.whl (35 kB)
Collecting sentence-transformers>=1.1.1 (from contextualized-topic-models==2.3.0)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipywidgets==7.5.1 (from contextualized-topic-models==2.3.0)
  Downloading ipywidgets-7.5.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipython==7.16.3 (from contextualized-topic-models==2.3.0)
  Downloading ipython-7.16.3-py3-none-any.whl (783 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m783.1/783.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting jedi<=0.17.2,>=0.10 (from ipython==7.16.3-

## Import General Utility Libraries

In [None]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm

In [None]:
from torch import cuda

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Loading the dataset

Where to store the data file. If you want, you can adjust the path.

In [None]:
# One output file per time period, all stored in the same Google Drive folder.
_DRIVE_DIR = '/content/drive/My Drive'
path_before_1990 = f'{_DRIVE_DIR}/titles_before_1990.txt'
path_from_1990_to_2009 = f'{_DRIVE_DIR}/titles_from_1990_to_2009.txt'
path_from_2010 = f'{_DRIVE_DIR}/titles_from_2010.txt'

Execute the following cell only once to download the data and write it as a file to your google drive. Afterwards, skip this cell or comment it out.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the output paths configured earlier live under it.
drive.mount('/content/drive')

# Location of the gzip-compressed DBLP XML dump.
# To download the data manually or get more information, go to:
# https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# Unused leftover: the per-period cap is the `max_num` argument of extract_titles().
# num_titles = 500000  # the (max)number of titles to load


def load_gzip_file(url):
    """Download a gzip-compressed resource and return a decompressing stream.

    The compressed payload is read fully into memory; decompression happens
    lazily as the returned object is read or iterated.

    Args:
        url: Location of the ``.gz`` resource (any URL ``urllib`` can open).

    Returns:
        gzip.GzipFile: Binary file object yielding the decompressed bytes.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    # Release the connection as soon as the payload has been buffered.
    with urllib.request.urlopen(url) as response:
        compressed_file = io.BytesIO(response.read())
    return gzip.GzipFile(fileobj=compressed_file)

def extract_titles(input_file, max_num=50000):
    """Extract (title, year) pairs of dblp papers and split them into 3 periods.

    The input is scanned line by line. After a ``<title>`` line has been seen,
    the next ``<year>`` line completes the pair. The year alone decides the
    period; a pair whose period is already full is dropped instead of spilling
    into a later period.

    Args:
        input_file: Iterable of UTF-8 encoded ``bytes`` lines of the dblp XML.
        max_num: Maximum number of pairs collected per time period.

    Returns:
        tuple: Three lists of ``(title, year)`` tuples, in the order
        (before 1990, 1990 to 2009, 2010 onwards), each holding at most
        ``max_num`` pairs.

    Raises:
        ValueError: If a ``<year>`` element does not contain an integer.
    """
    pairs_before_1990 = []
    pairs_from_1990_to_2009 = []
    pairs_from_2010 = []
    title = None  # title still waiting for its <year> line, if any
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if title is not None:
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                year = int(year_result.group(1))
                # Pick the bucket from the year first, then apply the cap, so
                # that a full bucket never redirects a pair to another period.
                if year < 1990:
                    bucket = pairs_before_1990
                elif year < 2010:
                    bucket = pairs_from_1990_to_2009
                else:
                    bucket = pairs_from_2010
                if len(bucket) < max_num:
                    bucket.append((title, year))
                title = None
        else:
            # we have no title and search for one
            result = re.search(r'<title>(.*)</title>', line_str)
            # only include titles with at least three space-separated words
            # NOTE(review): an earlier comment said "four"; the threshold
            # enforced by the code has always been three - confirm intent.
            if result and len(result.group(1).split(' ')) >= 3:
                title = result.group(1)

        # Stop scanning as soon as every period has reached its cap.
        if (len(pairs_before_1990) >= max_num
                and len(pairs_from_1990_to_2009) >= max_num
                and len(pairs_from_2010) >= max_num):
            break

    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    """Write the given rows to ``file_path`` in CSV format, one pair per row.

    Args:
        pairs: Iterable of row sequences, e.g. ``(title, year)`` tuples.
        file_path: Destination file; an existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself (as its
    # documentation requires); the explicit encoding keeps the file
    # independent of the platform default.
    with open(file_path, 'w', newline='', encoding='utf-8') as fout:
        csv.writer(fout).writerows(pairs)

# Stream the dump, bucket the titles by period and persist each bucket to its own file.
in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
for period_pairs, period_path in (
    (pairs_before_1990, path_before_1990),
    (pairs_from_1990_to_2009, path_from_1990_to_2009),
    (pairs_from_2010, path_from_2010),
):
    save_data(period_pairs, period_path)

Mounted at /content/drive


17871458it [00:46, 385083.31it/s]


Mount your google drive (in case it is not yet mounted) so that the newly created files are available.

In [None]:
from google.colab import drive
# When Drive is already mounted at this path, Colab only prints a notice.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Peek at the first ten (title, year) pairs.
# `pairs_before_1990` only exists if the download cell above ran in this session.
pairs_before_1990[:10]

[('Object Model Capabilities For Distributed Object Management.', 1989),
 ('Distributed Object Management Technology.', 1988),
 ('Muffin: A Distributed Database Machine', 1979),
 ('Algebraical Optimization of FTA-Expressions', 1988),
 ('Wissensrepr&auml;sentation und Maschinelles Lernen', 1987),
 ('An Algebraic Characterization of STUF', 1988),
 ('Zur Systemarchitektur von LILOG', 1987),
 ('Mengenorientierte Auswertung von Anfragen in der Logikprogrammiersprache PROLOG',
  1988),
 ('Definite Resolution over Constraint Languages', 1988),
 ('Dokumentation der Syntax der LILOG-Grammatik', 1988)]

# Part1: LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Number of LDA topics (n_components) for the fits below; later cells rebind this name.
num_lda_topics = 5

## Before the 1990s:

In [None]:
# Load only the title column of the stored (title, year) CSV rows.
# newline='' is what the csv module expects; the explicit encoding matches
# the other readers in this notebook. Blank rows are skipped.
with open(path_before_1990, newline='', encoding='utf-8') as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader if row]

### preprocessing

Let's perform some simple preprocessing:

In [None]:
def preprocess_text(text):
    """Reduce a title to lowercase ASCII letters and spaces.

    Titles in this dataset contain HTML entities (e.g. ``&auml;``). They are
    decoded first and accented letters are folded to their base letter, so an
    entity no longer leaves a junk fragment such as ``auml`` inside a word.

    Args:
        text: Raw title string.

    Returns:
        str: Lowercased text with every character other than a-z and space
        removed (characters are deleted, not replaced by a space).
    """
    import html
    import unicodedata

    text = html.unescape(text)
    # NFKD splits an accented letter into base letter + combining mark;
    # the mark is then removed by the character filter below.
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    return text.lower()

# Apply the normalization to every pre-1990 title, keeping the original order.
prepro_titles_1 = list(map(preprocess_text, titles))

In [None]:
# Inspect the first ten normalized titles.
prepro_titles_1[:10]

['object model capabilities for distributed object management',
 'distributed object management technology',
 'muffin a distributed database machine',
 'algebraical optimization of ftaexpressions',
 'wissensrepraumlsentation und maschinelles lernen',
 'an algebraic characterization of stuf',
 'zur systemarchitektur von lilog',
 'mengenorientierte auswertung von anfragen in der logikprogrammiersprache prolog',
 'definite resolution over constraint languages',
 'dokumentation der syntax der liloggrammatik']

### Vectorization

Now we turn the documents (or titles in this case) into a matrix feature representation.

In [None]:
# Bag-of-words counts over the normalized pre-1990 titles.
num_features = 10000
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=num_features,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(prepro_titles_1)
tf_feature_names = tf_vectorizer.get_feature_names_out()

### Fit the LDA model

In [None]:
# Fit LDA on the count matrix; the fixed random_state keeps reruns repeatable.
lda = LatentDirichletAllocation(
    n_components=num_lda_topics,
    learning_method='online',
    max_iter=5,
    random_state=42,
).fit(tf)

### Generate topics

In [None]:
# For every topic, list its 12 highest-weighted vocabulary terms.
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[::-1][:12]]
    print(f'Topic {topic_idx}:', ' '.join(top_terms))

Topic 0: design algorithm theory networks algorithms application simulation computing fast testing der trees
Topic 1: systems logic model linear functions models control distributed complexity time programs comments
Topic 2: information note optimal problem network development memory circuits retrieval graphs using automatic
Topic 3: computer systems analysis software using problems programming language parallel method approach digital
Topic 4: data review languages applications pp machines pattern evaluation science new finite recognition


Topics:
0. Systems Design and Control
1. Analysis of Parallel Algorithms and Models
2. Computer Logic and Programming
3. Algorithms and Methods in Machine Learning
4. Information Processing and Structures

## From 1990 to 2009:

### Preprocessing

**1. Language detection:** detect the language used in the text dataset

In [None]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=7a2d3e19a4843f674921fba0bbd44894c868e84c7f96eaf59f0f8b1ed5f15013
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import os
from langdetect import detect

def detect_language_text(file_path):
    """Collect the languages that langdetect reports for the lines of a file.

    Each non-blank line is classified on its own. Lines for which langdetect
    cannot produce a guess are skipped instead of aborting the whole scan.

    Args:
        file_path: Path of a UTF-8 text file with one document per line.

    Returns:
        list: Sorted list of the distinct language codes that were detected.
    """
    from langdetect.lang_detect_exception import LangDetectException

    detected_languages = set()

    # Iterate lazily instead of loading every line into memory first.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            candidate = line.strip()
            if not candidate:
                continue
            try:
                detected_languages.add(detect(candidate))
            except LangDetectException:
                # langdetect could not classify this line; skip it.
                continue

    # Sorted so that the result does not depend on set iteration order.
    return sorted(detected_languages)

In [None]:
# Reuse the configured path so that adjusting it in the configuration cell also applies here.
langs_set_2 = detect_language_text(path_from_1990_to_2009)

In [None]:
# Show the language codes detected in the 1990-2009 titles.
langs_set_2

['cy',
 'af',
 'tl',
 'it',
 'cs',
 'de',
 'pl',
 'vi',
 'da',
 'sl',
 'so',
 'hu',
 'ro',
 'id',
 'no',
 'ca',
 'es',
 'et',
 'en',
 'sk',
 'hr',
 'fi',
 'fr',
 'pt',
 'sv',
 'tr',
 'lt',
 'nl']

**2. Translation:** translate the dataset into English

In [None]:
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90

In [None]:
from googletrans import Translator
import os

def translate_non_english_to_english(file_path, output_file):
    """Copy a text file line by line, translating non-ASCII lines to English.

    The file is handled one line (one record) at a time, so the output has
    exactly the same number of lines as the input. A line is sent to the
    translator only if it contains at least one non-ASCII character; all
    other lines, including blank ones, are copied unchanged.

    Prints "File not found." and returns without writing anything when
    ``file_path`` does not exist.

    Args:
        file_path: Path of the UTF-8 input file.
        output_file: Path of the UTF-8 output file (overwritten).
    """
    if not os.path.exists(file_path):
        print("File not found.")
        return

    # Read everything first so the input is closed before the output is opened.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().splitlines()

    translator = Translator()
    translated_lines = []
    for line in lines:
        # NOTE(review): "contains non-ASCII" is only a rough proxy for
        # "is not English"; ASCII-only foreign titles are not translated.
        if line.strip() and not all(ord(char) < 128 for char in line):
            line = translator.translate(line, dest='en').text
        translated_lines.append(line)

    with open(output_file, 'w', encoding='utf-8') as output:
        output.writelines(f'{line}\n' for line in translated_lines)

In [None]:
# Read from the configured path so that adjusting it in the configuration cell also applies here.
file_path = path_from_1990_to_2009
output_file = '/content/drive/MyDrive/translated_titles_from_1990_to_2009.txt'

translate_non_english_to_english(file_path, output_file)

**3. Standard preprocessing:** tokenization, stop word removal, lemmatization, removal of numerical characters

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by the imports above: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def preprocess_titles(text):
    """Tokenize a title, drop English stop words, lemmatize and strip digits.

    Args:
        text: Raw title string.

    Returns:
        str: The remaining lowercase tokens joined by single spaces. Tokens
        that become empty after digit removal are dropped.
    """
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    processed_tokens = []
    for word in tokens:
        # Lowercase first: the stop-word list is lowercase, and the WordNet
        # lemmatizer leaves capitalized words (common in titles) unchanged.
        word = word.lower()
        if word in stop_words:
            continue
        # Strip digits from the processed token. The previous version applied
        # this step to the raw input and thereby discarded all steps above.
        word = re.sub(r'\d+', '', lemmatizer.lemmatize(word))
        if word:
            processed_tokens.append(word)

    return ' '.join(processed_tokens)

In [None]:
# Preprocess the translated 1990-2009 file, one title per line.
file_path = "/content/drive/MyDrive/translated_titles_from_1990_to_2009.txt"

with open(file_path, 'r', encoding='utf-8') as file:
    # Drop the line terminator so it does not end up inside the title text.
    prepro_titles_2 = [preprocess_titles(title.rstrip('\n')) for title in file]

In [None]:
# Inspect the first ten preprocessed 1990-2009 titles.
prepro_titles_2[:10]

['An Evaluation of Object-Oriented DBMS Developments:  Edition. ,\n',
 'DARWIN: On the Incremental Migration of Legacy Information Systems,\n',
 '"Integrating Heterogeneous, Autonomous, Distributed Applications Using the DOM Prototype. ",\n',
 'Integrating Object-Oriented Applications and Middleware with Relational Databases. ,\n',
 'Towards a Transaction Management System for DOM. ,\n',
 "A 'RISC' Object Model for Object System Interoperation: Concepts and Applications. ,\n",
 'MetaObject Protocol Concepts for a RISC Object Model. ,\n',
 'Object Data Language Facilities for Multimedia Data Types. ,\n',
 'Object Data Model Facilities for Multimedia Data Types. ,\n',
 'Experiments with Dispatching in a Distributed Object System. ,\n']

### Vectorization

Now we turn the documents (or titles in this case) into a matrix feature representation.

In [None]:
# Bag-of-words counts over the preprocessed 1990-2009 titles.
num_features = 10000
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=num_features,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(prepro_titles_2)
tf_feature_names = tf_vectorizer.get_feature_names_out()

### Fit the LDA model

In [None]:
# Fit LDA with the configured number of topics on the 1990-2009 count matrix.
lda = LatentDirichletAllocation(
    n_components=num_lda_topics,
    learning_method='online',
    max_iter=5,
    random_state=42,
).fit(tf)

### Generate topics

**5 topics:**

In [None]:
# For every topic, list its 12 highest-weighted vocabulary terms.
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[::-1][:12]]
    print(f'Topic {topic_idx}:', ' '.join(top_terms))

Topic 0: based information approach network systems model web current modeling frequency method neural
Topic 1: analysis time networks performance digital systems ghz mobile multi wireless single real
Topic 2: control cmos design fuzzy using dynamic adaptive phase sub eacute converter oacute
Topic 3: using high low voltage study mode speed applications case logic level mw
Topic 4: power data based graphs management decision process support knowledge circuit bit systems


Topics:
0. Information Modeling and Network Systems
1. Performance Analysis of Digital Systems
2. Control and Design using CMOS and Fuzzy Logic
3. Study of Voltage and Logic Speed
4. Power Management and Decision Support Systems

**10 topics:**

In [None]:
# Refit on the same count matrix with 10 topics.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    max_iter=5,
    random_state=42,
).fit(tf)

In [None]:
# For every topic, list its 12 highest-weighted vocabulary terms.
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[::-1][:12]]
    print(f'Topic {topic_idx}:', ' '.join(top_terms))

Topic 0: neural voltage networks adaptive frequency noise self evaluation sensor retrieval intelligent online
Topic 1: control analysis based network performance ghz multi wireless mhz mobile networks detection
Topic 2: method architecture problem induction bit filter loop set fully sets controlled large
Topic 3: high graphs speed case level memory video scheduling non quality direct band
Topic 4: data applications management technology mode image oacute new techniques computer la en
Topic 5: information time low systems current sub single support real decision multiple parallel
Topic 6: using fuzzy approach dynamic modeling eacute converter study process logic linear scheme
Topic 7: systems based phase knowledge learning novel signal active machine research fault cellular
Topic 8: cmos design based web model development amplifier implementation software chip dual electronic
Topic 9: power digital algorithm simulation integrated models dc circuit service vector analog services


**15 topics:**

In [None]:
# Refit on the same count matrix with 15 topics.
lda = LatentDirichletAllocation(
    n_components=15,
    learning_method='online',
    max_iter=5,
    random_state=42,
).fit(tf)

In [None]:
# For every topic, list its 12 highest-weighted vocabulary terms.
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[::-1][:12]]
    print(f'Topic {topic_idx}:', ' '.join(top_terms))

Topic 0: information networks process mobile models techniques sensor level vector memory supply use
Topic 1: cmos analysis design performance digital multi wireless support problem decision amplifier optimal
Topic 2: phase detection estimation filter la machine en loop automatic ac clustering para
Topic 3: low sub motor multiple search order service sup receiver services quality feedback
Topic 4: based data web management knowledge image channel gb circuits communication multimedia output
Topic 5: using time dynamic voltage ghz architecture real mhz retrieval space line language
Topic 6: fuzzy application learning study theory nonlinear artificial structure db set transceiver neural
Topic 7: control based network neural technology chip speed oacute self parallel technique methods
Topic 8: high integrated eacute converter new scheme development classification software intelligent prediction user
Topic 9: systems adaptive frequency single hybrid agent environment algorithms modulation p

## From 2010 onwards:

In [None]:
# Peek at the first ten (title, year) pairs from 2010 onwards.
# `pairs_from_2010` only exists if the download cell ran in this session.
pairs_from_2010[:10]

[('Spectre Attacks: Exploiting Speculative Execution.', 2018),
 ('50 Jahre Studiengang Informatik an der RWTH', 2022),
 ('Computer Science Curricula 2013', 2013),
 ('Differences in productivity and impact across the different computer science subareas.',
  2012),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2013', 2014),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2017', 2018),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2012', 2013),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2020', 2021),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2019', 2020),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2014', 2015),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2018', 2019),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2016', 2017),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2022', 2023),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2015', 2016),
 ('Schloss Dagstuhl - Jahresbericht / Annual Report 2021', 2022),
 ('Klaus T

In [None]:
from langdetect import detect
import os
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Download the NLTK resources used below: the 'punkt' tokenizer models,
# the 'stopwords' corpus and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Preprocessing

In [None]:
def preprocess_title_for_lda(title, language):
    """Normalize a title for LDA: lowercase, fold accents, filter, lemmatize.

    Args:
        title: Raw title string.
        language: Language of the title, either a two-letter code as reported
            by langdetect (e.g. ``'en'``, ``'de'``) or the name of an NLTK
            stop-word list (e.g. ``'english'``). Languages without a
            stop-word list get no stop-word filtering.

    Returns:
        str: The remaining purely alphabetic tokens, lemmatized and joined by
        single spaces.
    """
    # langdetect reports two-letter codes, whereas NLTK names its stop-word
    # lists by language name; without this mapping the lookup never matches.
    iso_to_nltk = {
        'ar': 'arabic', 'bn': 'bengali', 'ca': 'catalan', 'da': 'danish',
        'de': 'german', 'el': 'greek', 'en': 'english', 'es': 'spanish',
        'fi': 'finnish', 'fr': 'french', 'he': 'hebrew', 'hu': 'hungarian',
        'id': 'indonesian', 'it': 'italian', 'ne': 'nepali', 'nl': 'dutch',
        'no': 'norwegian', 'pt': 'portuguese', 'ro': 'romanian',
        'ru': 'russian', 'sl': 'slovene', 'sv': 'swedish', 'tr': 'turkish',
    }

    def _fold(text):
        # NFKD splits an accented letter into base letter + combining mark.
        # The marks are dropped because a token containing one fails
        # str.isalpha() and would otherwise be discarded entirely.
        decomposed = unicodedata.normalize('NFKD', text)
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Convert to lowercase and fold accents
    title = _fold(title.lower())

    # Tokenize
    tokens = word_tokenize(title)

    # Stop words are folded the same way so they still match the folded tokens.
    corpus_name = iso_to_nltk.get(language, language)
    if corpus_name in stopwords.fileids():
        lang_stopwords = {_fold(word) for word in stopwords.words(corpus_name)}
    else:
        lang_stopwords = set()

    # NOTE(review): WordNetLemmatizer is applied to every language here;
    # presumably it only has an effect on English words - confirm.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in lang_stopwords]

    return ' '.join(words)

In [None]:
#preprocessing
def preprocess_multilingual_file_for_lda(file_path):
    """Detect the language of every line of a file and preprocess it for LDA.

    A line whose detection or preprocessing raises is reported with a printed
    message and left out of the result.

    Args:
        file_path: Path of a UTF-8 text file with one title per line.

    Returns:
        list | None: The preprocessed lines in file order, or ``None`` (after
        printing "File not found.") when ``file_path`` does not exist.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_lines = file.readlines()

        preprocessed_titles = []
        for raw_line in raw_lines:
            try:
                detected = detect(raw_line)  # detect language
                preprocessed_titles.append(preprocess_title_for_lda(raw_line, detected))
            except Exception as e:
                print(f"Error processing line: {e}")
        return preprocessed_titles

    print("File not found.")
    return None

In [None]:
# Read from the configured path so that adjusting it in the configuration cell also applies here.
file_path = path_from_2010
prepro_titles_3 = preprocess_multilingual_file_for_lda(file_path)

In [None]:
# Inspect the first five preprocessed titles from 2010 onwards.
prepro_titles_3[:5]

### Vectorization

In [None]:
# The titles are already tokenized and space-joined, so a plain whitespace
# split is used as the tokenizer.
num_features = 10000
tf_vectorizer = CountVectorizer(
    tokenizer=lambda x: x.split(),
    stop_words='english',
    max_features=num_features,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(prepro_titles_3)
tf_feature_names = tf_vectorizer.get_feature_names_out()



### fit the LDA model and generate topics

**5 topics**

In [None]:
num_lda_topics = 5  # number of topics for this fit
lda = LatentDirichletAllocation(
    n_components=num_lda_topics,
    learning_method='online',
    max_iter=10,
    random_state=42,
)
lda.fit(tf)

# Show the ten highest-weighted terms of each topic.
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[::-1][:10]]
    print(f"Topic {topic_idx}: " + " ".join(top_terms))

Topic 0: optimization learning hybrid efficient cognitive radio selection user transmission network
Topic 1: using analysis network design dynamic method wireless energy algorithm simulation
Topic 2: control power optimal mimo resource cooperative mobile detection adaptive strategy
Topic 3: model performance vehicle process approach online joint effect new decision
Topic 4: based information data channel social estimation scheme management framework robust


Topics:
0. Systems engineering and control
1. Computational analysis and algorithms
2. Computer science and theoretical computing
3. Algorithmic methods and applications
4. Information systems and graph theory

**10 topics**

In [None]:
num_lda_topics = 10  # use a larger number of topics
lda = LatentDirichletAllocation(
    n_components=num_lda_topics,
    learning_method='online',
    max_iter=10,
    random_state=42,
)
lda.fit(tf)

# Show the ten highest-weighted terms of each topic.
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[::-1][:10]]
    print(f"Topic {topic_idx}: " + " ".join(top_terms))

Topic 0: network design mobile detection adaptive cognitive linear evaluation novel fading
Topic 1: control wireless algorithm resource cooperative effect scheduling strategy access code
Topic 2: optimal knowledge field implementation solution computation distribution use synthesis comparison
Topic 3: information optimization approach vehicle hybrid management joint framework electric decision
Topic 4: based communication mimo power channel social scheme vehicular robust nonlinear
Topic 5: learning online massive game transmission research reinforcement machine behavior citation
Topic 6: analysis data dynamic selection heterogeneous interference computing cellular parallel support
Topic 7: method energy modeling application relay multiple user service web efficiency
Topic 8: using model performance simulation process estimation efficient distributed radio study
Topic 9: allocation state case numerical prediction modulation interaction localization graph uncertainty


**15 topics**

In [None]:
num_lda_topics = 15  # use an even larger number of topics
lda = LatentDirichletAllocation(
    n_components=num_lda_topics,
    learning_method='online',
    max_iter=10,
    random_state=42,
)
lda.fit(tf)

# Show the ten highest-weighted terms of each topic.
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [tf_feature_names[i] for i in topic.argsort()[::-1][:10]]
    print(f"Topic {topic_idx}: " + " ".join(top_terms))

Topic 0: mobile cognitive evaluation web security tracking computation protocol human internet
Topic 1: network design wireless communication channel cooperative relay vehicular time secure
Topic 2: mimo optimal adaptive beamforming distribution modulation mechanism offloading complex policy
Topic 3: performance algorithm study research deep integrated case development efficiency problem
Topic 4: information social management joint framework strategy novel knowledge numerical planning
Topic 5: optimization approach power hybrid electric decision service code sensor intelligent
Topic 6: resource linear heterogeneous nonlinear computing data implementation solution tool feedback
Topic 7: analysis efficient scheme new multiple user spectrum parallel random sensing
Topic 8: using control simulation distributed radio access impact science traffic trajectory
Topic 9: estimation state field citation capacity localization cell finite parameter uncertainty
Topic 10: model dynamic method energy 

# Part2: Combined Topic Models

Method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/).

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs).

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [None]:
!pip install --upgrade contextualized_topic_models #prevent attribute error

Collecting contextualized_topic_models
  Downloading contextualized_topic_models-2.5.0-py2.py3-none-any.whl (36 kB)
Collecting gensim==4.2.0 (from contextualized_topic_models)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers>=2.1.1 (from contextualized_topic_models)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipywidgets==7.5.1 (from contextualized_topic_models)
  Downloading ipywidgets-7.5.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipython==8.10.0 (from cont

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

# Number of topics (n_components) for every CombinedTM fit below.
num_ctm_topics = 5  # you can also choose a higher number of topics

### Before the 1990s:

In [None]:
# Open the original dataset and keep only the title column.
# newline='' is what the csv module expects; the explicit encoding matches
# the other readers in this notebook. Blank rows are skipped.
with open(path_before_1990, newline='', encoding='utf-8') as fin:
    reader = csv.reader(fin)
    titles_1 = [row[0] for row in reader if row]

In [None]:
# Preprocessing: build the bag-of-words input for the CTM.
import nltk
from nltk.corpus import stopwords as stop_words

nltk.download('stopwords')

# NOTE(review): this rebinds the global name `stopwords` from the NLTK corpus
# reader (imported earlier via `from nltk.corpus import stopwords`) to a plain
# list. preprocess_titles() and preprocess_title_for_lda() call
# `stopwords.words(...)` / `stopwords.fileids()`, so they fail if they are run
# again after this cell. The later CTM cells read this list under the same
# name, so renaming it means updating them as well.
stopwords = list(stop_words.words("english"))

# Returns the preprocessed documents, the matching unprocessed texts, the
# vocabulary and the indices of the retained documents.
sp = WhiteSpacePreprocessingStopwords(titles_1, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Pair sentence embeddings of the raw titles with the bag-of-words of the cleaned ones.
tp = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset_1 = tp.fit(
    text_for_bow=preprocessed_documents,
    text_for_contextual=unpreprocessed_corpus,
)

Batches:   0%|          | 0/248 [00:00<?, ?it/s]

In [None]:
# contextual_size presumably has to equal the embedding width of the model
# passed to TopicModelDataPreparation - TODO confirm when swapping models.
ctm = CombinedTM(
    n_components=num_ctm_topics,
    bow_size=len(tp.vocab),
    contextual_size=768,
    num_epochs=10,
)
ctm.fit(training_dataset_1)  # run the model

Epoch: [10/10]	 Seen Samples: [494080/494430]	Train Loss: 35.40875399545067	Time: 0:00:14.506315: : 10it [02:25, 14.58s/it]
100%|██████████| 773/773 [00:13<00:00, 58.20it/s]


In [None]:
# Show ten words for each topic of the pre-1990 model.
ctm.get_topic_lists(10)

[['logic',
  'theorem',
  'automata',
  'sets',
  'sub',
  'theories',
  'languages',
  'properties',
  'free',
  'sup'],
 ['environmental',
  'transputer',
  'subject',
  'event',
  'multiobjective',
  'empirical',
  'congestion',
  'acoustic',
  'heterogeneous',
  'nuclear'],
 ['algorithm',
  'problem',
  'algorithms',
  'problems',
  'parallel',
  'recognition',
  'pattern',
  'method',
  'note',
  'using'],
 ['computer',
  'information',
  'data',
  'language',
  'system',
  'design',
  'software',
  'science',
  'processing',
  'network'],
 ['systems',
  'control',
  'analysis',
  'model',
  'time',
  'uuml',
  'optimal',
  'auml',
  'decision',
  'der']]

### From 1990 to 2009

In [None]:
# Load only the title column of the stored (title, year) CSV rows.
# newline='' is what the csv module expects; the explicit encoding matches
# the other readers in this notebook. Blank rows are skipped.
with open(path_from_1990_to_2009, newline='', encoding='utf-8') as fin:
    reader = csv.reader(fin)
    titles_2 = [row[0] for row in reader if row]

In [None]:
# `stopwords` is the English stop-word list built in the pre-1990 CTM section.
sp = WhiteSpacePreprocessingStopwords(titles_2, stopwords_list=stopwords)
(
    preprocessed_documents,
    unpreprocessed_corpus,
    vocab,
    retained_indices,
) = sp.preprocess()

In [None]:
# Pair sentence embeddings of the raw titles with the bag-of-words of the cleaned ones.
tp = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset_2 = tp.fit(
    text_for_bow=preprocessed_documents,
    text_for_contextual=unpreprocessed_corpus,
)

Batches:   0%|          | 0/249 [00:00<?, ?it/s]

In [None]:
# contextual_size presumably has to equal the embedding width of the model
# passed to TopicModelDataPreparation - TODO confirm when swapping models.
ctm = CombinedTM(
    n_components=num_ctm_topics,
    bow_size=len(tp.vocab),
    contextual_size=768,
    num_epochs=10,
)
ctm.fit(training_dataset_2)  # run the model

Epoch: [10/10]	 Seen Samples: [496000/496050]	Train Loss: 41.79111079062185	Time: 0:00:17.012028: : 10it [02:57, 17.70s/it]
100%|██████████| 776/776 [00:15<00:00, 50.01it/s]


In [None]:
# Show ten words for each topic of the 1990-2009 model.
ctm.get_topic_lists(10)

[['amp',
  'science',
  'information',
  'behavior',
  'review',
  'user',
  'book',
  'retrieval',
  'ai',
  'search'],
 ['nets',
  'solutions',
  'programs',
  'equations',
  'number',
  'integer',
  'bounds',
  'extended',
  'weighted',
  'numerical'],
 ['power',
  'cmos',
  'high',
  'low',
  'converter',
  'frequency',
  'dc',
  'current',
  'control',
  'dual'],
 ['de',
  'system',
  'based',
  'fuzzy',
  'using',
  'neural',
  'oacute',
  'model',
  'network',
  'approach'],
 ['time',
  'networks',
  'wireless',
  'mobile',
  'performance',
  'systems',
  'analysis',
  'real',
  'data',
  'distributed']]

### From 2010 onwards

In [None]:
# Open the original dataset and keep only the title column.
# newline='' is what the csv module expects; the explicit encoding matches
# the other readers in this notebook. Blank rows are skipped.
with open(path_from_2010, newline='', encoding='utf-8') as fin:
    reader = csv.reader(fin)
    titles_3 = [row[0] for row in reader if row]

In [None]:
# `stopwords` is the English stop-word list built in the pre-1990 CTM section.
sp = WhiteSpacePreprocessingStopwords(titles_3, stopwords_list=stopwords)
(
    preprocessed_documents,
    unpreprocessed_corpus,
    vocab,
    retained_indices,
) = sp.preprocess()

In [None]:
# Pair sentence embeddings of the raw titles with the bag-of-words of the cleaned ones.
tp = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset_3 = tp.fit(
    text_for_bow=preprocessed_documents,
    text_for_contextual=unpreprocessed_corpus,
)

Batches:   0%|          | 0/249 [00:00<?, ?it/s]

In [None]:
# contextual_size presumably has to equal the embedding width of the model
# passed to TopicModelDataPreparation - TODO confirm when swapping models.
ctm = CombinedTM(
    n_components=num_ctm_topics,
    bow_size=len(tp.vocab),
    contextual_size=768,
    num_epochs=10,
)
ctm.fit(training_dataset_3)  # run the model

Epoch: [10/10]	 Seen Samples: [496000/496140]	Train Loss: 46.52148956791047	Time: 0:00:14.662077: : 10it [02:24, 14.42s/it]
100%|██████████| 776/776 [00:13<00:00, 58.18it/s]


In [None]:
# Show ten words for each topic of the 2010-onwards model.
ctm.get_topic_lists(10)

[['social',
  'research',
  'behavior',
  'evidence',
  'knowledge',
  'information',
  'engineering',
  'technology',
  'science',
  'risk'],
 ['networks',
  'mimo',
  'wireless',
  'performance',
  'cooperative',
  'systems',
  'allocation',
  'channel',
  'resource',
  'relay'],
 ['simulation',
  'modeling',
  'method',
  'process',
  'particle',
  'optimization',
  'dynamics',
  'simulations',
  'sub',
  'algorithm'],
 ['angle',
  'delays',
  'range',
  'platoon',
  'sampled',
  'independent',
  'incremental',
  'saving',
  'arrival',
  'sequence'],
 ['based',
  'control',
  'learning',
  'vehicle',
  'data',
  'system',
  'vehicles',
  'using',
  'model',
  'approach']]