In [49]:
import os
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [50]:
# --- Pipeline configuration -------------------------------------------------
# Folder that holds the text files to be topic-modelled.
directory = "data_url"

# Plain-text file with the custom stopwords (read one entry per line).
stopwords_file = "stopWords2.txt"

# Destination file for the extracted topics.
output_file = "topics.txt"

In [51]:
# Load the custom stopwords, one entry per line.
# The encoding is pinned so the result does not depend on the platform default.
# NOTE(review): UTF-8 is assumed for the stopwords file -- confirm.
# Blank lines and surrounding whitespace are dropped: such entries could never
# equal a whitespace-free token, so they would only be dead weight in the list.
with open(stopwords_file, 'r', encoding='utf-8') as file:
    stopwords_list = [
        entry.strip()
        for entry in file.read().splitlines()
        if entry.strip()
    ]

# Grow the custom stopword list in place: first the NLTK English stopwords ...
stopwords_list.extend(stopwords.words('english'))

# ... then the hand-picked noise tokens (contraction fragments, file paths,
# linker flags, symbol names and generic words). Order and duplicates are kept
# exactly as originally listed.
stopwords_list.extend([
    "n\'t", "\'ve", "\'ll", "\\users\\abc_user\\appdata\\roaming\\python\\python36\\site-packages\\projectq\\cengines\\_optimize.py", "\\users\\abc_user\\appdata\\roaming\\python\\python36\\site-packages\\projectq\\cengines\\_basics.py",
    "/usr/bin/ld", "'s","q","/opt/openssl/.openssl/lib/libssl.a","''","'m","'re","large","'d","operators","-wl","pre-commit","static","/usr/bin/ld","oqs_meth.c","/opt/openssl/.openssl/lib/libcrypto.a","extensions_clnt.c","oqs_meth.o","cp2k.popt","extensions_srvr.c","e.g","-z",
    "libxsmm_se=1","oqs_kem_free'","oqs_kem_free","base", "ridge", "helper", "moment", "assumed", "workaround", "logic", "psis_bitrev_montgomery", "packets", "fails", "easy", "rely", "generated", "addition", "enforced", "function", "language", "instance", "helped", "years", "/sys/fs/selinux/enforce","underlying",
    "qldbsession.qldbsession","18.04.2","absolutely","compile","separate","packet","configure", "power","cxxompflags","compiled","fails","fail","latest","optimal","assume","making","regular","people","specification","specifications","libxsmm_se=0","extra","extra","buffer","exchange","practical","oqs_kem_new","oqs_sig_free'","oqs-openssl","offline","read","great","company","request","concern","means","ca","hear",
    "implementing","ec","ci","save","storage","json","dictionary","mind","naming","fixes","complete","complete","page","functions","liboqs","qldbdriver","effort","exists","requests","progress","338","operator","module","dig","worked","explicitly","installation","observe","constructor","potential","pointer","oqs","1.10","pip","millions","oqs-chromium","extremely","fine","start","generating","startup","re-used","constants","engine","cases","writing","pages",".text+0x13d",".text+0x489c","oqs_sig_sign'",
    "understand","settings","428","improved","clear","hard","condition","submit","pay","precommit","g","spot","transport","sqrt","pre-computed","pushed","replicate","instructions","task","pays","helpful","swapped","information","rebuild","expected","existing","string","enforce","libraries","crytpo","figure","codes","selection","oqs_kem_alg_is_enabled'","128","ld","re-build","choose","scientific","url","elpa","-l/opt/openssl/oqs/lib","space",
    "checksums","mirror","2c11076","ubuntu","download","preseved","practice","difficult","//github.com/yhyoo93/isogenysignature/blob/master/validate.c","loss","incentive","implications","ulimit","anymore","shy","😅😅","capabilities","ray","calculated","install_elpa.sh","share","replace","t1_lib.c",".text+0x47b","content","suppose","purpose","newhope_n/2","successfully","property","format","stored","storing","pickle","saving","noticable","incoming/outgoing","small","attendee","unnecessary","compare","expensive","holding","longer","load","links","select","strength","give","pickling","integer","conduct",
    "simple","initial","no-check-certificate","returned","classes","chose","response","grow","basic",".text+0x45cc","times","slower","alice","compute","accessible","pipelines","pair","suggest","str","individual","pkey_oqs_digestverify","framework","rate-limited","404","simplify","front","numeric","numerics","plenty","array","cuz","long","oak",
    "extensions_srvr.o","with-ld-opt='-l/opt/openssl/oqs/lib-wl","arguments","oqs_sig_free","stuff","-fopenmp","bad","parameters", "\n",
])

# Container for the tokenised documents (filled in by a later cell).
documents = []

# Collect the file names listed in 'keywords.txt': every line that begins with
# the 'Source File:' marker contributes the remainder of the line, trimmed.
file_names = []
with open('keywords.txt', 'r') as file:
    file_names.extend(
        line.replace('Source File:', '').strip()
        for line in file.read().splitlines()
        if line.startswith('Source File:')
    )


In [52]:
# Build the tokenised corpus: one list of lower-cased, stopword-filtered tokens
# per listed file that exists under `directory`.

# Start from an empty list so that re-running this cell rebuilds the corpus
# instead of appending every document a second time.
documents = []

# Names listed in `file_names` that were not found on disk (reported below
# rather than being skipped silently).
missing_files = []

# A set gives constant-time membership tests; scanning the stopword list for
# every single token is needlessly slow. Membership semantics are unchanged.
stopword_lookup = set(stopwords_list)

for filename in file_names:
    file_path = os.path.join(directory, filename)

    # Skip (but remember) entries that do not point at a regular file
    if not os.path.isfile(file_path):
        missing_files.append(filename)
        continue

    # Pin the encoding so tokenisation does not depend on the platform default.
    # NOTE(review): UTF-8 is assumed -- the emoji entry in the custom stopword
    # list suggests the corpus is UTF-8; adjust if the files use another encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Tokenize the text into words
    tokens = word_tokenize(content)

    # Lower-case each token once, then drop the stopwords
    lowered_tokens = (word.lower() for word in tokens)
    filtered_words = [word for word in lowered_tokens if word not in stopword_lookup]

    # Append the filtered words to the documents list
    documents.append(filtered_words)

if missing_files:
    print(f"Skipped {len(missing_files)} listed file(s) not found in {directory!r}: {missing_files}")


In [53]:
# Seed for the LDA model. LDA training is stochastic; without a fixed seed the
# topics differ on every run, so results cannot be reproduced.
LDA_RANDOM_SEED = 42

# Create a dictionary (token -> integer id mapping) from the documents
dictionary = corpora.Dictionary(documents)

# Fail early with an actionable message instead of an opaque gensim error when
# no usable tokens were collected.
if len(dictionary) == 0:
    raise ValueError(
        "No tokens available for topic modelling: `documents` is empty or "
        "every token was filtered out. Check that the listed files exist."
    )

# Create a corpus (bag of words) from the documents
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Perform LDA topic modeling
lda_model = models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=LDA_RANDOM_SEED,
)

In [54]:
# Save every topic and its top words to the output file.
# - The encoding is explicit so non-ASCII tokens (e.g. emoji) can be written on
#   any platform instead of depending on the platform-default codec.
# - num_topics=-1 asks gensim for all topics; the default would cap the output
#   at 10 topics if the model were ever trained with more.
with open(output_file, 'w', encoding='utf-8') as file:
    for topic_id, topic_words in lda_model.show_topics(num_topics=-1):
        file.write(f"Topic ID: {topic_id}\n")
        file.write(f"Top Words: {topic_words}\n")
        file.write('\n')