In [1]:
import os
import glob
!pip install pdfminer
!pip install pdfminer.six
import pdfminer.high_level
import pdfminer.layout
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Step 1: Convert PDF files to text format using pdfminer
def pdf_to_text(input_folder):
    """Convert every PDF directly inside ``input_folder`` to a UTF-8 text file.

    Each ``<name>.pdf`` is extracted with ``pdfminer.high_level.extract_text``
    (using default ``LAParams``) and written to ``<input_folder>/text/<name>.txt``.
    The ``text`` subfolder is created if it does not exist yet.

    Args:
        input_folder: Path of the folder that holds the ``*.pdf`` files.

    Raises:
        Whatever ``extract_text`` or the file operations raise; errors are not
        swallowed, so one unreadable PDF stops the run.
    """
    output_folder = os.path.join(input_folder, "text")
    # exist_ok avoids the check-then-create race of an explicit exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed in a stable, filesystem-independent order.
    for pdf_file in sorted(glob.glob(os.path.join(input_folder, "*.pdf"))):
        stem = os.path.splitext(os.path.basename(pdf_file))[0]
        text_file = os.path.join(output_folder, stem + ".txt")

        # Extract BEFORE opening the output file: if extraction fails we must
        # not leave an empty .txt behind that would later be picked up as an
        # (empty) document.
        text = pdfminer.high_level.extract_text(
            pdf_file, laparams=pdfminer.layout.LAParams()
        )
        with open(text_file, 'w', encoding="utf-8") as f:
            f.write(text)

# Step 2: Create a dictionary that contains all the text from each of the text files
def create_dictionary(input_folder):
    """Tokenise every ``*.txt`` file in ``input_folder`` and build a gensim dictionary.

    Each file becomes one document: its text is split into sentences, each
    sentence into word tokens, and every token is lower-cased and lemmatised
    with ``WordNetLemmatizer``. No stop-word or punctuation filtering is
    applied, so such tokens remain in the documents.

    Args:
        input_folder: Path of the folder that holds the ``*.txt`` files.

    Returns:
        A ``(dictionary, documents)`` tuple where ``documents`` is a list of
        token lists (one per file, in sorted file-name order) and
        ``dictionary`` is ``corpora.Dictionary(documents)``.
    """
    # One lemmatizer is enough for the whole run; no need to rebuild it per file.
    lemmatizer = WordNetLemmatizer()
    documents = []

    # glob order is filesystem-dependent; sorting keeps document order (and
    # therefore the dictionary's token ids) reproducible between runs.
    for text_file in sorted(glob.glob(os.path.join(input_folder, "*.txt"))):
        with open(text_file, 'r', encoding="utf-8") as f:
            text = f.read()
        # File is closed here; tokenisation does not need the handle.
        words = [
            lemmatizer.lemmatize(token.lower())
            for sentence in sent_tokenize(text)
            for token in nltk.word_tokenize(sentence)
        ]
        documents.append(words)

    dictionary = corpora.Dictionary(documents)
    return dictionary, documents

# Step 3: Perform topic modelling and visualisation
def topic_modelling_and_visualisation(input_folder, num_topics=5, passes=10, random_state=42):
    """Fit an LDA topic model on the text files in ``input_folder`` and prepare a pyLDAvis view.

    The folder is read via ``create_dictionary``; each document is converted
    to a bag-of-words vector, an ``LdaModel`` is trained on the resulting
    corpus, and the model is handed to ``gensimvis.prepare``.

    Args:
        input_folder: Folder containing the ``*.txt`` documents to model.
        num_topics: Number of topics passed to ``LdaModel`` (default 5).
        passes: Number of training passes passed to ``LdaModel`` (default 10).
        random_state: Seed passed to ``LdaModel`` so repeated runs give the
            same topics; pass ``None`` to leave the model unseeded.

    Returns:
        The object returned by ``gensimvis.prepare`` (suitable for
        ``pyLDAvis.display``).

    Raises:
        ValueError: If no documents were found in ``input_folder``.
    """
    dictionary, documents = create_dictionary(input_folder)
    if not documents:
        # Fail early with an actionable message instead of an error from deep
        # inside the LDA training code.
        raise ValueError(
            f"No .txt documents found in {input_folder!r}; nothing to model."
        )

    corpus = [dictionary.doc2bow(doc) for doc in documents]
    lda_model = models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    lda_visualisation = gensimvis.prepare(lda_model, corpus, dictionary)
    return lda_visualisation

# Run the code and display the visualisation
input_folder = "/content/drive/MyDrive/test"  # folder holding the source PDFs
pdf_to_text(input_folder)

# pdf_to_text writes its .txt files into the "text" subfolder of the input
# folder, so the modelling step has to read from there; pointing it at
# input_folder itself would not find the text files that were just produced.
text_folder = os.path.join(input_folder, "text")
lda_visualisation = topic_modelling_and_visualisation(text_folder)
pyLDAvis.display(lda_visualisation)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome
  Downloading pycryptodome-3.17-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140051 sha256=56b888de55ec26e0d479dbcc480dfecd283660ed344049b69b4bd6e155bdef6d
  Stored in directory: /root/.cache/pip/wheels/d1/aa/48/370f83a970d62355a2a47d2d640094a64eea932c22edae1891
Successfully built pdfminer
Ins

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


ModuleNotFoundError: ignored