# In [None]:
import re
import nltk
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim import corpora, models

# Download necessary NLTK data.
# Recent NLTK releases (3.9+) load the Punkt sentence tokenizer from the
# 'punkt_tab' resource instead of the pickled 'punkt' models. Fetch both so
# that sent_tokenize/word_tokenize find their data whichever NLTK version is
# installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Read the whole input document into memory as UTF-8 text
with open('file.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Preprocess the text
def preprocess_text(text):
    """Split text into sentences and reduce each to lowercase content words.

    Args:
        text: Raw document text.

    Returns:
        A tuple ``(cleaned_sentences, sentences)``. ``sentences`` is the list
        of sentence strings produced by ``sent_tokenize``. ``cleaned_sentences``
        holds one token list per sentence at the same index; a sentence with
        no remaining tokens still contributes an empty list, so the two lists
        always have equal length.
    """
    # Tokenize sentences
    sentences = sent_tokenize(text)

    # Initialize stopwords
    stop_words = set(stopwords.words('english'))

    # Runs of anything that is not a letter: punctuation, digits, underscores
    # and whitespace. For str patterns \W and \d are Unicode-aware, so
    # accented letters are kept rather than stripped out of words.
    non_letters = re.compile(r'[\W\d_]+')

    cleaned_sentences = []
    for sentence in sentences:
        # Substitute a space instead of deleting, so words joined by a hyphen
        # or apostrophe split into their parts ("well-known" -> "well known")
        # instead of fusing into a non-word ("wellknown").
        letters_only = non_letters.sub(' ', sentence)

        # Tokenize words
        words = word_tokenize(letters_only.lower())

        # Remove stopwords
        cleaned_sentences.append([word for word in words if word not in stop_words])

    return cleaned_sentences, sentences

# Run preprocessing; both results hold one entry per sentence, in the same order
cleaned_sentences, sentences = preprocess_text(text)

# Build the vocabulary, then encode every token list as a bag-of-words vector
dictionary = corpora.Dictionary(cleaned_sentences)
corpus = list(map(dictionary.doc2bow, cleaned_sentences))

# Fit an LDA model with a fixed number of topics
num_topics = 5
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=15,
)

# Fetch, for each topic, its id and its top 10 keyword pairs as raw tuples
topics = lda_model.show_topics(
    formatted=False,
    num_words=10,
    num_topics=num_topics,
)

# Generate human-readable topic names based on top keywords
def generate_topic_name(keywords):
    """Build a short heading from a topic's keyword pairs.

    Args:
        keywords: Iterable of two-item pairs whose first item is the keyword
            string; the second item is ignored.

    Returns:
        ``"First and second: third"`` built from the first three keywords
        that are not English stopwords. With fewer than three such keywords,
        the remaining ones capitalized and joined by spaces (an empty string
        when none remain).
    """
    english_stops = set(stopwords.words('english'))
    filtered_keywords = [term for term, _weight in keywords if term not in english_stops]

    # Too few keywords for the three-part heading: fall back to a plain list
    if len(filtered_keywords) < 3:
        return " ".join(map(str.capitalize, filtered_keywords))

    return f"{filtered_keywords[0].capitalize()} and {filtered_keywords[1]}: {filtered_keywords[2]}"

# Derive one display heading per topic id from that topic's keyword pairs
topic_names = {
    topic_number: generate_topic_name(topic_keywords)
    for topic_number, topic_keywords in topics
}

# Report every topic: generated heading first, then its raw keyword list
for topic_id, keywords in topics:
    print(f"Topic {topic_id} Heading: {topic_names[topic_id]}")
    print(f"Keywords: {[word for word, _ in keywords]}")

# Map sentences to topics
def map_sentences_to_topics(lda_model, corpus, sentences):
    """Group sentences under the topic the model scores highest for each.

    Args:
        lda_model: Object exposing
            ``get_document_topics(bow, minimum_probability=...)`` that returns
            a sequence of ``(topic_id, probability)`` pairs.
        corpus: Bag-of-words vectors, one per sentence, at the same index as
            the matching entry in ``sentences``.
        sentences: Original sentence strings.

    Returns:
        Dict mapping each dominant topic id to the list of sentences assigned
        to it, in input order. When several topics tie for the highest score,
        the one listed first in the distribution wins. Sentences for which the
        model reports no topics at all are left out.
    """
    sentence_topics = {}
    for i, sentence_bow in enumerate(corpus):
        # Ask for the unfiltered distribution: under the model's default
        # probability cutoff, a document spread thinly across many topics can
        # come back with no entries. The highest-scoring topic is the same
        # either way whenever the filtered list is non-empty.
        topic_distribution = lda_model.get_document_topics(
            sentence_bow, minimum_probability=0.0
        )
        if not topic_distribution:
            # Nothing to rank; max() would raise ValueError on an empty sequence.
            continue
        dominant_topic = max(topic_distribution, key=lambda pair: pair[1])[0]
        sentence_topics.setdefault(dominant_topic, []).append(sentences[i])
    return sentence_topics

# Bucket every original sentence under the id of its dominant topic
sentence_topics = map_sentences_to_topics(lda_model, corpus, sentences)

# Print the sentences grouped by topics: one generated heading per topic,
# followed by its sentences as a dash-prefixed list
for topic, sents in sentence_topics.items():
    print(f"Heading: {topic_names[topic]}")
    for sent in sents:
        print(f" - {sent}")
    # "\n" plus print's own line ending leaves two blank lines between groups
    print("\n")
