# Import Libraries

In [1]:
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
import PySimpleGUI as sg
from langdetect import detect
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Marathi Lang Stemmer

In [2]:
import os
import re
import spacy
import codecs
import mahaNLP
# Load the spaCy Multilingual model
from mahaNLP.preprocess import Preprocess
# Load spaCy's 'xx_ent_wiki_sm' pipeline once at import time.  In this section
# `nlp` is referenced only by the commented-out lemmatization helpers.
nlp = spacy.load('xx_ent_wiki_sm')  # used to tokenize text into words, punctuation, etc.

def load_text(file_path):
    """Read a UTF-8 text file and return its full contents.

    Parameters:
    file_path (str): Path of the file to read.

    Returns:
    str or None: The file contents, or None when the file does not exist
    (a "File not found" message is printed in that case).  Any other error
    raised while opening or reading propagates to the caller.
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    # The context manager closes the handle once the contents are read.
    with handle:
        return handle.read()

def summarize_marathi_text(text, num_sentences):
    """Check that `text` is detected as Marathi.

    Only the language validation step is implemented here: when the detected
    language code is not 'mr' a message is printed.  The function returns
    None in every case and `num_sentences` is not used.

    Parameters:
    text (str): Text whose language is detected with `detect`.
    num_sentences (int): Requested summary length (currently unused).

    Returns:
    None
    """
    is_marathi = detect(text) == 'mr'  # 'mr' is the Marathi language code
    if not is_marathi:
        print("Please enter text in Marathi language.")
    return None

def preprocess_text(text):
    """Strip unwanted characters from `text` and lowercase the result.

    Every character that is not an ASCII letter, a code point in the range
    U+0900-U+097F, or whitespace is deleted; the remaining text is then
    lowercased.

    Parameters:
    text (str): Raw input text.

    Returns:
    str: The filtered, lowercased text.
    """
    letters_only = re.sub(r'[^a-zA-Z\u0900-\u097F\s]', '', text)
    return letters_only.lower()

# Marathi suffix table, bucketed by nominal suffix length in code points.
# Buckets are tried from the longest (8) down to the shortest (1).  A few
# entries are shorter or longer than their bucket number (e.g. "वर" in
# bucket 4), which is why the stemmer removes len(suffix) characters instead
# of the bucket number.
_MARATHI_SUFFIXES = {
    1: [u"ो", u"े", u"ू", u"ु", u"ी", u"ि", u"ा", u"च"],
    2: [u"चा", u"चे", u"ने", u"नी", u"ना", u"ते", u"ीं", u"तील", u"ात", u"ाँ", u"ां", u"ों", u"ें", u"तच", u"ता", u"ही",
        u"ले"],
    3: [u"ाचा", u"ाचे", u"तील", u"ानी", u"ाने", u"ाना", u"ाते", u"ाती", u"ाता", u"तीं", u"तून", u"तील", u"तही", u"तपण",
        u"कडे", u"ातच", u"हून", u"पणे", u"ाही", u"ाले"],
    4: [u"मधले", u"ातील", u"च्या", u"न्या", u"ऱ्या", u"ख्या", u"वर", u"साठी", u"ातून", u"कडून", u"मुळे", u"वरून",
        u"ातील", u"नीही", u"ातही", u"ातपण", u"ाकडे", u"पाशी", u"ाहून", u"ापणे", u"मधला"],
    5: [u"ामधले", u"ाच्या", u"ान्या", u"ाऱ्या", u"ाख्या", u"ावर", u"ासाठी", u"पासून", u"ाकडून", u"ामुळे", u"ावरून",
        u"कडेही", u"ानीही", u"ापाशी", u"ामधला", u"मध्ये"],
    6: [u"पर्यंत", u"ापासून", u"ाकडेही", u"पूर्वक", u"लेल्या", u"ामध्ये"],
    7: [u"ापर्यंत", u"प्रमाणे", u"तसुद्धा", u"ापूर्वक", u"ालेल्या"],
    8: [u"ाप्रमाणे", u"ातसुद्धा"],
}


def perform_marathi_stemming(text):
    """Split `text` into sentences on '.' and strip Marathi suffixes from each word.

    Each sentence is tokenized on whitespace, hyphenated tokens are split
    into their parts, and every word is checked against the suffix buckets
    from longest to shortest.  A bucket is only consulted while the word is
    more than one character longer than the bucket number.  Matching suffixes
    are removed cumulatively, so a word may lose several suffixes.

    Parameters:
    text (str): Input text; sentences are delimited by '.'.

    Returns:
    list[str]: One space-joined string of stems per '.'-separated segment,
    in the original order.  Segments without words yield an empty string, so
    the list always has exactly as many entries as `text.split(".")`.
    """
    stemmed_sentences = []
    for sentence in text.split("."):
        # Tokenize on any whitespace (spaces, newlines, tabs) so that line
        # breaks never end up embedded inside a word, then split hyphenated
        # tokens into their parts.
        words = []
        for token in sentence.split():
            words.extend(token.split('-'))

        stems = []
        for word in words:
            for bucket in range(8, 0, -1):
                if len(word) > bucket + 1:
                    for suffix in _MARATHI_SUFFIXES[bucket]:
                        if word.endswith(suffix):
                            # Remove exactly the matched suffix; its length
                            # can differ from the bucket number.
                            word = word[:-len(suffix)]
            # Hyphen splitting or stripping can leave an empty string.
            if word:
                stems.append(word)
        stemmed_sentences.append(' '.join(stems))
    return stemmed_sentences

# Function to perform lemmatization for Marathi text
# def perform_marathi_lemmatization(text):
#     # Tokenize the text using spaCy
#     doc = nlp(text)
#     lemmatized_words = []
#     # Iterate through each token and extract the lemma
#     for token in doc:
#         # If the token is not a punctuation or a space
#         if not token.is_punct and not token.is_space:
#             # Append the lemma to the list
#             lemmatized_words.append(token.lemma_)
#     # Return the list of lemmatized words
#     return lemmatized_words

# def perform_marathi_lemmatization(text):
#     doc = nlp(text)
#     lemmatized_words = [token.lemma_ for token in doc]
#     return lemmatized_words

# Marathi text

In [3]:
# marathi_text = '''एका गावात एक छोटा सुंदर मुलगा होता. त्याच्या नावाचं 'राहुल' होतं. राहुलने प्रत्येक दिवस शाळेत आणि घरीचं खेळायला मनापासून वाढवलं. त्याच्या आईबाबांनी त्याला प्रत्येक वेळी सांगितलं, "पुढचं शिका, पुढचं उभा आणि प्रत्येक क्षण आनंदाने जगा." 
# एका दिवशी, राहुलने अपन्या मित्रांसोबत एका आग्रहाच्या खेळाचं स्वागत केलं. त्याच्या मनात वाटत होतं, "माझं स्वप्न याचं सफर कसं सुरू होतं आणि कसं आनंदाने संपतं, ते सर्वांसाठी आणि माझ्या आईबाबांसाठी कसं उपयोगी होतं."
# त्याच्या आग्रहाच्या खेळाचं सुरू होतं आणि राहुल आणि त्याच्या मित्रांनी खूप आनंदाने खेळलं. त्यांना वेळ अद्याप अधिक मजा केला. राहुल आणि त्याच्या मित्रांनी साथीत खेळून खूप आनंदाने वेळ व्यतीत केलं.
# खेळाच्या शेवटी, राहुलने समजलं की, सफर महत्त्वाचं असतं. त्याचं उद्दिष्टं नक्कीपणा प्राप्त करणं आणि त्याचं स्वप्न पूर्ण करणं हे महत्त्वाचं आहे. त्याने निरंतर यशस्वीपणे काम केलं आणि आपल्या स्वप्नांचं सफर पूर्ण केलं.
# अखेर, त्याचं उद्दिष्ट साधलं आणि त्याचं स्वप्न साकार झालं. त्याचं परिश्रम आणि आत्मविश्वास ने त्याला प्रत्येकाला आणि समुदायाला प्रेरित केलं. राहुल आणि त्याचे मित्र एकत्र येऊन एकाच नात्याने सामाजिक सेवेत सहभागी झाले आणि स्वप्न साकार झाले.
# सद्यस्थितीत, राहुलने आपलं स्वप्न पूर्ण केलं, आणि त्याने प्रत्येक क्षण आनंदाने जगा.'''

# Summarize in lines

In [4]:
# num_sentences = 5

# Preprocessing Text

In [5]:
# preproceed_text = preprocess_text(marathi_text)

# lemmatization

In [6]:
# lemmatized_words = perform_marathi_lemmatization(marathi_text)
# print(lemmatized_words)

In [7]:
# # The issue is occurring because the input text is not properly tokenized before performing lemmatization. In the provided code, the input text is split by periods (`'.'`) to separate sentences, which may not work correctly for all cases, especially if there are punctuation marks within the sentences.

# # To fix this issue, we need to ensure proper tokenization of the input text before performing lemmatization. We can use Spacy for tokenization and then iterate through each token to extract its lemma. Here's the modified code:

# # ```python
# import spacy

# # Load the spaCy Multilingual model
# nlp = spacy.load('xx_ent_wiki_sm')

# # Function to perform lemmatization for Marathi text
# def perform_marathi_lemmatization(text):
#     # Tokenize the text using spaCy
#     doc = nlp(text)
#     lemmatized_words = []
#     # Iterate through each token and extract the lemma
#     for token in doc:
#         # If the token is not a punctuation or a space
#         if not token.is_punct and not token.is_space:
#             # Append the lemma to the list
#             lemmatized_words.append(token.lemma_)
#     # Return the list of lemmatized words
#     return lemmatized_words

# # Example usage:
# marathi_text = '''एका गावात एक छोटा सुंदर मुलगा होता. त्याच्या नावाचं 'राहुल' होतं. राहुलने प्रत्येक दिवस शाळेत आणि घरीचं खेळायला मनापासून वाढवलं. त्याच्या आईबाबांनी त्याला प्रत्येक वेळी सांगितलं, "पुढचं शिका, पुढचं उभा आणि प्रत्येक क्षण आनंदाने जगा." 
# एका दिवशी, राहुलने अपन्या मित्रांसोबत एका आग्रहाच्या खेळाचं स्वागत केलं. त्याच्या मनात वाटत होतं, "माझं स्वप्न याचं सफर कसं सुरू होतं आणि कसं आनंदाने संपतं, ते सर्वांसाठी आणि माझ्या आईबाबांसाठी कसं उपयोगी होतं."
# त्याच्या आग्रहाच्या खेळाचं सुरू होतं आणि राहुल आणि त्याच्या मित्रांनी खूप आनंदाने खेळलं. त्यांना वेळ अद्याप अधिक मजा केला. राहुल आणि त्याच्या मित्रांनी साथीत खेळून खूप आनंदाने वेळ व्यतीत केलं.
# खेळाच्या शेवटी, राहुलने समजलं की, सफर महत्त्वाचं असतं. त्याचं उद्दिष्टं नक्कीपणा प्राप्त करणं आणि त्याचं स्वप्न पूर्ण करणं हे महत्त्वाचं आहे. त्याने निरंतर यशस्वीपणे काम केलं आणि आपल्या स्वप्नांचं सफर पूर्ण केलं.
# अखेर, त्याचं उद्दिष्ट साधलं आणि त्याचं स्वप्न साकार झालं. त्याचं परिश्रम आणि आत्मविश्वास ने त्याला प्रत्येकाला आणि समुदायाला प्रेरित केलं. राहुल आणि त्याचे मित्र एकत्र येऊन एकाच नात्याने सामाजिक सेवेत सहभागी झाले आणि स्वप्न साकार झाले.
# सद्यस्थितीत, राहुलने आपलं स्वप्न पूर्ण केलं, आणि त्याने प्रत्येक क्षण आनंदाने जगा.'''
# lemmatized_words = perform_marathi_lemmatization(marathi_text)
# print(lemmatized_words)
# # ```

# # This should correctly perform lemmatization for the provided Marathi text.

In [8]:
# perform_marathi_lemmatization(preproceed_text)

# stemming

In [9]:
# Stemmed_text = perform_marathi_stemming(marathi_text)

In [10]:
# Stemmed_text

This function calculates the number of bigrams in a given sentence by tokenizing the sentence into words and generating all possible pairs of adjacent words, representing bigrams.

In [11]:
# def calculate_bigram_length(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
    
#     # Generate bigrams
#     bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
    
#     # Return the number of bigrams
#     return len(bigrams)

This function calculates the number of trigrams in a given sentence by tokenizing the sentence into words and generating all possible sequences of three consecutive words, representing trigrams.

In [12]:
# def calculate_trigram_length(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
    
#     # Generate trigrams
#     trigrams = [(words[i], words[i + 1], words[i + 2]) for i in range(len(words) - 2)]
    
#     # Return the number of trigrams
#     return len(trigrams)

This function takes a sentence as input, calculates the TF-ISF value for each term in the sentence, and returns the average TF-ISF value. TF-ISF is a measure used in information retrieval to assess the importance of terms in a document relative to the entire corpus.

In [13]:
# def calculate_tf_isf(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
    
#     # Calculate term frequency
#     term_freq = {}
#     for word in words:
#         term_freq[word] = term_freq.get(word, 0) + 1
    
#     # Calculate inverse sentence frequency
#     inverse_sentence_freq = 1 / len(words)
    
#     # Calculate TF-ISF for each term
#     tf_isf = {}
#     for word, freq in term_freq.items():
#         tf_isf[word] = freq * inverse_sentence_freq

#     summ = 0
#     for val in  tf_isf.values():
#         summ += val
#     tf_isf = summ/len(words)
    
#     return tf_isf

This function takes a sentence and the maximum sentence length in a document as input, calculates the ratio of the sentence's length to the maximum length, and returns this ratio as the sentence length factor. 

In [14]:
# def calculate_sentence_length_factor(sentence, max_sentence_length):
#     # Tokenize the sentence into words
#     words = sentence.split()
    
#     # Calculate the length of the sentence
#     sentence_length = len(words)
    
#     # Calculate the sentence length factor
#     sentence_length_factor = sentence_length / max_sentence_length
    
#     return sentence_length_factor

This function takes a sentence as input, counts the number of numeric tokens in the sentence, calculates the ratio of numeric tokens to the total number of tokens, and returns this ratio.

In [15]:
# def calculate_numeric_tokens_ratio(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
    
#     # Count the number of numeric tokens
#     numeric_count = sum(1 for word in words if word.isdigit())
    
#     # Calculate the ratio of numeric tokens to total tokens
#     if len(words) > 0:
#         numeric_tokens_ratio = numeric_count / len(words)
#     else:
#         numeric_tokens_ratio = 0.0
    
#     return numeric_tokens_ratio

This function computes a position factor that helps prioritize or weight sentences based on their position within the document: sentences that appear earlier receive a higher factor.

In [16]:
# def calculate_pos_factor(total_sentences, current_pos):
#     """
#     Calculate the Position Factor (POS factor) for a given sentence.

#     Parameters:
#     total_sentences (int): Total number of sentences in the document.
#     current_pos (int): Position of the current sentence within the document.

#     Returns:
#     float: The calculated Position Factor.
#     """
#     return (total_sentences - current_pos) / total_sentences


The Thematic Number helps in assessing the thematic relevance or concentration of keywords within a sentence relative to the main themes or topics of the document.

In [17]:
# def calculate_thematic_number(sentence, keywords):
#     """
#     Calculate the Thematic Number for a given sentence.

#     Parameters:
#     sentence (str): The input sentence.
#     keywords (list): List of keywords representing main themes or topics of the document.

#     Returns:
#     float: The calculated Thematic Number.
#     """
#     # Tokenize the sentence into words
#     words = sentence.split()
    
#     # Count the number of keywords present in the sentence
#     keyword_count = sum(1 for word in words if word in keywords)
    
#     # Calculate the ratio of keywords in the sentence to the total number of keywords
#     if len(keywords) > 0:
#         thematic_number = keyword_count / len(keywords)
#     else:
#         thematic_number = 0.0
    
#     return thematic_number


This function is commonly used in text analysis tasks such as clustering or classification, where the centroid serves as a representative feature vector for the entire document collection.

In [18]:
# def calculate_centroid(preprocessed_text):
#     vectorizer = CountVectorizer()
#     X = vectorizer.fit_transform(preprocessed_text)
#     centroid = np.mean(X.toarray(), axis=0)
#     return centroid

This function is useful when you need to convert sparse matrices (which are memory-efficient but not directly usable for certain operations) into dense matrices (which are easier to work with but may require more memory).

In [19]:

# def convert_sparse_to_dense_general(sparse_matrix, num_rows, num_cols):
#     dense_matrix = np.zeros((num_rows, num_cols), dtype=sparse_matrix.dtype)
#     rows, cols = sparse_matrix.nonzero()
#     data = sparse_matrix.data
#     for row, col, value in zip(rows, cols, data):
#         if row < num_rows and col < num_cols:
#             dense_matrix[row, col] = value
#     return dense_matrix

This function allows for the computation of how similar a sentence is to the overall theme or content represented by the centroid of a document, which can be useful for tasks such as document classification or clustering.

In [20]:
# def calculate_cosine_similarity(sentence, document_centroid):
#     document_vector = np.array([document_centroid])
#     sentence_vector = vectorizer.transform([sentence])
#     sentence_vector = convert_sparse_to_dense_general(sentence_vector, 1, 59)
#     cosine_sim = cosine_similarity(sentence_vector, document_vector)[0][0]
#     return cosine_sim

This function facilitates the evaluation of the importance or relevance of each sentence in the context of the entire document based on various linguistic and statistical properties.

In [21]:
# def calculate_sentence_scores(stemmed_text, max_sentence_length):
#     sentence_scores = {}

#     for idx, sentence in enumerate(stemmed_text):
#         try:
#             # Calculate various factors
#             bigram_length = calculate_bigram_length(sentence)
#             trigram_length = calculate_trigram_length(sentence)
#             tf_isf = calculate_tf_isf(sentence)
#             thematic_number = calculate_thematic_number(sentence, [])
#             sentence_length_factor = calculate_sentence_length_factor(sentence, max_sentence_length)
#             numeric_tokens_ratio = calculate_numeric_tokens_ratio(sentence)
#             # Calculate sentence score
#             sentence_score = (
#                 bigram_length
#                 + trigram_length
#                 + tf_isf
#                 + thematic_number
#                 + sentence_length_factor
#                 + numeric_tokens_ratio
#             )
#         except: 
#             sentence_score = 0
#         sentence_scores[idx] = sentence_score
#     return sentence_scores

This function provides a concise summary of the original text by extracting the most important sentences based on their assigned scores.







In [22]:
# def generate_summary(original_text, stemmed_text, sentence_scores, num_sentences=5):
#     # Sort the sentences based on scores (in descending order)
#     sorted_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)
#     # Select the top num_sentences sentences for the summary
#     selected_sentences = sorted_sentences[:num_sentences]
#     # Sort selected sentences based on their original order
#     selected_sentences.sort(key=lambda x: x[0])
#     doc = original_text.split(".")
#     # Generate the summary by joining selected sentences with periods
#     summary = '. '.join(doc[idx] for idx, _ in selected_sentences) + '.'
#     return summary

By following these steps, the provided code is able to preprocess the Marathi text, compute relevant metrics, and generate a summary that captures the key information from the original text.

In [23]:
# # Preprocess text
# preprocessed_text = preprocess_text(marathi_text)
# stemmed_text = preprocessed_text.split(".")
        
# # Calculate total sentences
# total_sentences = len(stemmed_text)
        
# # Calculate maximum sentence length
# max_sentence_length = max(len(sentence.split()) for sentence in stemmed_text)
        
# # Calculate document centroid
# document_centroid = calculate_centroid(stemmed_text)
        
# # Calculate sentence scores
# sentence_scores = calculate_sentence_scores(stemmed_text, max_sentence_length)
        
# # Generate summary
# summary = generate_summary(marathi_text, stemmed_text, sentence_scores, num_sentences)

# GUI

In [24]:
# import PySimpleGUI as sg
# from langdetect import detect


# # Define the layout of the GUI
# layout = [
#     [sg.Text('Enter your Marathi text:')],
#     [sg.Multiline(key='-TEXT-', size=(100, 12))],
#     [sg.Text('Enter the desired number of sentences for the summary:')],
#     [sg.InputText(key='-NUM_SENTENCES-'), sg.Button('Generate Summary'), sg.Button('Reset'), sg.Button('Exit')],
#     [sg.Multiline(size=(100, 12), key='-OUTPUT-')],
# ]

# # Create the window
# window = sg.Window('Marathi Text Summarizer', layout, size=(700, 520))

# # Define the functions needed for summarization
# def preprocess_text(text):
#     # Preprocessing steps
#     return text

# # def calculate_centroid(preproceed_text):
# #     vectorizer = TfidfVectorizer()
# #     tfidf_matrix = vectorizer.fit_transform(preproceed_text)
# #     centroid_vector = np.mean(tfidf_matrix, axis=0)
# #     return centroid_vector

# def convert_sparse_to_dense_general(sparse_matrix, num_rows, num_cols):
#     dense_matrix = np.zeros((num_rows, num_cols), dtype=sparse_matrix.dtype)
#     rows, cols = sparse_matrix.nonzero()
#     data = sparse_matrix.data
#     for row, col, value in zip(rows, cols, data):
#         if row < num_rows and col < num_cols:
#             dense_matrix[row, col] = value
#     return dense_matrix


# def calculate_sentence_scores(stemmed_text, total_sentences,  max_sentence_length):
#     sentence_scores = {}

#     for idx, sentence in enumerate(stemmed_text):
#         try:
#             pos_factor = calculate_pos_factor(total_sentences, idx + 1)
#             bigram_length = calculate_bigram_length(sentence)
#             trigram_length = calculate_trigram_length(sentence)
#             tf_isf = calculate_tf_isf(sentence)
#             # cosine_similarity_score = calculate_cosine_similarity(sentence, document_centroid)
#             thematic_number = calculate_thematic_number(sentence,[])
#             sentence_length_factor = calculate_sentence_length_factor(sentence, max_sentence_length)
#             numeric_tokens_ratio = calculate_numeric_tokens_ratio(sentence)
#             sentence_score = (
#                 pos_factor
#                 + bigram_length
#                 + trigram_length
#                 + tf_isf
#                 # + cosine_similarity_score
#                 + thematic_number
#                 + sentence_length_factor
#                 + numeric_tokens_ratio
#             )
#         except: 
#             sentence_score = 0
#         sentence_scores[idx] = sentence_score
#     return sentence_scores

# def generate_summary(original_text, stemmed_text, sentence_scores, num_sentences=5):
#     # Sort the sentences based on scores (in descending order)
#     sorted_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)
#     # Select the top num_sentences sentences for the summary
#     selected_sentences = sorted_sentences[:num_sentences]
#     # Sort selected sentences based on their original order
#     selected_sentences.sort(key=lambda x: x[0])
#     doc = original_text.split(".")
#     # Generate the summary by joining selected sentences
#     summary = ' '.join(doc[idx] for idx, _ in selected_sentences)
#     return summary


# def calculate_bigram_length(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Generate bigrams
#     bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
#     # Return the number of bigrams
#     return len(bigrams)


# def calculate_trigram_length(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Generate trigrams
#     trigrams = [(words[i], words[i + 1], words[i + 2]) for i in range(len(words) - 2)]
#     # Return the number of trigrams
#     return len(trigrams)


# def calculate_tf_isf(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Calculate term frequency
#     term_freq = {}
#     for word in words:
#         term_freq[word] = term_freq.get(word, 0) + 1
#     # Calculate inverse sentence frequency
#     inverse_sentence_freq = 1 / len(words)
#     # Calculate TF-ISF for each term
#     tf_isf = {}
#     for word, freq in term_freq.items():
#         tf_isf[word] = freq * inverse_sentence_freq
#     summ = 0
#     for val in  tf_isf.values():
#         summ += val
#     tf_isf = summ/len(words)
#     return tf_isf


# def calculate_sentence_length_factor(sentence, max_sentence_length):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Calculate the length of the sentence
#     sentence_length = len(words)
#     # Calculate the sentence length factor
#     sentence_length_factor = sentence_length / max_sentence_length
#     return sentence_length_factor


# def calculate_numeric_tokens_ratio(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Count the number of numeric tokens
#     numeric_count = sum(1 for word in words if word.isdigit())
#     # Calculate the ratio of numeric tokens to total tokens
#     if len(words) > 0:
#         numeric_tokens_ratio = numeric_count / len(words)
#     else:
#         numeric_tokens_ratio = 0.0
#     return numeric_tokens_ratio


# def calculate_pos_factor(total_sentences, current_pos):
#     return (total_sentences - current_pos) / total_sentences


# def calculate_thematic_number(sentence, keywords):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Count the number of keywords present in the sentence
#     keyword_count = sum(1 for word in words if word in keywords)
#     # Calculate the ratio of keywords in the sentence to the total number of keywords
#     if len(keywords) > 0:
#         thematic_number = keyword_count / len(keywords)
#     else:
#         thematic_number = 0.0
#     return thematic_number


# def calculate_centroid(preprocessed_text):
#     vectorizer = CountVectorizer()
#     X = vectorizer.fit_transform(preprocessed_text)
#     centroid = np.mean(X.toarray(), axis=0)
#     return centroid

# def cosine_similarity_with_centroid_marathi(sentence, centroid, vectorizer):
#     sentence_vector = vectorizer.transform([sentence]).toarray()
#     similarity = cosine_similarity(sentence_vector, [centroid])
#     print("similarity: ", similarity[0][0])
#     return similarity[0][0]

# def convert_sparse_to_dense_general(sparse_matrix, num_rows, num_cols):
#     dense_matrix = np.zeros((num_rows, num_cols), dtype=sparse_matrix.dtype)
#     rows, cols = sparse_matrix.nonzero()
#     data = sparse_matrix.data
#     for row, col, value in zip(rows, cols, data):
#         if row < num_rows and col < num_cols:
#             dense_matrix[row, col] = value
#     return dense_matrix

# def calculate_cosine_similarity(sentence, document_centroid):
#     document_vector = np.array([document_centroid])
#     vectorizer = TfidfVectorizer()
#     sentence_vector = vectorizer.fit_transform([sentence])
#     sentence_vector = convert_sparse_to_dense_general(sentence_vector, 1, 59)
#     cosine_sim = cosine_similarity(sentence_vector, document_vector)[0][0]
#     return cosine_sim


# # Event loop
# while True:
#     event, values = window.read()
#     if event == sg.WINDOW_CLOSED or event == 'Exit':
#         break
#     elif event == 'Generate Summary' or (event == '\r' and values['-TEXT-'] != ''):
#         # Get input values
#         marathi_text = values['-TEXT-']
#         num_sentences = int(values['-NUM_SENTENCES-']) if values['-NUM_SENTENCES-'] else 5
        
#         # Perform language detection
#         try:
#             if detect(marathi_text) != 'mr':  # 'mr' is the ISO code for Marathi
#                 raise ValueError('Please input Marathi text only.')
#         except:
#             sg.popup_error('Please input Marathi text only.')
#             continue
        
#         # Preprocess text
#         preprocessed_text = preprocess_text(marathi_text)
#         stemmed_text = perform_marathi_stemming(marathi_text)
        
#         # Calculate total sentences
#         total_sentences = len(stemmed_text)
        
#         # Calculate maximum sentence length
#         max_sentence_length = max(len(sentence.split()) for sentence in stemmed_text)
        
#         # Calculate sentence scores
#         sentence_scores = calculate_sentence_scores(stemmed_text, total_sentences, max_sentence_length)
        
#         # Generate summary
#         summary = generate_summary(marathi_text, stemmed_text, sentence_scores, num_sentences)
        
#         # Update output in GUI
#         # sg.Print(summary)  # Print in the output element
#         window['-OUTPUT-'].update(summary)
        
#     elif event == 'Reset':
#         # Clear input and output fields
#         window['-TEXT-'].update('')
#         window['-NUM_SENTENCES-'].update('')
#         window['-OUTPUT-'].update('')

# # Close the window
# window.close()

# Optimized

In this optimized version:

Redundant operations are minimized.

The vectorizer is initialized once outside the loop.

Preprocessing steps are simplified.

Error handling is improved.

Function calls and parameter passing are streamlined.

In [25]:
import PySimpleGUI as sg
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Define the layout of the GUI, one inner list per row:
#   1. prompt label
#   2. multiline input for the Marathi text (key '-TEXT-')
#   3. label for the summary length
#   4. summary-length input (key '-NUM_SENTENCES-') plus the three buttons
#   5. multiline field that receives the summary (key '-OUTPUT-')
layout = [
    [sg.Text('Enter your Marathi text:')],
    [sg.Multiline(key='-TEXT-', size=(100, 12))],
    [sg.Text('Enter the desired number of sentences for the summary:(default 5)')],
    [sg.InputText(key='-NUM_SENTENCES-'), sg.Button('Generate Summary'), sg.Button('Reset'), sg.Button('Exit')],
    [sg.Multiline(size=(100, 12), key='-OUTPUT-')],
]

# Create the window
window = sg.Window('Marathi Text Summarizer', layout, size=(700, 520))

# Initialize the TF-IDF vectorizer used by calculate_cosine_similarity.
# NOTE(review): no fit/fit_transform call on this instance is visible in this
# section, so transform() would fail unless it is fitted elsewhere - confirm.
vectorizer = TfidfVectorizer()

# Define the functions needed for summarization
def preprocess_text(text):
    """Return `text` unchanged.

    Placeholder for preprocessing steps: no cleaning is performed here.
    This definition reuses the name of the regex-based `preprocess_text`
    defined earlier in the file, so it replaces that one once this cell runs.
    """
    # Placeholder for preprocessing steps
    return text

def calculate_sentence_scores(stemmed_text, max_sentence_length):
    """Score every sentence by summing its individual feature values.

    The score of a sentence is the sum of its bigram count, trigram count,
    TF-ISF value, thematic number (computed with an empty keyword list),
    sentence length factor and numeric token ratio.

    Parameters:
    stemmed_text (iterable of str): Sentences to score, in document order.
    max_sentence_length (int): Word count of the longest sentence, passed to
        calculate_sentence_length_factor.

    Returns:
    dict: Maps each sentence index to its score.  A sentence whose features
    cannot be computed (for example an empty sentence that triggers a
    division by zero in a helper) gets the score 0.
    """
    sentence_scores = {}

    for idx, sentence in enumerate(stemmed_text):
        try:
            sentence_score = (
                calculate_bigram_length(sentence)
                + calculate_trigram_length(sentence)
                + calculate_tf_isf(sentence)
                + calculate_thematic_number(sentence, [])
                + calculate_sentence_length_factor(sentence, max_sentence_length)
                + calculate_numeric_tokens_ratio(sentence)
            )
        except Exception:
            # Best effort: an unscorable sentence ranks last.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            sentence_score = 0
        sentence_scores[idx] = sentence_score
    return sentence_scores


def generate_summary(original_text, stemmed_text, sentence_scores, num_sentences=5):
    """Build a summary from the highest-scoring sentences of `original_text`.

    The original text is split on '.', the `num_sentences` best-scoring
    non-blank sentences are picked, put back into document order, stripped
    of surrounding whitespace and joined with '. '.

    Parameters:
    original_text (str): The unmodified input text.
    stemmed_text: Unused; kept so existing callers keep working.
    sentence_scores (dict): Maps a sentence index (position in
        `original_text.split(".")`) to its score.
    num_sentences (int): Maximum number of sentences in the summary;
        values below zero are treated as zero.

    Returns:
    str: The summary terminated by a period, or an empty string when no
    sentence qualifies.
    """
    sentences = original_text.split(".")

    # Ignore indices that do not refer to a real sentence: out-of-range keys
    # and blank segments (such as the empty tail after a final period).
    # Otherwise they would use up summary slots and leave stray " ." output.
    candidates = [
        (idx, score)
        for idx, score in sentence_scores.items()
        if 0 <= idx < len(sentences) and sentences[idx].strip()
    ]

    # Highest scores first; sorted() is stable, so ties keep document order.
    ranked = sorted(candidates, key=lambda item: item[1], reverse=True)
    selected = ranked[:max(num_sentences, 0)]

    # Restore the original sentence order for readability.
    selected.sort(key=lambda item: item[0])

    chosen = [sentences[idx].strip() for idx, _ in selected]
    if not chosen:
        return ''
    return '. '.join(chosen) + '.'


def calculate_bigram_length(sentence):
    """Return the number of bigrams (adjacent word pairs) in `sentence`.

    The sentence is split on whitespace; a sentence with n words has
    n - 1 bigrams, and sentences with fewer than two words have none.
    """
    word_count = len(sentence.split())
    return word_count - 1 if word_count > 1 else 0

def calculate_trigram_length(sentence):
    """Return the number of trigrams (runs of three consecutive words) in `sentence`.

    The sentence is split on whitespace; a sentence with n words has
    n - 2 trigrams, and sentences with fewer than three words have none.
    """
    word_count = len(sentence.split())
    return word_count - 2 if word_count > 2 else 0

def calculate_tf_isf(sentence):
    """Return the averaged TF-ISF value of `sentence`.

    Each distinct word contributes its frequency multiplied by
    1 / len(words); the contributions are summed and divided by the number
    of words in the sentence.

    NOTE(review): the "inverse sentence frequency" is computed from this
    sentence alone (no corpus statistics are used), so the result is
    mathematically 1 / len(words) up to floating-point rounding.

    Parameters:
    sentence (str): Sentence to score; tokenized on whitespace.

    Returns:
    float: The averaged value, or 0.0 for a sentence without words
    (instead of raising ZeroDivisionError).
    """
    from collections import Counter

    words = sentence.split()
    if not words:
        return 0.0

    term_freq = Counter(words)
    inverse_sentence_freq = 1 / len(words)
    return sum(freq * inverse_sentence_freq for freq in term_freq.values()) / len(words)

def calculate_sentence_length_factor(sentence, max_sentence_length):
    """Return the sentence's word count relative to `max_sentence_length`.

    Parameters:
    sentence (str): Sentence to measure; tokenized on whitespace.
    max_sentence_length (int): Word count used as the denominator.

    Returns:
    float: len(words) / max_sentence_length.
    """
    return len(sentence.split()) / max_sentence_length

def calculate_numeric_tokens_ratio(sentence):
    """Return the share of whitespace-separated tokens that are all digits.

    A token counts as numeric when `str.isdigit()` is true for it.

    Returns:
    float: numeric tokens / total tokens, or 0.0 for a sentence without tokens.
    """
    tokens = sentence.split()
    if not tokens:
        return 0.0
    digit_tokens = [tok for tok in tokens if tok.isdigit()]
    return len(digit_tokens) / len(tokens)

def calculate_thematic_number(sentence, keywords):
    """Return the keyword hits in `sentence` divided by the number of keywords.

    Every word occurrence that appears in `keywords` counts as one hit, so
    repeated keywords are counted repeatedly.

    Parameters:
    sentence (str): Sentence to inspect; tokenized on whitespace.
    keywords (list): Keywords to look for.

    Returns:
    float: hits / len(keywords), or 0.0 when `keywords` is empty.
    """
    tokens = sentence.split()
    if not keywords:
        return 0.0
    hits = 0
    for token in tokens:
        if token in keywords:
            hits += 1
    return hits / len(keywords)

def calculate_centroid(preprocessed_text):
    """Return the mean count vector of the given sentences.

    A fresh CountVectorizer is fitted on `preprocessed_text`, the resulting
    matrix is converted to a dense array, and its rows are averaged.

    Parameters:
    preprocessed_text (iterable of str): The sentences of the document.

    Returns:
    numpy.ndarray: One mean count per vocabulary term of the local vectorizer.
    """
    # A local vectorizer is used here, not the module-level `vectorizer`.
    counts = CountVectorizer().fit_transform(preprocessed_text).toarray()
    return np.mean(counts, axis=0)

def convert_sparse_to_dense_general(sparse_matrix, num_rows, num_cols):
    """Copy a SciPy sparse matrix into a dense array of a chosen shape.

    Entries whose row or column falls outside the requested shape are
    dropped; positions not covered by the sparse matrix stay zero, so the
    result is a cropped or zero-padded dense copy.

    Parameters:
    sparse_matrix: A SciPy sparse matrix (anything providing `tocoo()`).
    num_rows (int): Number of rows of the dense result.
    num_cols (int): Number of columns of the dense result.

    Returns:
    numpy.ndarray: Array of shape (num_rows, num_cols) with the dtype of
    `sparse_matrix`.
    """
    dense_matrix = np.zeros((num_rows, num_cols), dtype=sparse_matrix.dtype)

    # Take coordinates and values from the same COO view so they stay
    # aligned.  Pairing nonzero() with the raw .data array misaligns as soon
    # as the matrix stores explicit zeros, because nonzero() skips them
    # while .data does not.
    coo = sparse_matrix.tocoo()
    inside = (coo.row < num_rows) & (coo.col < num_cols)

    # add.at accumulates repeated coordinates, matching sparse semantics.
    np.add.at(dense_matrix, (coo.row[inside], coo.col[inside]), coo.data[inside])
    return dense_matrix

def calculate_cosine_similarity(sentence, document_centroid):
    """Return the cosine similarity between a sentence and a document centroid.

    Args:
        sentence: Sentence text to vectorize.
        document_centroid: 1-D centroid vector to compare against.

    Returns:
        The single similarity value produced by ``cosine_similarity``.
    """
    # NOTE(review): relies on the module-level `vectorizer`, which must already
    # be fitted on the same vocabulary the centroid was built from - confirm.
    document_vector = np.array([document_centroid])
    sentence_vector = vectorizer.transform([sentence])
    # Match the dense sentence vector to the centroid's width instead of a
    # fixed 59 columns, which only worked for one particular vocabulary size.
    num_cols = document_vector.shape[1]
    sentence_vector = convert_sparse_to_dense_general(sentence_vector, 1, num_cols)
    return cosine_similarity(sentence_vector, document_vector)[0][0]

# Event loop: handle GUI events until the window is closed or 'Exit' is pressed.
while True:
    event, values = window.read()
    if event == sg.WINDOW_CLOSED or event == 'Exit':
        break
    elif event == 'Generate Summary' or (event == '\r' and values['-TEXT-'] != ''):
        # Get input values
        marathi_text = values['-TEXT-']

        # An empty count field falls back to 5 sentences. Anything that is not
        # a positive whole number is reported to the user instead of raising
        # ValueError and terminating the loop.
        raw_num_sentences = values['-NUM_SENTENCES-']
        try:
            num_sentences = int(raw_num_sentences) if raw_num_sentences else 5
        except ValueError:
            sg.popup_error('Please enter a whole number for the number of sentences.')
            continue
        if num_sentences < 1:
            sg.popup_error('The number of sentences must be at least 1.')
            continue

        # Perform language detection ('mr' is the ISO code for Marathi).
        # detect() can raise on text it cannot analyse (e.g. empty input);
        # treat that the same as "not Marathi". Catch Exception rather than
        # using a bare except so KeyboardInterrupt/SystemExit still propagate.
        try:
            is_marathi = detect(marathi_text) == 'mr'
        except Exception:
            is_marathi = False
        if not is_marathi:
            sg.popup_error('Please input Marathi text only.')
            continue

        # Preprocess text and split it into sentences on '.'
        preprocessed_text = preprocess_text(marathi_text)
        stemmed_text = preprocessed_text.split(".")

        # Calculate total sentences
        total_sentences = len(stemmed_text)

        # Calculate maximum sentence length (in words)
        max_sentence_length = max(len(sentence.split()) for sentence in stemmed_text)

        # Calculate document centroid
        document_centroid = calculate_centroid(stemmed_text)

        # Calculate sentence scores
        sentence_scores = calculate_sentence_scores(stemmed_text, max_sentence_length)

        # Generate summary
        summary = generate_summary(marathi_text, stemmed_text, sentence_scores, num_sentences)

        # Update output in GUI
        window['-OUTPUT-'].update(summary)

    elif event == 'Reset':
        # Clear input and output fields
        window['-TEXT-'].update('')
        window['-NUM_SENTENCES-'].update('')
        window['-OUTPUT-'].update('')

# Close the window
window.close()


# Text File Import

In [26]:
# import PySimpleGUI as sg

# # Define the layout of the GUI
# layout = [
#     [sg.Text('Select Marathi text file:'), sg.FileBrowse(key='-FILE-')],
#     [sg.Text('Or enter your Marathi text:')],
#     [sg.Multiline(key='-TEXT-', size=(100, 12))],
#     [sg.Text('Enter the desired number of sentences for the summary:')],
#     [sg.InputText(key='-NUM_SENTENCES-'), sg.Button('Generate Summary'), sg.Button('Reset'), sg.Button('Exit')],
#     [sg.Multiline(size=(100, 12), key='-OUTPUT-')],
# ]

# # Create the window
# window = sg.Window('Marathi Text Summarizer', layout, size=(700, 520))

# # Initialize the TF-IDF vectorizer
# vectorizer = TfidfVectorizer()

# # Define the functions needed for summarization
# def preprocess_text(text):
#     # Placeholder for preprocessing steps
#     return text

# def calculate_sentence_scores(stemmed_text, max_sentence_length):
#     sentence_scores = {}

#     for idx, sentence in enumerate(stemmed_text):
#         try:
#             # Calculate various factors
#             bigram_length = calculate_bigram_length(sentence)
#             trigram_length = calculate_trigram_length(sentence)
#             tf_isf = calculate_tf_isf(sentence)
#             thematic_number = calculate_thematic_number(sentence, [])
#             sentence_length_factor = calculate_sentence_length_factor(sentence, max_sentence_length)
#             numeric_tokens_ratio = calculate_numeric_tokens_ratio(sentence)
#             # Calculate sentence score
#             sentence_score = (
#                 bigram_length
#                 + trigram_length
#                 + tf_isf
#                 + thematic_number
#                 + sentence_length_factor
#                 + numeric_tokens_ratio
#             )
#         except: 
#             sentence_score = 0
#         sentence_scores[idx] = sentence_score
#     return sentence_scores


# def generate_summary(original_text, stemmed_text, sentence_scores, num_sentences=5):
#     # Sort the sentences based on scores (in descending order)
#     sorted_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)
#     # Select the top num_sentences sentences for the summary
#     selected_sentences = sorted_sentences[:num_sentences]
#     # Sort selected sentences based on their original order
#     selected_sentences.sort(key=lambda x: x[0])
#     doc = original_text.split(".")
#     # Generate the summary by joining selected sentences with periods
#     summary = '. '.join(doc[idx] for idx, _ in selected_sentences) + '.'
#     return summary


# def calculate_bigram_length(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Generate bigrams
#     bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
#     # Return the number of bigrams
#     return len(bigrams)

# def calculate_trigram_length(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Generate trigrams
#     trigrams = [(words[i], words[i + 1], words[i + 2]) for i in range(len(words) - 2)]
#     # Return the number of trigrams
#     return len(trigrams)

# def calculate_tf_isf(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Calculate term frequency
#     term_freq = {}
#     for word in words:
#         term_freq[word] = term_freq.get(word, 0) + 1
#     # Calculate inverse sentence frequency
#     inverse_sentence_freq = 1 / len(words)
#     # Calculate TF-ISF for each term
#     tf_isf = sum(freq * inverse_sentence_freq for freq in term_freq.values()) / len(words)
#     return tf_isf

# def calculate_sentence_length_factor(sentence, max_sentence_length):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Calculate the sentence length factor
#     sentence_length_factor = len(words) / max_sentence_length
#     return sentence_length_factor

# def calculate_numeric_tokens_ratio(sentence):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Count the number of numeric tokens
#     numeric_count = sum(1 for word in words if word.isdigit())
#     # Calculate the ratio of numeric tokens to total tokens
#     numeric_tokens_ratio = numeric_count / len(words) if words else 0.0
#     return numeric_tokens_ratio

# def calculate_thematic_number(sentence, keywords):
#     # Tokenize the sentence into words
#     words = sentence.split()
#     # Calculate the ratio of keywords in the sentence to the total number of keywords
#     thematic_number = sum(1 for word in words if word in keywords) / len(keywords) if keywords else 0.0
#     return thematic_number

# def calculate_centroid(preprocessed_text):
#     vectorizer = CountVectorizer()
#     X = vectorizer.fit_transform(preprocessed_text)
#     centroid = np.mean(X.toarray(), axis=0)
#     return centroid

# def convert_sparse_to_dense_general(sparse_matrix, num_rows, num_cols):
#     dense_matrix = np.zeros((num_rows, num_cols), dtype=sparse_matrix.dtype)
#     rows, cols = sparse_matrix.nonzero()
#     data = sparse_matrix.data
#     for row, col, value in zip(rows, cols, data):
#         if row < num_rows and col < num_cols:
#             dense_matrix[row, col] = value
#     return dense_matrix

# def calculate_cosine_similarity(sentence, document_centroid):
#     document_vector = np.array([document_centroid])
#     sentence_vector = vectorizer.transform([sentence])
#     sentence_vector = convert_sparse_to_dense_general(sentence_vector, 1, 59)
#     cosine_sim = cosine_similarity(sentence_vector, document_vector)[0][0]
#     return cosine_sim


# # Event loop
# while True:
#     event, values = window.read()
#     if event == sg.WINDOW_CLOSED or event == 'Exit':
#         break
#     elif event == 'Generate Summary' or (event == '\r' and values['-TEXT-'] != ''):
#         # Check if the text is from file or input
#         if values['-TEXT-'] == '':
#             # Get input values from file
#             file_path = values['-FILE-']
#             if os.path.exists(file_path):
#                 with open(file_path, 'r', encoding='utf-8') as file:
#                     marathi_text = file.read()
#             else:
#                 sg.popup_error('File not found!')
#                 continue
#         else:
#             # Get input values from text input
#             marathi_text = values['-TEXT-']
        
#         num_sentences = int(values['-NUM_SENTENCES-']) if values['-NUM_SENTENCES-'] else 5
        
#         # Perform language detection
#         try:
#             if detect(marathi_text) != 'mr':  # 'mr' is the ISO code for Marathi
#                 raise ValueError('Please input Marathi text only.')
#         except:
#             sg.popup_error('Please input Marathi text only.')
#             continue
        
#         # Preprocess text
#         preprocessed_text = preprocess_text(marathi_text)
#         stemmed_text = preprocessed_text.split(".")
        
#         # Calculate total sentences
#         total_sentences = len(stemmed_text)
        
#         # Calculate maximum sentence length
#         max_sentence_length = max(len(sentence.split()) for sentence in stemmed_text)
        
#         # Calculate document centroid
#         document_centroid = calculate_centroid(stemmed_text)
        
#         # Calculate sentence scores
#         sentence_scores = calculate_sentence_scores(stemmed_text, max_sentence_length)
        
#         # Generate summary
#         summary = generate_summary(marathi_text, stemmed_text, sentence_scores, num_sentences)
        
#         # Update output in GUI
#         window['-OUTPUT-'].update(summary)
        
#     elif event == 'Reset':
#         # Clear input and output fields
#         window['-TEXT-'].update('')
#         window['-NUM_SENTENCES-'].update('')
#         window['-OUTPUT-'].update('')
        

# # Close the window
# window.close()
