In [1]:
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
import PySimpleGUI as sg
from langdetect import detect
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import os
import re
import spacy
import codecs
import mahaNLP
# Load the spaCy Multilingual model
from mahaNLP.preprocess import Preprocess
nlp = spacy.load('xx_ent_wiki_sm')

def load_text(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return text
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

def preprocess_text(text):
    # Remove punctuation and digits
    text = re.sub(r'[^a-zA-Z\u0900-\u097F\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

def perform_marathi_stemming(text):
    doc = text.split(".")
    suffixes = {
        1: [u"ो", u"े", u"ू", u"ु", u"ी", u"ि", u"ा", u"च"],
        2: [u"चा", u"चे", u"ने", u"नी", u"ना", u"ते", u"ीं", u"तील", u"ात", u"ाँ", u"ां", u"ों", u"ें", u"तच", u"ता", u"ही",
            u"ले"],
        3: [u"ाचा", u"ाचे", u"तील", u"ानी", u"ाने", u"ाना", u"ाते", u"ाती", u"ाता", u"तीं", u"तून", u"तील", u"तही", u"तपण",
            u"कडे", u"ातच", u"हून", u"पणे", u"ाही", u"ाले"],
        4: [u"मधले", u"ातील", u"च्या", u"न्या", u"ऱ्या", u"ख्या", u"वर", u"साठी", u"ातून", u"कडून", u"मुळे", u"वरून",
            u"ातील", u"नीही", u"ातही", u"ातपण", u"ाकडे", u"पाशी", u"ाहून", u"ापणे", u"मधला"],
        5: [u"ामधले", u"ाच्या", u"ान्या", u"ाऱ्या", u"ाख्या", u"ावर", u"ासाठी", u"पासून", u"ाकडून", u"ामुळे", u"ावरून",
            u"कडेही", u"ानीही", u"ापाशी", u"ामधला", u"मध्ये"],
        6: [u"पर्यंत", u"ापासून", u"ाकडेही", u"पूर्वक", u"लेल्या", u"ामध्ये"],
        7: [u"ापर्यंत", u"प्रमाणे", u"तसुद्धा", u"ापूर्वक", u"ालेल्या"],
        8: [u"ाप्रमाणे", u"ातसुद्धा"],
    }
    preprocessed_text = []
    # print(doc)
    for each in doc:
        tokens = each.split(' ')
        cleaned_tokens = []
        for tok in tokens:
            if '-' in tok:
                subtokens = tok.split('-')
                cleaned_tokens.extend(subtokens)
            else:
                cleaned_tokens.append(tok.strip())

        stems = []
        for word in cleaned_tokens:
            for i in range(8, 0, -1):
                if len(word) > i + 1:
                    for suf in suffixes[i]:
                        if word.endswith(suf):
                            word = word[:-i]
            if word:
                stems.append(word)
        if stems:
            preprocessed_text.append(' '.join(stems))
        # preprocessed_text += "."
    return preprocessed_text

# def perform_marathi_lemmatization(text):
#     doc = nlp(text)
#     lemmatized_words = [token.lemma_ for token in doc]
#     return lemmatized_words

In [3]:
marathi_text = input("Enter your Marathi text:\n")
num_sentences = int(input("Enter the desired number of sentences for the summary: "))

In [4]:
preproceed_text = preprocess_text(marathi_text)

In [5]:
# perform_marathi_lemmatization(preproceed_text)

In [6]:
Stemmed_text = perform_marathi_stemming(marathi_text)

In [7]:
Stemmed_text

['विस्तीर्ण पसरलेलं जंगल, जंगल उगम पावलेल आण गावाल वेढ देणारी, आजूबाज डोंगर हिरवाई देणार वाघई नदी, एक बाजूल जंगल तर बाक तीन बाजूं नद आण डोंगरदऱ्य वेढलेलं टुमदार गाव वाघदर',
 'पूर्व मुबलक असण वाघ संख्य हे नाव या नदील आण गावाल लाभलं होतं',
 'गाव एक प्राथमिक शाळा, एक पोस्टा पेट',
 'बाक सार व्यवहार करायल गावकऱ्य २० किम लांब जावं लागत अस',
 'दादाराव पाटील आल्य तश गरज जास्त उरल नव्हत हे खरं',
 'फोन वरून बर काम होत, शिवाय वाड कॉम्प्य होण्यासारख सार सरकार काम होऊन जात होत',
 'वाड म्हणज दादाराव पाटल वाड',
 'जंगलाल लागून असलेल हा वाड म्हणज एक कोडं होतं',
 'वाड्य पसार मोठ म्हणज जवळपास एकर भर',
 'नद आण जंगल आडोशाल जाग पा चाऱ् बाजूल पुरुषभर उं दगड भिंत जंगल बाजूल बांधलेल दोन मजल चौसोप टुमदार वाड',
 'त्याच्यापुढं दगड कुंपण बरोबरीने बांधलेल मोठ मोठ गोठा, तबेला, कुत्र्यांसाठी जाळीदार खोल',
 'आण अजून ८ खोल्य',
 'वाड्य मुख्य दार दोन् बाजूल बैठक खोल्य',
 'बाहेरून येणार जाणार सार लोक आध इथ थांबत आण प्राथमिक चर् इथ होई, आण मग पुढं एन्ट्र',
 'अर्थ गाव कोणत्य रहिवाश्याल वाड्य बिनधोक प्रवेश हो',
 'वाड्य ह्य ब

### Functions

In [8]:
def calculate_bigram_length(sentence):
    # Tokenize the sentence into words
    words = sentence.split()
    
    # Generate bigrams
    bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
    
    # Return the number of bigrams
    return len(bigrams)

In [9]:
def calculate_trigram_length(sentence):
    # Tokenize the sentence into words
    words = sentence.split()
    
    # Generate trigrams
    trigrams = [(words[i], words[i + 1], words[i + 2]) for i in range(len(words) - 2)]
    
    # Return the number of trigrams
    return len(trigrams)

In [10]:
def calculate_tf_isf(sentence):
    # Tokenize the sentence into words
    words = sentence.split()
    
    # Calculate term frequency
    term_freq = {}
    for word in words:
        term_freq[word] = term_freq.get(word, 0) + 1
    
    # Calculate inverse sentence frequency
    inverse_sentence_freq = 1 / len(words)
    
    # Calculate TF-ISF for each term
    tf_isf = {}
    for word, freq in term_freq.items():
        tf_isf[word] = freq * inverse_sentence_freq

    summ = 0
    for val in  tf_isf.values():
        summ += val
    tf_isf = summ/len(words)
    
    return tf_isf

In [11]:
def calculate_sentence_length_factor(sentence, max_sentence_length):
    # Tokenize the sentence into words
    words = sentence.split()
    
    # Calculate the length of the sentence
    sentence_length = len(words)
    
    # Calculate the sentence length factor
    sentence_length_factor = sentence_length / max_sentence_length
    
    return sentence_length_factor

In [12]:
def calculate_numeric_tokens_ratio(sentence):
    # Tokenize the sentence into words
    words = sentence.split()
    
    # Count the number of numeric tokens
    numeric_count = sum(1 for word in words if word.isdigit())
    
    # Calculate the ratio of numeric tokens to total tokens
    if len(words) > 0:
        numeric_tokens_ratio = numeric_count / len(words)
    else:
        numeric_tokens_ratio = 0.0
    
    return numeric_tokens_ratio

In [13]:
def calculate_pos_factor(total_sentences, current_pos):
    """
    Calculate the Position Factor (POS factor) for a given sentence.

    Parameters:
    total_sentences (int): Total number of sentences in the document.
    current_pos (int): Position of the current sentence within the document.

    Returns:
    float: The calculated Position Factor.
    """
    return (total_sentences - current_pos) / total_sentences


In [14]:
def calculate_thematic_number(sentence, keywords):
    """
    Calculate the Thematic Number for a given sentence.

    Parameters:
    sentence (str): The input sentence.
    keywords (list): List of keywords representing main themes or topics of the document.

    Returns:
    float: The calculated Thematic Number.
    """
    # Tokenize the sentence into words
    words = sentence.split()
    
    # Count the number of keywords present in the sentence
    keyword_count = sum(1 for word in words if word in keywords)
    
    # Calculate the ratio of keywords in the sentence to the total number of keywords
    if len(keywords) > 0:
        thematic_number = keyword_count / len(keywords)
    else:
        thematic_number = 0.0
    
    return thematic_number


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def calculate_centroid(preproceed_text):
    """
    Calculate the centroid of the document.

    Parameters:
    preproceed_text (list): List of preprocessed sentences from the document.

    Returns:
    dict: A dictionary representing the centroid of the document.
    """
    # Initialize a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    
    # Fit and transform the preprocessed text to compute TF-IDF vectors
    tfidf_matrix = vectorizer.fit_transform(preproceed_text)
    
   # Compute the centroid vector
    centroid_vector = np.mean(tfidf_matrix, axis=0)

    return centroid_vector

In [16]:
import numpy as np
from scipy.sparse import csr_matrix

def convert_sparse_to_dense_general(sparse_matrix, num_rows, num_cols):
    """
    Convert a sparse matrix to a dense matrix with the specified dimensions.

    Args:
        sparse_matrix (scipy.sparse.csr_matrix): The input sparse matrix.
        num_rows (int): The desired number of rows in the dense matrix.
        num_cols (int): The desired number of columns in the dense matrix.

    Returns:
        numpy.ndarray: A dense matrix with the specified dimensions.
    """
    dense_matrix = np.zeros((num_rows, num_cols), dtype=sparse_matrix.dtype)
    rows, cols = sparse_matrix.nonzero()
    data = sparse_matrix.data

    for row, col, value in zip(rows, cols, data):
        if row < num_rows and col < num_cols:
            dense_matrix[row, col] = value

    return dense_matrix


In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_cosine_similarity(sentence, document_centroid):
    """
    Calculate the cosine similarity between a sentence vector and the document centroid.

    Parameters:
    sentence (str): Sentence text.
    document_centroid (dict): Document centroid represented as a dictionary of TF-IDF values.

    Returns:
    float: Cosine similarity score.
    """
    # Convert document centroid dictionary to a sparse matrix
    document_vector = np.array([document_centroid])
    
    # Vectorize the sentence using the same TF-IDF vectorizer used for the document centroid
    vectorizer = TfidfVectorizer()
    sentence_vector = vectorizer.fit_transform([sentence])
    sentence_vector = convert_sparse_to_dense_general(sentence_vector, 1, 59)
    print(sentence_vector)
    # Compute cosine similarity
    print(document_centroid)
    cosine_sim = cosine_similarity(sentence_vector, document_vector.reshape(1, -1))[0][0]

    return cosine_sim



In [18]:
def calculate_sentence_scores(stemmed_text, total_sentences, document_centroid, max_sentence_length):
    """
    Calculate scores for each sentence based on different metrics.

    Parameters:
    stemmed_text (list): List of preprocessed and stemmed sentences.
    total_sentences (int): Total number of sentences in the document.
    document_centroid (dict): Centroid of the document.
    max_sentence_length (int): Length of the longest sentence in the document.

    Returns:
    dict: Dictionary containing sentence indices as keys and corresponding scores as values.
    """
    sentence_scores = {}

    for idx, sentence in enumerate(stemmed_text):
        try:
        # Calculate POS factor
            pos_factor = calculate_pos_factor(total_sentences, idx + 1)
            # print("pos fact:",pos_factor)

            # Calculate Bigram token length
            bigram_length = calculate_bigram_length(sentence)
            # print("bigram length:",bigram_length)

            # Calculate Trigram token length
            trigram_length = calculate_trigram_length(sentence)
            # print("trigram length:",trigram_length)

            # Calculate TF-ISF vector
            tf_isf = calculate_tf_isf(sentence)
            # print("tf isf:",tf_isf)

            # Calculate cosine similarity
            cosine_similarity_score = calculate_cosine_similarity(sentence, document_centroid)
            # print("cosine similarity:",cosine_similarity_score)

            # Calculate thematic number
            thematic_number = calculate_thematic_number(sentence,[])
            # print("thematic:",thematic_number)

            # Calculate sentence length factor
            sentence_length_factor = calculate_sentence_length_factor(sentence, max_sentence_length)
            # print("sent len fac:",sentence_length_factor)

            # Calculate numeric tokens ratio
            numeric_tokens_ratio = calculate_numeric_tokens_ratio(sentence)
            # print("num tockens ratio:",numeric_tokens_ratio)

            # Calculate sentence score using a combination of metrics
            sentence_score = (
                pos_factor
                + bigram_length
                + trigram_length
                + tf_isf
                + cosine_similarity_score
                + thematic_number
                + sentence_length_factor
                + numeric_tokens_ratio
            )
        except: 
            sentence_score = 0
        # Store the sentence score
        sentence_scores[idx] = sentence_score

    return sentence_scores


In [19]:
def generate_summary(original_text, stemmed_text, sentence_scores, num_sentences=5):
    """
    Generate a summary based on the calculated sentence scores.

    Parameters:
    stemmed_text (list): List of preprocessed and stemmed sentences.
    sentence_scores (dict): Dictionary containing sentence indices as keys and corresponding scores as values.
    num_sentences (int): Number of sentences to include in the summary. Default is 5.

    Returns:
    str: The generated summary.
    """
    # Sort the sentences based on scores (in descending order)
    sorted_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Select the top num_sentences sentences for the summary
    selected_sentences = sorted_sentences[:num_sentences]
    
    # Sort selected sentences based on their original order
    selected_sentences.sort(key=lambda x: x[0])
    doc = original_text.split(".")
    # Generate the summary by joining selected sentences
    summary = ' '.join(doc[idx] for idx, _ in selected_sentences)
    
    return summary


In [20]:
# Calculate total sentences
total_sentences = len(Stemmed_text)

# Calculate document centroid
document_centroid = calculate_centroid(Stemmed_text)

# Calculate maximum sentence length
max_sentence_length = max(len(sentence.split()) for sentence in Stemmed_text)

# Calculate sentence scores
sentence_scores = calculate_sentence_scores(Stemmed_text, total_sentences, document_centroid, max_sentence_length)
Original_text = marathi_text
# Generate summary
summary = generate_summary(Original_text, Stemmed_text, sentence_scores, num_sentences)
print(summary)

[[0.18257419 0.36514837 0.18257419 0.18257419 0.18257419 0.18257419
  0.54772256 0.18257419 0.18257419 0.18257419 0.18257419 0.36514837
  0.18257419 0.18257419 0.18257419 0.18257419 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]
[[0.00636538 0.0035694  0.01760081 0.01122459 0.01158384 0.00776113
  0.0068074  0.01064379 0.03999332 0.00975326 0.03715479 0.01240861
  0.00382821 0.00313242 0.00598133 0.00459116 0.07526411 0.00531538
  0.00466182 0.00896102 0.00703616 0.01852842 0.01936577 0.02413852
  0.0037661  0.00703616 0.00471018 0.00452794 0.01481375 0.00856602
  0.01490902 0.00313242 0.00338966 0.00599074

[[0.00636538 0.0035694  0.01760081 0.01122459 0.01158384 0.00776113
  0.0068074  0.01064379 0.03999332 0.00975326 0.03715479 0.01240861
  0.00382821 0.00313242 0.00598133 0.00459116 0.07526411 0.00531538
  0.00466182 0.00896102 0.00703616 0.01852842 0.01936577 0.02413852
  0.0037661  0.00703616 0.00471018 0.00452794 0.01481375 0.00856602
  0.01490902 0.00313242 0.00338966 0.00599074 0.00861659 0.00629489
  0.00588681 0.01725107 0.06849769 0.01054967 0.00649971 0.0037661
  0.00428301 0.00583387 0.0066539  0.00506462 0.0036401  0.00419784
  0.00527877 0.00452794 0.00466182 0.00672082 0.00359998 0.00527877
  0.00790488 0.00568599 0.00553924 0.01695476 0.01082391 0.0035694
  0.00861284 0.00452794 0.01884268 0.00338966 0.01039247 0.00932957
  0.00419784 0.0133838  0.00588681 0.00313242 0.03105608 0.0068074
  0.00313242 0.0059264  0.00542881 0.02307171 0.00447125 0.00886028
  0.00674898 0.00672022 0.0035694  0.00776113 0.00389646 0.00466182
  0.0036401  0.00389646 0.0061117  0.01150563 0.005

In [21]:
# # # Summarize the text
# summary = generate_summary(Original_text, Stemmed_text, sentence_scores, num_sentences)
# print("Summary: ",summary)

In [28]:
import PySimpleGUI as sg
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect

# Define the layout of the GUI
layout = [
    [sg.Text('Enter your Marathi text:')],
    [sg.Multiline(key='-TEXT-', size=(600, 10))],
    [sg.Text('Enter the desired number of sentences for the summary (default is 5):')],
    [sg.InputText(key='-NUM_SENTENCES-', default_text='5'), sg.Button('Generate Summary'), sg.Button('Reset'), sg.Button('Exit')],
    [sg.Text(size=(65, 20), key='-OUTPUT-')],
]

# Create the window
window = sg.Window('Marathi Text Summarizer', layout, size=(600,600))

# Define the functions needed for summarization
# (Code for these functions remains the same as in the previous example)

# Event loop
while True:
    event, values = window.read()
    if event == sg.WINDOW_CLOSED or event == 'Exit':
        break
    elif event == 'Generate Summary':
        # Get input values
        marathi_text = values['-TEXT-']
        num_sentences = int(values['-NUM_SENTENCES-']) if values['-NUM_SENTENCES-'] else 5
        
        # Detect the language of the input text
        try:
            language = detect(marathi_text)
        except:
            language = ''
        
        # Check if the detected language is Marathi
        if language == 'mr':
            # Preprocess text
            preprocessed_text = preprocess_text(marathi_text)
            stemmed_text = perform_marathi_stemming(marathi_text)
            
            # Calculate total sentences
            total_sentences = len(stemmed_text)
            
            # Calculate document centroid
            document_centroid = calculate_centroid(stemmed_text)
            
            # Calculate maximum sentence length
            max_sentence_length = max(len(sentence.split()) for sentence in stemmed_text)
            
            # Calculate sentence scores
            sentence_scores = calculate_sentence_scores(stemmed_text, total_sentences, document_centroid, max_sentence_length)
            
            # Generate summary
            summary = generate_summary(marathi_text, stemmed_text, sentence_scores, num_sentences)
            
            # Update output in GUI
            window['-OUTPUT-'].update(summary)
        else:
            # Display error message and reset button
            window['-OUTPUT-'].update('Only Marathi text is allowed. Please enter Marathi text.')
            layout[4] = [sg.Button('Reset')]
            window['-TEXT-'].update('')
            window['-NUM_SENTENCES-'].update('5')
    elif event == 'Reset':
        # Clear input and output fields
        window['-TEXT-'].update('')
        window['-NUM_SENTENCES-'].update('5')
        window['-OUTPUT-'].update('')

# Close the window
window.close()


[[0.18257419 0.36514837 0.18257419 0.18257419 0.18257419 0.18257419
  0.54772256 0.18257419 0.18257419 0.18257419 0.18257419 0.36514837
  0.18257419 0.18257419 0.18257419 0.18257419 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]
[[0.00636538 0.0035694  0.01760081 0.01122459 0.01158384 0.00776113
  0.0068074  0.01064379 0.03999332 0.00975326 0.03715479 0.01240861
  0.00382821 0.00313242 0.00598133 0.00459116 0.07526411 0.00531538
  0.00466182 0.00896102 0.00703616 0.01852842 0.01936577 0.02413852
  0.0037661  0.00703616 0.00471018 0.00452794 0.01481375 0.00856602
  0.01490902 0.00313242 0.00338966 0.00599074