In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samyukta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samyukta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samyukta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Specify the path to your JSON file
file_path = "../data/output/data.json"

# Open the JSON file and load its contents
with open(file_path, 'r') as file:
    email_list = json.load(file)

In [5]:
# Define a function for preprocessing with lemmatization
def preprocess_with_lemmatization(text):
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word.lower() not in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]  # Lemmatize verbs
    
    # Join the lemmatized words back into text
    preprocessed_text = " ".join(lemmatized_words)
    
    return preprocessed_text

In [6]:
# Preprocess the emails using lemmatization
preprocessed_emails = [preprocess_with_lemmatization(email) for email in email_list]

In [7]:
# Calculating TF-IDF for each email
generated_list = []
for i in range(len(preprocessed_emails)):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([preprocessed_emails[i],])

    # Compute cosine similarity between sentences
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Create sentence scores based on TF-IDF scores
    sentence_scores = cosine_sim[0]

    # Sort the sentence scores
    sorted_indices = np.argsort(sentence_scores)

    # Number of sentences in the summary
    num_sentences = 5

    # Extract top sentences as the summary
    top_sentence_indices = sorted_indices[-num_sentences:]
    # Extract top sentences using the sorted indices
    summary = ' '.join([preprocessed_emails[i].split('.')[idx] for idx in top_sentence_indices])

    # Print the original document and the summary
    print("Original Document:")
    print(email_list[i])

    print("\nSummarized Text:")
    generated_list.append(summary)
    print(summary)


Original Document:
Uncover your surprise springtime savings today!

To view the html graphic version of this message, please copy and paste the link below into your browser:
http://l.em.disneydebit.com/rts/go2.aspx?t=371661&tp=i-1NGB-DG-7u6-rkzO5-1n-318pE-1c-rdwep-lA6RAbMS3B-IvqO&pi=8Y640LqX2Icv1QAy0zTeQHle-81TYwfgOvYMocKnDJQ&x=5778bf70154f43651af115ca8b80e4ff&hp2=25b68808335e2409b8306afbecd8428393cb5594e9eaf2f26580edfc99913245

Offer made by Disney. JPMorgan Chase Bank, N.A. and its affiliates are not affiliates of Disney and are not responsible for offer fulfillment.

For all inquiries regarding the Disney Debit Card, please visit DisneyDebit.com.
http://l.em.disneydebit.com/rts/go2.aspx?t=371662&tp=i-1NGB-DG-7u6-rkzO5-1n-318pE-1c-rdwep-lA6RAbMS3B-IvqO&pi=8Y640LqX2Icv1QAy0zTeQHle-81TYwfgOvYMocKnDJQ&x=20240301&hp2=94a8c989e287e34057a32357d2f311f7dd9dc5346b88b69a5a455d77ca26c532

This email was sent to: pallesaisamyukta@gmail.com

Please do not reply to this email as this address is no

## Evaluating

In [8]:
# from rouge import Rouge

In [9]:
# generated_sum = generated_list[0]
# original_text = email_list[0]

# # Initialize the Rouge object
# rouge = Rouge()

# # Compute ROUGE scores
# scores = rouge.get_scores(generated_sum, original_text)

# # Print ROUGE scores
# print(scores)

In [10]:
from evaluation import rouge_metric

In [11]:
average_scores = rouge_metric(email_list, generated_list)

In [12]:
average_scores

{'rouge-1': {'r': 0.06511441997087261,
  'p': 0.4279779727764925,
  'f': 0.10918285025316005},
 'rouge-2': {'r': 0.01806927564045425,
  'p': 0.15088988627376695,
  'f': 0.03097445707538867},
 'rouge-l': {'r': 0.06511441997087261,
  'p': 0.4279779727764925,
  'f': 0.10918285025316005}}