## Import all Required Libraries

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samyukta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samyukta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samyukta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Reading Data & Preprocessing

Stop Words & Lemmatization 


In [3]:
# Specify the path to your csv file
# (relative path: resolved against the notebook's current working directory)
file_path = "../data/merged_email_data.csv"
# Load the whole CSV into a DataFrame; later cells read the
# 'thread_id', 'body' and 'summary' columns from it.
df = pd.read_csv(file_path)

In [4]:
# Preview the first two rows to inspect the loaded columns
df.head(2)

Unnamed: 0,thread_id,summary,subject,timestamp,from,to,body
0,1,The email thread discusses the Master Terminat...,FW: Master Termination Log,2002-01-29 11:23:42,"Gossett, Jeffrey C. JGOSSET","['Giron', 'Darron C. Dgiron', 'Love', 'Phillip...",\r\n\r\n -----Original Message-----\r\nFrom: =...
1,1,The email thread discusses the Master Terminat...,FW: Master Termination Log,2002-01-31 12:50:00,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Gossett', 'Jeff...",\r\n\r\n -----Original Message-----\r\nFrom: =...


In [5]:
# Collapse every thread into a single document: the 'body' values of all rows
# sharing a thread_id are concatenated with a single space between them.
combined_df = (
    df.groupby('thread_id')['body']
    .apply(' '.join)
    .reset_index(name='combined_text')
)
combined_df

Unnamed: 0,thread_id,combined_text
0,1,\r\n\r\n -----Original Message-----\r\nFrom: =...
1,2,I'll be there... I will attend. Suzanne:\r\nHe...
2,3,"Hey there; \r\n""Do you know who your ""big toe""..."
3,4,thanks for the update.\r\nPL that is ok. Than...
4,5,I think you can send it just so he has the for...
...,...,...
4162,4163,FYI.\r\n---------------------- Forwarded by Ka...
4163,4164,Can you send him a hard copy (He is w Constell...
4164,4165,"I don't see you on MSN, but I am on the phone ..."
4165,4166,I'm planning to be out August 28 - Sept 1. I a...


In [6]:
# Attach each thread's reference summary to its concatenated text.
# The (thread_id, summary) pairs are de-duplicated first because `df` holds one
# row per message, so a thread's summary can appear on several rows.
thread_summaries = df[['thread_id', 'summary']].drop_duplicates()

# Drop any `summary` column left over from a previous run of this cell before
# merging. Without this, re-running the cell would produce `summary_x` /
# `summary_y` columns and later `combined_df['summary']` lookups would fail.
combined_df = (
    combined_df
    .drop(columns='summary', errors='ignore')
    .merge(thread_summaries, on='thread_id', how='left')
)
combined_df

Unnamed: 0,thread_id,combined_text,summary
0,1,\r\n\r\n -----Original Message-----\r\nFrom: =...,The email thread discusses the Master Terminat...
1,2,I'll be there... I will attend. Suzanne:\r\nHe...,A lunch meeting has been scheduled for May 5th...
2,3,"Hey there; \r\n""Do you know who your ""big toe""...",Ben is updating a friend on his progress with ...
3,4,thanks for the update.\r\nPL that is ok. Than...,The recipient of the email thread initially ex...
4,5,I think you can send it just so he has the for...,The email thread discusses the long form confi...
...,...,...,...
4162,4163,FYI.\r\n---------------------- Forwarded by Ka...,Peter Thompson has sent a memo to Kay Mann and...
4163,4164,Can you send him a hard copy (He is w Constell...,The email thread revolves around the sharing a...
4164,4165,"I don't see you on MSN, but I am on the phone ...",Susan asks Emily about her plans for the weeke...
4165,4166,I'm planning to be out August 28 - Sept 1. I a...,Several employees will be on vacation during d...


In [61]:
# Pull the per-thread text column out of the DataFrame as a plain Python list
email_list = combined_df['combined_text'].tolist()
email_list[0]

'\r\n\r\n -----Original Message-----\r\nFrom: =09Theriot, Kim S. =20\r\nSent:=09Tuesday, January 29, 2002 1:23 PM\r\nTo:=09Richardson, Stacey; Anderson, Diane; Gossett, Jeffrey C.; White, Stac=\r\ney W.; Murphy, Melissa; Hall, D. Todd; Sweeney, Kevin\r\nCc:=09Aucoin, Evelyn; Baxter, Bryce; Wynne, Rita\r\nSubject:=09FW: Master Termination Log\r\n\r\n\r\n\r\n -----Original Message-----\r\nFrom: =09Panus, Stephanie =20\r\nSent:=09Tuesday, January 29, 2002 11:39 AM\r\nTo:=09Adams, Laurel; Alonso, Tom; Aronowitz, Alan; Bailey, Susan; Balfour-F=\r\nlanagan, Cyndie; Baughman, Edward; Belden, Tim; Bishop, Serena; Brackett, D=\r\nebbie R.; Bradford, William S.; Browning, Mary Nell; Bruce, James; Bruce, M=\r\nichelle; Bruce, Robert; Buerkle, Jim; Calger, Christopher F.; Carrington, C=\r\nlara; Considine, Keith; Cordova, Karen A.; Crandall, Sean; Cutsforth, Diane=\r\n; Diamond, Russell; Dunton, Heather; Edison, Susan; Elafandi, Mo; Fischer, =\r\nMark; Flores, Nony; Fondren, Mark; Gorny, Vladimir;

In [8]:
# Define a function for preprocessing with lemmatization

# Built once when this cell runs instead of on every call: neither the
# stop-word list nor the lemmatizer depends on the text being processed.
_STOP_WORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_with_lemmatization(text):
    """Remove English stop words from ``text`` and lemmatize what remains.

    Steps:
      1. Split ``text`` into tokens with ``word_tokenize``.
      2. Drop every token whose lower-cased form is an NLTK English stop word.
      3. Lemmatize each remaining token as a verb (``pos='v'``).
      4. Join the results with single spaces.

    Punctuation tokens are not filtered and the original casing of kept tokens
    is passed to the lemmatizer unchanged.

    Parameters
    ----------
    text : str
        Raw text of one e-mail thread.

    Returns
    -------
    str
        The space-joined, stop-word-free, lemmatized tokens.
    """
    # Tokenize, then keep only tokens that are not stop words
    kept_words = [
        word for word in word_tokenize(text)
        if word.lower() not in _STOP_WORDS
    ]

    # Lemmatize (as verbs) and join back into a single string
    return " ".join(_LEMMATIZER.lemmatize(word, pos='v') for word in kept_words)

In [9]:
# Run every thread's text through the stop-word removal / lemmatization step,
# keeping the same order as email_list
preprocessed_emails = list(map(preprocess_with_lemmatization, email_list))

In [60]:
# Show the first preprocessed thread to eyeball the effect of preprocessing
preprocessed_emails[0]

"-- -- -Original Message -- -- - : =09Theriot , Kim S. =20 Sent : =09Tuesday , January 29 , 2002 1:23 PM : =09Richardson , Stacey ; Anderson , Diane ; Gossett , Jeffrey C. ; White , Stac= ey W. ; Murphy , Melissa ; Hall , D. Todd ; Sweeney , Kevin Cc : =09Aucoin , Evelyn ; Baxter , Bryce ; Wynne , Rita Subject : =09FW : Master Termination Log -- -- -Original Message -- -- - : =09Panus , Stephanie =20 Sent : =09Tuesday , January 29 , 2002 11:39 : =09Adams , Laurel ; Alonso , Tom ; Aronowitz , Alan ; Bailey , Susan ; Balfour-F= lanagan , Cyndie ; Baughman , Edward ; Belden , Tim ; Bishop , Serena ; Brackett , D= ebbie R. ; Bradford , William S. ; Browning , Mary Nell ; Bruce , James ; Bruce , M= ichelle ; Bruce , Robert ; Buerkle , Jim ; Calger , Christopher F. ; Carrington , C= lara ; Considine , Keith ; Cordova , Karen A. ; Crandall , Sean ; Cutsforth , Diane= ; Diamond , Russell ; Dunton , Heather ; Edison , Susan ; Elafandi , Mo ; Fischer , = Mark ; Flores , Nony ; Fondren , Mark ; G

## Modelling

TF-IDF
Clustering

In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')  # Ensure the 'punkt' tokenizer models are downloaded

# Upper bound on the number of sentences kept in each extractive summary
MAX_SUMMARY_SENTENCES = 5

# Tokenize each email into sentences
emails_sentences = [sent_tokenize(email) for email in preprocessed_emails]

# Flatten the list of sentences across all emails for TF-IDF
all_sentences = [sentence for sentences in emails_sentences for sentence in sentences]

# Fit the TF-IDF vocabulary / IDF weights on every sentence of the corpus
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_sentences)

# One extractive summary per email, in the same order as preprocessed_emails
summary_list = []

for sentences in emails_sentences:
    # An email with no sentences gets an empty summary so that summary_list
    # stays index-aligned with preprocessed_emails.
    if not sentences:
        summary_list.append('')
        continue

    # TF-IDF representation of this email's sentences (one row per sentence)
    sentence_tfidf = tfidf_vectorizer.transform(sentences)

    # Mean TF-IDF weight of each sentence over the whole vocabulary.
    # Computed on the sparse matrix directly: converting to a dense
    # (n_sentences x vocabulary) array just to average each row is unnecessary.
    avg_tfidf_scores = np.asarray(sentence_tfidf.mean(axis=1)).ravel()

    # Keep the highest-scoring sentences (all of them if the email is short)
    num_sentences = min(MAX_SUMMARY_SENTENCES, len(sentences))
    top_sentence_indices = np.argsort(avg_tfidf_scores)[-num_sentences:]

    # Emit the selected sentences in their original order within the email
    summary = ' '.join(sentences[idx] for idx in sorted(top_sentence_indices))
    summary_list.append(summary)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samyukta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Evaluation

In [58]:
# Reference summaries as a plain list, in the same row order as combined_df
original_summ = combined_df['summary'].tolist()
original_summ[0]

"The email thread discusses the Master Termination Log and the need to investigate a CNG LDC (Hope Gas) termination and a $66 million settlement offer. Stephanie Panus sends out the Daily List and Master Termination Log for various dates. Kim Theriot requests her name and Melissa Murphy's name to be removed from the distribution list and adds several names to it. The thread also includes updates on terminations and valid terminations for various companies."

In [55]:
# Import necessary libraries
import sys
from rouge import Rouge
import time

# Adjusting the recursion limit to prevent RecursionError during deep comparisons
sys.setrecursionlimit(3000)

# Due to Recursion Error, splitting into chunks & increased limit
iters = len(original_summ)//100 # Number of iterations needed, based on data size
total_scores = [] # List to hold all calculated ROUGE scores

# Iterate through chunks of summaries to compute ROUGE scores
for i in range(iters):
    # Compute the scores
    try:
        rouge = Rouge() # Initialize the Rouge scoring object
        # Calculate ROUGE scores for the current chunk
        scores = rouge.get_scores(summary_list[100*i:100*(i+1)], original_summ[100*i:100*(i+1)], avg=True)
        total_scores.append(scores) # Append the scores for this chunk to the total scores list
    except RecursionError:
        time.sleep(10) # Wait for 10 seconds and continue, to handle recursion limit issues
        continue 

# Handle any remaining summaries after the chunked processing    
rouge = Rouge() # Re-initialize the Rouge scoring object
scores = rouge.get_scores(summary_list[100*(i+1):], original_summ[100*(i+1):], avg=True)
total_scores.append(scores)

# Initialize a dictionary to calculate average ROUGE scores
avg_scores = {'rouge-1': {'r': 0, 'p': 0, 'f': 0},
                'rouge-2': {'r': 0, 'p': 0, 'f': 0},
                'rouge-l': {'r': 0, 'p': 0, 'f': 0}}

# Accumulate scores from each chunk to compute the average
for score in total_scores:
    for metric in score.keys():  
        avg_scores[metric]['r'] += score[metric]['r']
        avg_scores[metric]['p'] += score[metric]['p']
        avg_scores[metric]['f'] += score[metric]['f']

# Calculate the average scores by dividing by the number of chunks (i+2 accounts for the final partial chunk)
for metric in avg_scores:
    avg_scores[metric]['r'] /= (i+2)
    avg_scores[metric]['p'] /= (i+2)
    avg_scores[metric]['f'] /= (i+2)

# Print the average ROUGE scores
print(avg_scores)

{'rouge-1': {'r': 0.18422990676946885, 'p': 0.15169145874113643, 'f': 0.15740302866728578}, 'rouge-2': {'r': 0.03727749125950121, 'p': 0.02848241253037934, 'f': 0.02992919383943061}, 'rouge-l': {'r': 0.17539146865888927, 'p': 0.14451540830137063, 'f': 0.14986437825085558}}


In [63]:
# Import the BERTScore score function
from bert_score import score

# Evaluating BERTScore for a subset of 100 email summaries due to computational constraints
# summary_list contains the generated summaries
# original_summ contains the reference summaries
# We select a random subset (for example, the 600th to 700th summaries) for evaluation
P, R, F1 = score(summary_list[600:700], original_summ[600:700], lang="en", model_type="bert-base-uncased")

# Calculate the average of the precision, recall, and F1 score tensors
# .mean().item() computes the mean of the tensor and converts it to a Python float
avg_precision = P.mean().item()
avg_recall = R.mean().item()
avg_f1 = F1.mean().item()

# Print the average BERTScore metrics for the evaluated subset
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1:", avg_f1)

Average Precision: 0.3720662593841553
Average Recall: 0.46384432911872864
Average F1: 0.410055547952652


In [1]:
# Print the version of the installed bert_score package
import bert_score
print(bert_score.__version__)

  from .autonotebook import tqdm as notebook_tqdm


0.3.12
