<a href="https://colab.research.google.com/github/muniprasadreddy/my-colab_work/blob/main/%E2%80%9CEnron_Email_Generative_AI_tasks_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Problem Statement:
In this task, you will use the “Enron Email Dataset” to build a system that can either summarize
long email threads or generate responses to common emails. The goal is to explore the capabilities
of a generative language model to handle everyday email tasks.

**Objective:**
• Create a pipeline using a pre-trained language model to perform one of the following tasks:
1. Summarize long email threads.
2. Generate automated responses to common email types.

# **Dataset Exploration & Preprocessing:**

## Import Libraries

In [None]:
import pandas as pd
import csv
import re
from io import StringIO
import email
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# import pandas as pd
# import csv

# # Load dataset
# df = pd.read_csv("/content/emails.csv", quoting=csv.QUOTE_NONE, error_bad_lines=False)
# # quoting=csv.QUOTE_NONE: Treats all characters as data, ignoring quotes
# # error_bad_lines=False: Skips lines that cause parsing errors

In [None]:

# Load dataset
# Reads the raw Enron CSV from the Colab filesystem into `df`.
df = pd.read_csv("/content/emails.csv", quoting=csv.QUOTE_NONE, on_bad_lines='skip')
# quoting=csv.QUOTE_NONE: Treats all characters as data, ignoring quotes
# on_bad_lines='skip': Skips lines that cause parsing errors
# NOTE(review): with QUOTE_NONE the column labels keep their literal quotes
# ('"file"', '"message"') and the frame ends up with ~27M rows whose cells hold
# single header lines (see the head()/shape outputs below), i.e. one e-mail is
# spread over many rows. Default quoting would presumably keep each message in
# one cell -- confirm, and update the '"message"' references in later cells
# together with any change here.

In [None]:
# Preview the dataset
df.head()

Unnamed: 0,"""file""","""message"""
0,"""allen-p/_sent_mail/1.""","""Message-ID: <18782981.1075855378110.JavaMail...."
1,Date: Mon,14 May 2001 16:39:00 -0700 (PDT)
2,From: phillip.allen@enron.com,
3,To: tim.belden@enron.com,
4,Subject:,


In [None]:
df.tail()

Unnamed: 0,"""file""","""message"""
27277595,affiliates) and the intended recipient or any ...,and may not be
27277596,relied on by anyone as the basis of a contract...,
27277597,Thank you.,
27277598,**********************************************...,
27277599,"""",


In [None]:
df.shape

(27277600, 2)

In [None]:
print(df.columns)

Index(['"file"', '"message"'], dtype='object')


In [None]:
print(df.loc[57282]['"message"'])

nan


In [None]:
print(df.iloc[102370,1])

"Message-ID: <2154371.1075855718388.JavaMail.evans@thyme>


In [None]:
# transform the email into correct format
# Parse one '"message"' cell with the stdlib e-mail parser to inspect its headers.
message = df.loc[102370]['"message"']
e = email.message_from_string(message)

# NOTE(review): the output below lists a single header whose name still carries
# the leading quote ('"Message-ID'), so lookups such as e.get('Date') find nothing
# for this row.
e.items()

[('"Message-ID', '<2154371.1075855718388.JavaMail.evans@thyme>')]

In [None]:
# get date
e.get('Date')

In [None]:
# show message body
e.get_payload()

''

**Extract Message Body**

In [None]:
# def get_field(field, messages):
#     column = []
#     for message in messages:
#         e = email.message_from_string(message)
#         value = e.get(field)
#         if value is not None:
#             value = value.replace("\n", " ").replace("\r", "")
#         column.append(value)
#     return column

In [None]:
import email
from io import StringIO

def get_field(field, messages):
    """Return the value of header `field` for every raw message in `messages`.

    Each element is coerced to `str` when it is not one already, parsed with
    `email.message_from_string`, and looked up by header name. A missing header
    yields None; a present one is returned with line feeds turned into spaces
    and carriage returns removed. The result is a list in input order.
    """
    def _header_value(raw):
        text = raw if isinstance(raw, str) else str(raw)
        found = email.message_from_string(text).get(field)
        if found is None:
            return None
        return found.replace("\n", " ").replace("\r", "")

    return [_header_value(raw) for raw in messages]

In [None]:
# Standard routing headers, stored under lower-case column names.
for column_name, header_name in (
    ("date", "Date"),
    ("from", "From"),
    ("to", "To"),
    ("subject", "Subject"),
):
    df[column_name] = get_field(header_name, df['"message"'])


In [None]:
# Enron-specific display-name headers; the column keeps the header's own spelling.
for header_name in ("X-From", "X-To", "X-cc", "X-bcc"):
    df[header_name] = get_field(header_name, df['"message"'])


In [None]:
# Mailbox bookkeeping headers; the column keeps the header's own spelling.
for header_name in ("X-Folder", "X-Origin", "X-FileName"):
    df[header_name] = get_field(header_name, df['"message"'])

In [None]:
df.head()

Unnamed: 0,"""file""","""message""",date,from,to,subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName
0,"""allen-p/_sent_mail/1.""","""Message-ID: <18782981.1075855378110.JavaMail....",,,,,,,,,,,
1,Date: Mon,14 May 2001 16:39:00 -0700 (PDT),,,,,,,,,,,
2,From: phillip.allen@enron.com,,,,,,,,,,,,
3,To: tim.belden@enron.com,,,,,,,,,,,,
4,Subject:,,,,,,,,,,,,


In [None]:


def body(messages):
    """Extract the text body of every raw message in `messages`.

    Parameters
    ----------
    messages : iterable
        Raw RFC 822 message strings. Non-string entries (e.g. NaN cells) are
        tolerated.

    Returns
    -------
    list
        One entry per input, in order: the body with line feeds replaced by
        spaces and carriage returns removed, or None for non-string input.
    """
    column = []
    for message in messages:
        # Non-string cells (e.g. float NaN) carry no message to parse.
        if not isinstance(message, str):
            column.append(None)
            continue

        parsed = email.message_from_string(message)
        if parsed.is_multipart():
            # get_payload() on a multipart message returns a list of sub-messages,
            # which has no .replace(); gather the text leaves instead.
            pieces = [
                part.get_payload()
                for part in parsed.walk()
                if not part.is_multipart() and part.get_content_maintype() == "text"
            ]
            text = "\n".join(piece for piece in pieces if isinstance(piece, str))
        else:
            text = parsed.get_payload()

        if text is not None:
            text = text.replace("\n", " ").replace("\r", "")
        column.append(text)
    return column

# Store the flattened message text next to the extracted header columns.
df['body'] = body(df['"message"'])

In [None]:
df.head(3)

In [None]:
df.columns

In [None]:
# Column labels are case-sensitive: the header columns were created as 'X-To' / 'X-From'.
email_df = df[['to', 'from', 'X-To', 'X-From', 'body']]

In [None]:
# The extracted header columns carry no quote characters and are spelled 'X-To' / 'X-From'.
email_df = df[['to', 'from', 'X-To', 'X-From', 'body']]

In [None]:
# Select the addressing columns and the body ('X-To' / 'X-From' as created by get_field).
email_df = df[['to', 'from', 'X-To', 'X-From', 'body']]

 **Filter Emails for Long Threads and Common Topics**

In [None]:
# 'body' holds the email text. The dump has no thread identifier, so one is
# derived from the subject line: replies and forwards ("RE:", "FW:", "Fwd:")
# share the normalised subject of the message they answer.
subject_norm = (
    df['subject']
    .fillna('')
    .str.replace(r'^\s*((re|fw|fwd)\s*:\s*)+', '', case=False, regex=True)
    .str.strip()
    .str.lower()
)
# Messages without a usable subject get no thread id (NaN) and are never grouped.
df['thread_id'] = subject_norm.where(subject_norm != '')

# Filter threads with 3 or more emails
# Count emails per thread_id
thread_counts = df['thread_id'].value_counts()
long_threads = thread_counts[thread_counts >= 3].index

# Filter emails that belong to long threads; .copy() so that columns added
# later do not write into a view of `df`.
long_thread_emails = df[df['thread_id'].isin(long_threads)].copy()


## **Filter by Common Topics**

In [None]:
# Define keywords to filter by subject or body
keywords = ['meeting', 'project update', 'schedule', 'report']

# Build the alternation once instead of once per column.
keyword_pattern = '|'.join(keywords)

# Filter based on subject or body containing keywords (missing text never matches)
mentions_keyword = (
    df['subject'].str.contains(keyword_pattern, case=False, na=False)
    | df['body'].str.contains(keyword_pattern, case=False, na=False)
)

# .copy() gives an independent frame, so adding columns to it later does not
# trigger pandas' chained-assignment warning.
common_topic_emails = df[mentions_keyword].copy()


## **Clean Email Text**

In [None]:
_ENGLISH_STOPWORDS = None  # loaded on first use by clean_email_text


def clean_email_text(text):
    """Strip addresses, quoted/forwarded material, sign-offs and stop words.

    Parameters
    ----------
    text : str or None/NaN
        Email body. Anything that is not a string is treated as empty.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' for non-string input).
    """
    global _ENGLISH_STOPWORDS

    # Bodies that could not be extracted are stored as None/NaN; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    # Reading the stop-word corpus for every token is very slow; load it once
    # and keep it as a set for O(1) membership tests.
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove forwarded messages and replies: everything from a separator or a
    # quoted header keyword to the end of that line.
    # NOTE(review): bodies produced by body() have their newlines replaced by
    # spaces, so "end of line" is the end of the whole text -- confirm this
    # truncation is intended.
    text = re.sub(r'(-{2,}|_{2,}|From:|To:|Sent:|Subject:).*', '', text, flags=re.MULTILINE)

    # Remove signatures or greetings at the end
    text = re.sub(r'(Best regards|Kind regards|Sincerely|Regards|Thank you|Cheers).*', '', text, flags=re.IGNORECASE)

    # Tokenize and remove stop words
    words = [word for word in word_tokenize(text) if word.lower() not in _ENGLISH_STOPWORDS]

    return ' '.join(words)

# Apply cleaning function to the body of emails in long threads and common topics.
# .assign returns a new frame, so nothing is written into a slice of `df`.
long_thread_emails = long_thread_emails.assign(
    cleaned_body=long_thread_emails['body'].apply(clean_email_text)
)
common_topic_emails = common_topic_emails.assign(
    cleaned_body=common_topic_emails['body'].apply(clean_email_text)
)


## Combine Cleaned Emails and Save


In [None]:
# Combine cleaned datasets
processed_emails = pd.concat([long_thread_emails, common_topic_emails])

# An email can be in a long thread AND match a keyword; both frames keep the
# row labels of `df`, so a repeated label is the same email -- keep one copy.
processed_emails = processed_emails[~processed_emails.index.duplicated(keep='first')]

# No 'message_id' column was extracted earlier; take it from the Message-ID header.
processed_emails = processed_emails.assign(
    message_id=get_field("Message-ID", processed_emails['"message"'])
)

# Select only the necessary columns for further processing
processed_emails = processed_emails[['message_id', 'thread_id', 'subject', 'cleaned_body']]

# Save to CSV
processed_emails.to_csv("processed_enron_emails.csv", index=False)


In [None]:
pip install transformers torch


**Import the Model and Tokenizer**

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load T5 model and tokenizer
# NOTE: `model` and `tokenizer` are rebound to GPT-2 by a later cell of this
# notebook; re-run this cell before re-running the summarization cells.
model = T5ForConditionalGeneration.from_pretrained("t5-small")  # or "t5-base" for larger model
tokenizer = T5Tokenizer.from_pretrained("t5-small")



**Prepare and Preprocess Email Threads for Summarization**

In [None]:
import pandas as pd

# Load the processed email dataset
email_data = pd.read_csv("processed_enron_emails.csv")

# Preview the dataset
email_data.head()


Concatenate Email Threads into Single Text Blocks

In [None]:
# Combine emails by 'thread_id' to create a single text for each thread.
# Bodies that were cleaned down to '' are read back from the CSV as NaN, and
# ' '.join raises on a float, so drop them first.
email_threads = (
    email_data
    .dropna(subset=['cleaned_body'])
    .groupby('thread_id')['cleaned_body']
    .apply(' '.join)
    .reset_index()
)

# Preview combined threads
email_threads.head()


Summarize Each Email Thread

In [None]:
def summarize_text(text, model, tokenizer, max_input_length=512, max_output_length=150, min_output_length=50):
    """Summarize `text` with a seq2seq model using beam search.

    Parameters
    ----------
    text : str
        Text to summarize; the "summarize: " task prefix is prepended.
    model, tokenizer
        Objects exposing `generate` / `encode` + `decode`.
    max_input_length : int
        The encoded input is truncated to this many tokens.
    max_output_length : int
        Upper bound on the generated length.
    min_output_length : int
        Lower bound on the generated length; clamped to `max_output_length`
        so that a small maximum never yields an infeasible length range.

    Returns
    -------
    str
        The decoded summary with special tokens removed.
    """
    # Encode input text and limit to max_input_length tokens
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # Generate summary with the model
    summary_ids = model.generate(
        inputs,
        max_length=max_output_length,
        min_length=min(min_output_length, max_output_length),
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Decode the summary back into text
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize every thread in turn and keep the results in a new 'summary' column.
email_threads['summary'] = [
    summarize_text(thread_text, model, tokenizer)
    for thread_text in email_threads['cleaned_body']
]


 Evaluate Summaries

In [None]:
# Display summaries for review.
# head(5) yields at most five rows, so this also works with fewer than five threads.
for _, thread in email_threads.head(5).iterrows():
    print(f"Thread ID: {thread['thread_id']}")
    print(f"Original Text: {thread['cleaned_body'][:500]}")  # Preview first 500 characters
    print(f"Summary: {thread['summary']}")
    print("-" * 80)


In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores for a sample of summaries (reference text first, summary second).
# head(5) yields at most five rows, so this also works with fewer than five threads.
for _, thread in email_threads.head(5).iterrows():
    scores = scorer.score(thread['cleaned_body'], thread['summary'])
    print(f"Thread ID: {thread['thread_id']}")
    print(f"ROUGE Scores: {scores}")
    print("-" * 80)


In [None]:
# Save summarized threads to CSV.
# The evaluation cells further down read an 'original_text' column next to
# 'summary', so the source text is exported under that name as well.
summary_export = email_threads.rename(columns={'cleaned_body': 'original_text'})
summary_export[['thread_id', 'original_text', 'summary']].to_csv("summarized_email_threads.csv", index=False)


**Response Generation Task** :

Goal: Automatically generate responses to common email types.
• Steps:
o Select a set of common email topics (e.g., meeting requests, status updates).
o Use a pre-trained model to generate an automated response based on the email
content.
o Evaluate the responses by checking if they are relevant and appropriate for the
context.

In [None]:
import pandas as pd

# Load the preprocessed email dataset
email_data = pd.read_csv("processed_enron_emails.csv")

# The categorisation helper is declared here so that this cell runs on a fresh
# kernel; the next code cell declares the same names again.
common_topics = {
    'meeting_request': ['meeting', 'schedule', 'appointment'],
    'status_update': ['update', 'progress', 'status', 'report'],
    'follow_up': ['follow-up', 'pending', 'reminder']
}


def categorize_email(subject, body):
    """Return the first topic whose keyword occurs in `subject` or `body`, else 'other'."""
    # Missing values read back from the CSV are NaN (float); skip anything that is not text.
    texts = [part.lower() for part in (subject, body) if isinstance(part, str)]
    for topic, keywords in common_topics.items():
        if any(keyword in text for keyword in keywords for text in texts):
            return topic
    return 'other'


# Apply categorization to each email based on subject and body
email_data['category'] = email_data.apply(lambda x: categorize_email(x['subject'], x['cleaned_body']), axis=1)

# Filter emails to keep only those that match common topics
email_data = email_data[email_data['category'] != 'other']


**Select Common Email Topics**

In [None]:
# Define categories for common email types
common_topics = {
    'meeting_request': ['meeting', 'schedule', 'appointment'],
    'status_update': ['update', 'progress', 'status', 'report'],
    'follow_up': ['follow-up', 'pending', 'reminder']
}


def categorize_email(subject, body):
    """Categorize an email by keyword.

    Topics are tried in the order they appear in `common_topics`; the first one
    with a keyword contained (case-insensitively) in `subject` or `body` wins.

    Parameters
    ----------
    subject, body : str or None/NaN
        Text to search. Non-string values (e.g. NaN from an empty CSV cell)
        are ignored instead of raising on `.lower()`.

    Returns
    -------
    str
        The topic name, or 'other' when no keyword matches.
    """
    texts = [part.lower() for part in (subject, body) if isinstance(part, str)]
    for topic, keywords in common_topics.items():
        if any(keyword in text for keyword in keywords for text in texts):
            return topic
    return 'other'  # Use 'other' for emails that don't match defined categories


In [None]:
pip install transformers


**Load the Pre-trained Model and Tokenizer**

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
# NOTE: this rebinds `model` and `tokenizer`, which an earlier cell of this
# notebook set to the T5 objects; summarization cells re-run after this point
# would use GPT-2.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


**Generate Responses for Each Email**

In [None]:
import torch

def generate_response(email_content, category, model, tokenizer, max_length=100, max_prompt_tokens=512):
    """Generate a reply to `email_content` with a causal language model.

    Parameters
    ----------
    email_content : str or None/NaN
        Email text; non-string values are treated as an empty email.
    category : str
        Key selecting the instruction prefix; unknown categories use a generic one.
    model, tokenizer
        Objects exposing `generate` / `encode` + `decode` + `eos_token_id`.
    max_length : int
        Number of tokens to generate. It is passed as `max_new_tokens`, so a
        long prompt no longer uses up the whole length budget.
    max_prompt_tokens : int
        The encoded prompt is truncated to this many tokens so that prompt plus
        reply stay within the model's context window.

    Returns
    -------
    str
        Only the newly generated text; the prompt is not echoed back.
    """
    # Define prompt templates for each category
    prompts = {
        'meeting_request': "Respond to a meeting request:",
        'status_update': "Provide a status update:",
        'follow_up': "Respond to a follow-up email:",
    }

    # Empty CSV cells come back as NaN (float), which cannot be concatenated to a str.
    if not isinstance(email_content, str):
        email_content = ""

    # Construct the prompt
    prompt = prompts.get(category, "Respond to the email:") + " " + email_content

    # Encode the input, bounded in length
    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=max_prompt_tokens)

    outputs = model.generate(
        inputs,
        max_new_tokens=max_length,
        do_sample=True,  # top_k / top_p / temperature are ignored without sampling
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 defines no pad token
    )

    # The returned sequence starts with the prompt tokens; keep only what follows them.
    new_tokens = outputs[0][inputs.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
# Generate one reply per email from its cleaned body and its category.
email_data['response'] = [
    generate_response(content, category, model, tokenizer)
    for content, category in zip(email_data['cleaned_body'], email_data['category'])
]

# Display sample responses
email_data[['subject', 'cleaned_body', 'response']].head()


**Evaluate the Generated Responses**

Manual Evaluation

In [None]:
# Display sample responses for manual evaluation.
# `email_data` was filtered by category without resetting its index, so the row
# labels 0..4 need not exist; iterate over the first rows by position instead.
for _, row in email_data.head(5).iterrows():
    print(f"Subject: {row['subject']}")
    print(f"Original Email: {row['cleaned_body']}")
    print(f"Generated Response: {row['response']}")
    print("-" * 80)


**Automated Evaluation (Optional)**

In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Calculate BLEU scores for a sample if true responses are available
# Assuming `true_response` column contains actual responses
if 'true_response' in email_data.columns:
    # sentence_bleu expects token lists (a list of reference token lists and one
    # hypothesis token list); raw strings would be scored character by character.
    email_data['bleu_score'] = email_data.apply(
        lambda x: sentence_bleu([str(x['true_response']).split()], str(x['response']).split()),
        axis=1,
    )
    print(email_data[['subject', 'bleu_score']].head())


**Save Generated Responses for Further Analysis**

In [None]:
# Write the subject, cleaned body, category and generated response of every email to a CSV file.
email_data.to_csv(
    "generated_responses.csv",
    columns=['subject', 'cleaned_body', 'category', 'response'],
    index=False,
)


Summarization Task:
o Assess the quality (using metrics or manually) of the summaries. Does the
summary capture the main points? Is it concise and accurate?

In [None]:
!pip install rouge-score


**Setting Up Automatic Evaluation with ROUGE Scores**

Import and Initialize the ROUGE Scorer

In [None]:
from rouge_score import rouge_scorer
import pandas as pd

# Initialize ROUGE scorer for different n-grams
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


**Load Summarized Data and Prepare Reference Texts**

In [None]:
# Load the summarized data for evaluation
email_threads = pd.read_csv("summarized_email_threads.csv")

# Display the data structure for reference
email_threads.head()


**Calculate ROUGE Scores for Each Summary**

In [None]:
def calculate_rouge_scores(reference, summary, scorer):
    """Score `summary` against `reference` and return the F-measures.

    `scorer.score(reference, summary)` is called once; the result maps the
    keys 'rouge1', 'rouge2' and 'rougeL' to the `fmeasure` attribute of the
    corresponding score entry.
    """
    result = scorer.score(reference, summary)
    return {metric: result[metric].fmeasure for metric in ('rouge1', 'rouge2', 'rougeL')}

# Apply ROUGE scoring to each row in the dataframe
# Each row's 'original_text' is the reference and its 'summary' the candidate;
# the returned dict is expanded into the three score columns via pd.Series.
# Requires `email_threads` to contain an 'original_text' column.
email_threads[['rouge1', 'rouge2', 'rougeL']] = email_threads.apply(
    lambda row: pd.Series(calculate_rouge_scores(row['original_text'], row['summary'], scorer)), axis=1
)


**Review Average ROUGE Scores**

In [None]:
# Mean of each ROUGE column over all threads.
rouge_means = email_threads[['rouge1', 'rouge2', 'rougeL']].mean()
average_rouge1 = rouge_means['rouge1']
average_rouge2 = rouge_means['rouge2']
average_rougeL = rouge_means['rougeL']

for label, value in (
    ("ROUGE-1", average_rouge1),
    ("ROUGE-2", average_rouge2),
    ("ROUGE-L", average_rougeL),
):
    print(f"Average {label} Score: {value:.2f}")


**Manual Evaluation of Summaries**

In [None]:
# Display a sample of summaries for manual evaluation
sample_size = 5
# sample() raises when n exceeds the number of rows, so cap it; the fixed seed
# makes the reviewed sample reproducible across runs.
sample_data = email_threads.sample(n=min(sample_size, len(email_threads)), random_state=42)

for idx, row in sample_data.iterrows():
    print(f"Original Thread:\n{row['original_text'][:500]}...")  # Show first 500 characters
    print(f"Generated Summary:\n{row['summary']}")
    print(f"ROUGE-1: {row['rouge1']:.2f}, ROUGE-2: {row['rouge2']:.2f}, ROUGE-L: {row['rougeL']:.2f}")
    print("Does the summary capture main points? Is it concise and relevant?")
    print("-" * 80)


**Additional Qualitative Evaluation Criteria (Optional)**

In [None]:
# Sample rubric for manual scoring (adjust as needed)
manual_scores = {
    'relevance_score': [5, 4, 3, 5, 4],  # Replace with actual scores from manual review
    'conciseness_score': [4, 5, 3, 4, 5],
    'coherence_score': [5, 4, 4, 5, 4],
}

# Assigning a 5-element list to a column only works when the frame has exactly
# five rows. The scores belong to the reviewed sample, so attach them to those
# rows by index label; every other thread stays NaN and is skipped by mean().
reviewed_index = sample_data.index[:len(manual_scores['relevance_score'])]
for column, values in manual_scores.items():
    email_threads[column] = pd.Series(values[:len(reviewed_index)], index=reviewed_index, dtype='float64')

# Calculate average manual scores
average_relevance = email_threads['relevance_score'].mean()
average_conciseness = email_threads['conciseness_score'].mean()
average_coherence = email_threads['coherence_score'].mean()

print(f"Average Relevance Score: {average_relevance}")
print(f"Average Conciseness Score: {average_conciseness}")
print(f"Average Coherence Score: {average_coherence}")


Response Generation Task:
o Check if the responses are coherent, contextually appropriate, and relevant to
the email. The code below is written out in detail.

In [None]:
import pandas as pd

# Load the generated responses for evaluation.
# The file was written with 'cleaned_body' / 'response' columns, while the
# evaluation cells below read 'email_content' / 'generated_response'.
responses_data = pd.read_csv("generated_responses.csv").rename(
    columns={'cleaned_body': 'email_content', 'response': 'generated_response'}
)

# Display the data structure for reference
responses_data.head()


**Automated Evaluation of Response Relevance**

Import and Initialize Sentence-BERT Model

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the Sentence-BERT model
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


**Calculate Semantic Similarity**

In [None]:
def calculate_similarity(email_content, generated_response, model):
    """Return the cosine similarity between an email and its generated reply.

    Both texts are encoded with `model.encode(..., convert_to_tensor=True)`,
    the email first, and the `util.cos_sim` result is reduced to a Python
    number with `.item()`.
    """
    embeddings = [
        model.encode(text, convert_to_tensor=True)
        for text in (email_content, generated_response)
    ]
    return util.cos_sim(*embeddings).item()

# Apply the similarity function to each row in the data
# Scores every (email_content, generated_response) pair with `similarity_model`
# and stores the result in a new 'similarity' column. Requires both columns to
# be present in `responses_data`.
responses_data['similarity'] = responses_data.apply(
    lambda x: calculate_similarity(x['email_content'], x['generated_response'], similarity_model), axis=1
)

# Display sample similarity scores
responses_data[['email_content', 'generated_response', 'similarity']].head()


**Average Similarity Score**

In [None]:
average_similarity = responses_data['similarity'].mean()
print(f"Average Semantic Similarity: {average_similarity:.2f}")


**Manual Evaluation for Coherence, Contextual Appropriateness, and Relevance**

In [None]:
# Display a sample of responses for manual evaluation
sample_size = 5
# sample() raises when n exceeds the number of rows, so cap it; the fixed seed
# makes the reviewed sample reproducible across runs.
sample_data = responses_data.sample(n=min(sample_size, len(responses_data)), random_state=42)

for idx, row in sample_data.iterrows():
    print(f"Original Email Content:\n{row['email_content'][:500]}...")  # Show first 500 characters
    print(f"Generated Response:\n{row['generated_response']}")
    print(f"Semantic Similarity Score: {row['similarity']:.2f}")
    print("Score for Coherence (1-5): ")
    print("Score for Contextual Appropriateness (1-5): ")
    print("Score for Relevance (1-5): ")
    print("-" * 80)


**Enter Scores and Calculate Averages**

In [None]:
# Manually add scores (replace with actual scores from manual review)
manual_scores = {
    'coherence_score': [4, 5, 3, 4, 5],  # Example scores
    'contextual_appropriateness_score': [5, 4, 3, 4, 5],
    'relevance_score': [4, 5, 4, 5, 5],
}

# Assigning a 5-element list to a column only works when the frame has exactly
# five rows. The scores belong to the reviewed sample, so attach them to those
# rows by index label; every other response stays NaN and is skipped by mean().
reviewed_index = sample_data.index[:len(manual_scores['coherence_score'])]
for column, values in manual_scores.items():
    responses_data[column] = pd.Series(values[:len(reviewed_index)], index=reviewed_index, dtype='float64')

# Calculate average scores for each criterion
average_coherence = responses_data['coherence_score'].mean()
average_contextual_appropriateness = responses_data['contextual_appropriateness_score'].mean()
average_relevance = responses_data['relevance_score'].mean()

print(f"Average Coherence Score: {average_coherence}")
print(f"Average Contextual Appropriateness Score: {average_contextual_appropriateness}")
print(f"Average Relevance Score: {average_relevance}")


**Summary of Evaluation Results**

In [None]:
print("Evaluation Summary:")
print(f"Average Semantic Similarity (Relevance): {average_similarity:.2f}")
print(f"Average Coherence Score: {average_coherence:.2f}")
print(f"Average Contextual Appropriateness Score: {average_contextual_appropriateness:.2f}")
print(f"Average Relevance Score: {average_relevance:.2f}")


 **Flask App**

In [None]:
pip install flask


In [None]:
from flask import Flask, request, jsonify
from transformers import pipeline

# Initialize the Flask app
app = Flask(__name__)

# Load the pre-trained model for summarization or response generation
# Both pipelines are created once at module level and shared by the route
# handlers below, so no model is loaded per request.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # Replace with model as needed
generator = pipeline("text-generation", model="gpt2")  # Replace with the chosen response model

@app.route("/summarize", methods=["POST"])
def summarize():
    """POST /summarize -- summarize the text sent as JSON field "email_thread".

    Returns {"summary": ...} on success, or {"error": ...} with HTTP 400 when
    the body is not a JSON object or the field is missing, empty or not a string.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}

    # Get the email thread from the POST request
    email_thread = payload.get("email_thread")
    if not isinstance(email_thread, str) or not email_thread.strip():
        return jsonify({"error": "JSON body must contain a non-empty 'email_thread' string"}), 400

    # Generate a summary; truncation keeps over-long threads within the model's input limit.
    summary = summarizer(email_thread, max_length=100, min_length=30, do_sample=False, truncation=True)

    # Extract and return the summary text
    return jsonify({"summary": summary[0]["summary_text"]})

@app.route("/generate_response", methods=["POST"])
def generate_response():
    """POST /generate_response -- generate a reply to JSON field "email_content".

    Returns {"response": ...} on success, or {"error": ...} with HTTP 400 when
    the body is not a JSON object or the field is missing, empty or not a string.

    NOTE: this view reuses the name of the `generate_response(email_content,
    category, model, tokenizer)` helper defined earlier in the notebook and
    rebinds it once this cell has run.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}

    # Get the email content from the POST request
    email_content = payload.get("email_content")
    if not isinstance(email_content, str) or not email_content.strip():
        return jsonify({"error": "JSON body must contain a non-empty 'email_content' string"}), 400

    # Generate a response. max_new_tokens budgets only the reply (max_length
    # would also count the prompt); return_full_text=False omits the prompt
    # from the returned text.
    response = generator(
        email_content,
        max_new_tokens=50,
        num_return_sequences=1,
        return_full_text=False,
    )

    # Extract and return the generated response text
    return jsonify({"response": response[0]["generated_text"]})

if __name__ == "__main__":
    # debug=True enables Werkzeug's interactive debugger, which executes
    # arbitrary code for anyone who can reach the server, and starts the
    # auto-reloader; keep it off.
    app.run(debug=False)
