In [1]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    pipeline,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
import evaluate
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load email-specific pretrained model
model_name = "IrisWiris/email-summarizer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("✅ Model and tokenizer loaded successfully!")


✅ Model and tokenizer loaded successfully!


In [4]:
from transformers import pipeline

# Create the summarizer pipeline with your base model
summarizer = pipeline("summarization", model="IrisWiris/email-summarizer")  # or whichever model you used for training

# Sample long email
sample_email = """
Subject: Project Update and Next Steps

Hi Team,

I hope this email finds you well. I wanted to provide a comprehensive update on the Q1 marketing campaign project that we've been working on for the past two months.

First, I'm pleased to announce that we've successfully completed the initial research phase. Our team conducted extensive market analysis, competitor research, and customer surveys. The data shows promising opportunities in the millennial demographic, particularly in urban areas.

Second, regarding the budget allocation, we need to discuss some adjustments. The initial budget of $50,000 may need to be increased by 15% due to rising advertising costs on social media platforms. I've prepared a detailed breakdown that I'll share in tomorrow's meeting.

Third, the creative team has developed three campaign concepts. Each concept has been tested with focus groups, and we have clear feedback on which direction resonates most with our target audience. I'll be presenting these findings along with my recommendations next week.

Finally, I want to address the timeline concerns raised in last week's meeting. While we're slightly behind schedule due to the extended research phase, I'm confident we can still launch by the end of Q2 if we expedite the design phase and bring in additional resources.

Please review the attached documents before our meeting on Friday at 2 PM. If you have any questions or concerns, don't hesitate to reach out.

Best regards,
Sarah
"""

# Generate summary with pretrained model
print("="*50)
print("BEFORE FINE-TUNING:")
print("="*50)
summary_before = summarizer(sample_email, max_length=100, min_length=30, do_sample=False)
print(f"\nOriginal Length: {len(sample_email.split())} words")
print(f"Summary: {summary_before[0]['summary_text']}")
print(f"Summary Length: {len(summary_before[0]['summary_text'].split())} words")


Device set to use cpu


BEFORE FINE-TUNING:


Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Original Length: 234 words
Summary: Sarah is pleased to provide a comprehensive update on the Q1 marketing campaign project for the past two months. She has conducted extensive market analysis, competitor research, and customer surveys, and has prepared a detailed breakdown to share in tomorrow's meeting. The initial budget of $50,000 may need to be increased by 15% due to rising advertising costs on social media platforms. Sarah will present findings along with recommendations next week.
Summary Length: 71 words


Loading the Dataset

In [16]:
# Load your email data
# Format: CSV with columns 'email' and 'summary'
try:
    df = pd.read_csv(r"C:\Users\nandi\OneDrive\Desktop\traffic\email summ\data\emails.csv")
except FileNotFoundError:
    # Create sample dataset if CSV not found
    sample_data = {
        'email': [
             "Subject: Meeting Reminder\n\nHi team, just a quick reminder that we have our weekly sync meeting tomorrow at 10 AM. Please come prepared with your updates and any blockers you're facing. Looking forward to seeing everyone there.",
            "Subject: Vacation Request\n\nDear Manager, I would like to request vacation leave from June 1st to June 15th for a family trip. I have completed all my pending tasks and briefed John to cover for me during my absence. Please let me know if this works.",
            "Subject: Bug Report\n\nHello Support Team, I'm experiencing a critical issue with the login module. Users are unable to authenticate using their credentials since this morning. Error code 500 is being displayed. This is affecting approximately 200 users. Please prioritize this issue."
        ],
        'summary': [
            "Weekly sync meeting reminder for tomorrow at 10 AM.",
            "Vacation leave request from June 1-15 with coverage arranged.",
            "Critical login authentication bug affecting 200 users, needs immediate attention."
        ]
    }
    df = pd.DataFrame(sample_data)

print(f"Dataset size: {len(df)} emails")
print("\nSample data:")
print(df.head())


Dataset size: 8 emails

Sample data:
                                               email  \
0  Subject: Meeting Reminder\n\nHi team, just a q...   
1  Subject: Vacation Request\n\nDear Manager, I w...   
2  Subject: Bug Report\n\nHello Support Team, I'm...   
3  Subject: Project Update\n\nHi Team, I wanted t...   
4  Subject: Server Maintenance Notice\n\nDear All...   

                                             summary  
0  Weekly sync meeting reminder for tomorrow at 1...  
1  Vacation leave request from June 1-15 with cov...  
2  Critical login authentication bug affecting 20...  
3  Q1 marketing campaign update: research complet...  
4  Server maintenance scheduled next weekend, mul...  


Data Conversion and Splitting

In [None]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into train/validation
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

# Preprocessing function
def preprocess_function(examples):
    # Tokenize emails (inputs)
    model_inputs = tokenizer(
        examples['email'], 
        max_length=512, 
        truncation=True,
        padding='max_length'
    )
    
    # Tokenize summaries (labels)
    labels = tokenizer(
        examples['summary'], 
        max_length=128, 
        truncation=True,
        padding='max_length'
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

print("✅ Data preprocessing complete!")


Map: 100%|██████████| 2/2 [00:00<00:00, 10.25 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 28.19 examples/s]

✅ Data preprocessing complete!





In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./email_model_finetuned",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("✅ Trainer initialized. Ready for fine-tuning!")


  trainer = Trainer(


✅ Trainer initialized. Ready for fine-tuning!


In [11]:
# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save the fine-tuned model
trainer.save_model("./email_model_finetuned")
tokenizer.save_pretrained("./email_model_finetuned")
print("\nFine-tuned model saved!")


Starting fine-tuning...


Epoch,Training Loss,Validation Loss
1,No log,0.535211
2,No log,0.530083
3,No log,0.528388


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].



Fine-tuned model saved!


In [1]:
import shutil
import os

# Source (where it is now)
source = "./email_model_finetuned"

# Destination (parent folder)
destination = "../email_model_finetuned"

# Copy the model folder
if os.path.exists(source):
    if os.path.exists(destination):
        print("Destination already exists, removing old version...")
        shutil.rmtree(destination)
    
    shutil.copytree(source, destination)
    print(f"✅ Model copied to: {os.path.abspath(destination)}")
else:
    print("❌ Source not found")


✅ Model copied to: c:\Users\nandi\OneDrive\Desktop\traffic\email summ\email_model_finetuned


In [12]:
# Load fine-tuned model
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("./email_model_finetuned")
finetuned_tokenizer = AutoTokenizer.from_pretrained("./email_model_finetuned")

# Create new pipeline
finetuned_summarizer = pipeline(
    "summarization", 
    model=finetuned_model, 
    tokenizer=finetuned_tokenizer
)

# Test on same sample email
print("="*50)
print("AFTER FINE-TUNING:")
print("="*50)
summary_after = finetuned_summarizer(sample_email, max_length=100, min_length=30, do_sample=False)
print(f"\nSummary: {summary_after[0]['summary_text']}")
print(f"Summary Length: {len(summary_after[0]['summary_text'].split())} words")

print("\n" + "="*50)
print("COMPARISON:")
print("="*50)
print(f"Before: {summary_before[0]['summary_text']}")
print(f"\nAfter:  {summary_after[0]['summary_text']}")


Device set to use cpu


AFTER FINE-TUNING:


Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Summary: Sarah is pleased to provide a comprehensive update on the Q1 marketing campaign project for the past two months. She has conducted extensive market analysis, competitor research, and customer surveys. She is prepared a detailed breakdown and will share in tomorrow's meeting. Sarah is confident the project will launch by the end of Q2 if they expedite the design phase.
Summary Length: 60 words

COMPARISON:
Before: Sarah is pleased to provide a comprehensive update on the Q1 marketing campaign project for the past two months. She has conducted extensive market analysis, competitor research, and customer surveys, and has prepared a detailed breakdown to share in tomorrow's meeting. The initial budget of $50,000 may need to be increased by 15% due to rising advertising costs on social media platforms. Sarah will present findings along with recommendations next week.

After:  Sarah is pleased to provide a comprehensive update on the Q1 marketing campaign project for the past two m

In [13]:
# Load ROUGE metric
rouge = evaluate.load('rouge')

def evaluate_model(model, tokenizer, test_samples):
    predictions = []
    references = []
    
    for sample in test_samples:
        email_text = sample['email']
        true_summary = sample['summary']
        
        # Generate prediction
        inputs = tokenizer(email_text, return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(**inputs, max_length=128, min_length=30, do_sample=False)
        pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        predictions.append(pred_summary)
        references.append(true_summary)
    
    # Calculate ROUGE scores
    results = rouge.compute(predictions=predictions, references=references)
    return results

# Evaluate on test set
test_samples = eval_dataset.to_dict()
test_samples = [{'email': test_samples['email'][i], 'summary': test_samples['summary'][i]} 
                for i in range(len(test_samples['email']))]

print("Evaluating fine-tuned model...")
scores = evaluate_model(finetuned_model, finetuned_tokenizer, test_samples[:5])  # Test on 5 samples

print("\nROUGE Scores:")
for key, value in scores.items():
    print(f"{key}: {value:.4f}")


Evaluating fine-tuned model...

ROUGE Scores:
rouge1: 0.2105
rouge2: 0.0556
rougeL: 0.1579
rougeLsum: 0.1579
