In [1]:
import pandas as pd

# Load the Enron emails dataset
enron_emails = pd.read_csv('enron_emails.csv')

# Display the first few rows to verify
enron_emails.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [None]:
enron_emails.info()
enron_emails.describe()

In [4]:
enron_emails['message'][0]

"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "

In [None]:
from talon import init
init()

from talon import signature

email_body = """
Hi Team,

Please find the attached report.

Best regards,
John Doe
Senior Analyst
john.doe@example.com
"""

extracted_signature, _ = signature.extract(email_body, 'john.doe@example.com')
print("Talon Extracted Signature:")
print(extracted_signature)

In [4]:
from transformers import pipeline

# Extract sender using a fine-tuned BERT model
def extract_sender_bert(email_body):
    classifier = pipeline("text-classification", model="bert-base-uncased")
    result = classifier(email_body)
    return result[0]['label'], None  # Returning a second value

In [14]:
email_body = """
Hi Team,

Please find the attached report.

Best regards,
John Doe
Senior Analyst
john.doe@example.com
"""

extracted_signature, _ = extract_sender_bert(email_body)
print("Bert Extracted Signature:")
print(extracted_signature)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Bert Extracted Signature:
LABEL_0


In [11]:
from transformers import pipeline

#ner_pipeline = pipeline("ner", model="nickmuchi/bert-base-uncased-email-ner", device=0)  # Use GPU
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", device=0)
#ner_pipeline = pipeline("ner", model="Jean-Baptiste/camembert-ner", device=0)

def extract_sender(email_body):
    result = ner_pipeline(email_body)
    sender_name = " ".join([ent['word'] for ent in result if ent['entity'] == "B-PER" or ent['entity'] == "I-PER"])
    return sender_name if sender_name else "Unknown Sender"

email_body = """
Hi Team,

Please find the attached report.

Best regards,
John Doe
Senior Analyst
john.doe@example.com
"""

extracted_sender = extract_sender(email_body)
print("Extracted Sender:", extracted_sender)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracted Sender: John Do


In [13]:
import re
from transformers import pipeline

# Load NER model
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=0)

def extract_signature(email_body):
    # Step 1: Use NER to extract entities (name, job title)
    ner_results = ner_pipeline(email_body)
    
    # Step 2: Extract PERSON and ORG entities
    person_name = []
    job_title = []
    
    for entity in ner_results:
        if entity["entity_group"] == "PER":
            person_name.append(entity["word"])
        elif entity["entity_group"] in ["ORG", "MISC"]:
            job_title.append(entity["word"])
    
    full_name = " ".join(person_name) if person_name else None
    job_title_text = " ".join(job_title) if job_title else None
    
    # Step 3: Extract email using regex
    email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", email_body)
    email_address = email_match.group(0) if email_match else None

    # Step 4: Return dictionary output
    return {
        "name": full_name,
        "title": job_title_text,
        "email": email_address
    }

# Example email
email_body = """
Hi Team,

Please find the attached report.

Best regards,
John Doe
Senior Analyst
john.doe@example.com
"""

extracted_signature = extract_signature(email_body)
print("Extracted Signature Dictionary:", extracted_signature)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracted Signature Dictionary: {'name': 'John Do', 'title': None, 'email': 'john.doe@example.com'}


In [21]:
df = enron_emails.copy()

# Apply extraction to the first 10 rows
df_subset = df.iloc[:10].copy()  # Select first 10 rows
df_subset["signature"] = df_subset["message"].apply(extract_signature)

# Display the output
#print(df_subset[["file", "signature"]])
#df_subset["signature"]
print(df_subset["signature"])

0    {'name': 'Phillip K Allen Tim Belden Tim Bel P...
1    {'name': 'Phillip K Allen John J Lavorato John...
2    {'name': 'Phillip K Allen Leah Van Arsdal Phil...
3    {'name': 'Phillip K Allen Randall L Gay Philli...
4    {'name': 'Phillip K Allen Greg Piper Phillip A...
5    {'name': 'Phillip K Allen Greg Piper Phillip A...
6    {'name': 'Phillip K Allen John S Phillip Allen...
7    {'name': 'Phillip K Allen Joyce Teixeira Phill...
8    {'name': 'Phillip K Allen Mark Scott Phillip A...
9    {'name': 'Phillip K Allen Allen Phillip Allen ...
Name: signature, dtype: object


In [22]:
email_values = [signature.get("email") for signature in df_subset["signature"]]
print(email_values)

['phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com', 'phillip.allen@enron.com']
