In [1]:
import pandas as pd

# Load the Enron emails dataset
enron_emails = pd.read_csv('enron_emails.csv')

# Display the first few rows to verify
enron_emails.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [2]:
enron_emails.info()
enron_emails.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


Unnamed: 0,file,message
count,517401,517401
unique,517401,517401
top,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
freq,1,1


In [3]:
import re
from transformers import pipeline

# Load NER model
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=0)

def extract_signature(email_body):
    # Step 1: Use NER to extract entities (name, job title)
    ner_results = ner_pipeline(email_body)
    
    # Step 2: Extract PERSON and ORG entities
    person_name = []
    job_title = []
    
    for entity in ner_results:
        if entity["entity_group"] == "PER":
            person_name.append(entity["word"])
        elif entity["entity_group"] in ["ORG", "MISC"]:
            job_title.append(entity["word"])
    
    full_name = " ".join(person_name) if person_name else "None"
    job_title_text = " ".join(job_title) if job_title else "None"
    
    # Step 3: Extract email using regex
    email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", email_body)
    email_address = email_match.group(0) if email_match else "None"

    # Step 4: Return dictionary output
    # signature_text' and 'sender' columns
    return {
        "signature_text": full_name + "|" + job_title_text + "|" +email_address,
        "sender": email_address,
        "name": full_name,
        "title": job_title_text,
        "email": email_address
    }





Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
df = enron_emails.copy()
#df_subset = df

# Apply extraction to the first 10 rows
df_subset = df.iloc[:1000].copy()  # Select first 10 rows
df_subset["signature"] = df_subset["message"].apply(extract_signature)

# Display the output of emails only
email_values = [signature.get("email") for signature in df_subset["signature"]]
# Create a DataFrame with signature_text and sender columns
signature_df = pd.DataFrame(df_subset["signature"].tolist(), columns=["signature_text", "sender"])

# Save the DataFrame to a CSV file
signature_df.to_csv("signature.csv", index=False)
#print(email_values)
unique_emails = set(email_values)
print(unique_emails)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


{'alyse.herasimchuk@enron.com', 'jsmith@austintx.com', '1.11913372.-2@multexinvestornetwork.com', 'messenger@ecm.bloomberg.com', 'bounce-news-932653@lists.autoweb.com', 'perfmgmt@enron.com', 'ina.rangel@enron.com', 'mark.whitt@enron.com', 'yahoo-delivers@yahoo-inc.com', 'subscriptions@intelligencepress.com', 'stephanie.miller@enron.com', 'christi.nicolay@enron.com', 'kim.ward@enron.com', 'ei_editor@ftenergy.com', 'billc@greenbuilder.com', 'rebecca.cantrell@enron.com', 'rob_tom@freenet.carleton.ca', 'yild@zdemail.zdlists.com', 'richard.shapiro@enron.com', 'philip.polsky@enron.com', 'webmaster@earnings.com', 'public.relations@enron.com', 'jfreeman@ssm.net', 'lisa.jacobson@enron.com', 'gthorse@keyad.com', 'tiffany.miller@enron.com', 'bobregon@bga.com', 'grensheltr@aol.com', 'tim.heizenrader@enron.com', 'owner-strawbale@crest.org', 'sarah.novosel@enron.com', 'announce@inbox.nytimes.com', 'frank.hayden@enron.com', 'phillip.allen@enron.com', 'calxa@aol.com', 'matt@fastpacket.net', 'aod@newsd

In [41]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load your data
df = pd.read_csv("signature_data.csv")  # Should contain 'signature_text' and 'sender' columns

# Encode sender labels numerically
df['label'] = df['sender'].astype('category').cat.codes

# Split dataset
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["signature_text"], truncation=True, padding=True)

print(df.head)
#t1 = train_dataset.map(tokenize_function, batched=True)
#t1 =tokenize_function(df[0]["signature_text"])
#print(t1)

In [53]:
from transformers import pipeline

# Pre-trained model pipeline (for demo purpose, general model)
classifier = pipeline("text-classification", model="dslim/bert-base-NER")

email_signature = "John Doe, Senior Analyst, john.doe@example.com"

result = classifier(email_signature)
print("BERT Predicted Sender Class:")
print(result)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


BERT Predicted Sender Class:
[{'label': 'O', 'score': 0.2072329819202423}]


In [43]:
def tokenize_function(examples):
    return tokenizer(examples["signature_text"], truncation=True, padding=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Load BERT model
# "dslim/bert-base-NER"
# "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=df['label'].nunique())

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert_sender_detection_new",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train
trainer.train()

# Evaluate
trainer.evaluate()

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3351209759712219, 'eval_runtime': 0.7064, 'eval_samples_per_second': 283.122, 'eval_steps_per_second': 18.403, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.19635699689388275, 'eval_runtime': 0.6997, 'eval_samples_per_second': 285.837, 'eval_steps_per_second': 18.579, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.19058085978031158, 'eval_runtime': 0.6735, 'eval_samples_per_second': 296.954, 'eval_steps_per_second': 19.302, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.1839957982301712, 'eval_runtime': 0.6723, 'eval_samples_per_second': 297.494, 'eval_steps_per_second': 19.337, 'epoch': 4.0}
{'train_runtime': 42.1627, 'train_samples_per_second': 75.896, 'train_steps_per_second': 4.744, 'train_loss': 0.5025893402099609, 'epoch': 4.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.1839957982301712,
 'eval_runtime': 0.673,
 'eval_samples_per_second': 297.193,
 'eval_steps_per_second': 19.318,
 'epoch': 4.0}

In [45]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load trained model & tokenizer
model_path = "./bert_sender_detection_new"  # Folder where model was saved
#model_path = r"D:\Dev\GitHub\AI-ML-Course\Projects\External\Signature\bert_sender_detection"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Label mapping (you can save/load it too)
label_to_sender = {0: "John Doe", 1: "Jane Smith"}  # Example mapping

def predict_sender(signature_text):
    inputs = tokenizer(signature_text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted_label = torch.argmax(probs, dim=1).item()
    confidence = probs[0][predicted_label].item()
    return label_to_sender[predicted_label], confidence

# Example usage
signature = "John Doe, Senior Analyst, john@example.com"
sender, confidence = predict_sender(signature)
print(f"Predicted Sender: {sender}, Confidence: {confidence:.2f}")


OSError: Can't load tokenizer for './bert_sender_detection_new'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './bert_sender_detection_new' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
import sklearn, xgboost
print(f"scikit-learn version: {sklearn.__version__}") 
print(f"XGBoost version: {xgboost.__version__}") 
# for xgboost error with sklearn use these versions
scikit-learn version: 1.5.2
XGBoost version: 2.1.0