In [169]:
 !pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.32.1
    Uninstalling accelerate-0.32.1:
      Successfully uninstalled accelerate-0.32.1
Successfully installed accelerate-0.33.0


In [170]:
!pip install nltk==3.8.1 -U



In [171]:
!git clone https://github.com/ryanzhumich/AESLC

Cloning into 'AESLC'...
remote: Enumerating objects: 17469, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 17469 (delta 1), reused 0 (delta 0), pack-reused 17461[K
Receiving objects: 100% (17469/17469), 7.36 MiB | 17.01 MiB/s, done.
Resolving deltas: 100% (48/48), done.


In [172]:
import os
import re
import nltk
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

# from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

In [173]:

# Ensure the NLTK resources used by preprocess_text are available:
# 'punkt' (word_tokenize), 'stopwords' (stop-word list) and 'wordnet'
# (WordNetLemmatizer).
# NOTE(review): the recorded output reports 'wordnet' as already up-to-date
# under /usr/share/nltk_data, yet the training cell later fails with
# "Resource 'corpora/wordnet' not found" — presumably the installed copy is
# not usable in this environment; verify before relying on lemmatization.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [174]:
# Function for text preprocessing
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased and tokenised; tokens that are English stop words
    or are not purely alphanumeric are dropped, and every remaining token is
    lemmatised with WordNet before the tokens are joined with single spaces.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    # Filter and lemmatise in a single pass over the tokens
    lemmas = (
        wordnet_lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in english_stopwords and token.isalnum()
    )
    return ' '.join(lemmas)


# Custom dataset class
class EmailSubjectDataset(Dataset):
    """(email body, subject line) pairs tokenised for seq2seq fine-tuning.

    Args:
        emails: Sequence of email body strings (model inputs).
        subjects: Sequence of subject strings (targets), aligned with ``emails``.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length the encoded email is padded/truncated to.
        max_target_length: Length the encoded subject is padded/truncated to.
            Defaults to ``max_length`` when omitted, which keeps the previous
            behaviour; subjects are short, so a smaller value saves memory.

    Raises:
        ValueError: If ``emails`` and ``subjects`` differ in length.
    """

    # Ignore index of PyTorch's cross-entropy loss: label positions holding
    # this value do not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, emails, subjects, tokenizer, max_length, max_target_length=None):
        if len(emails) != len(subjects):
            raise ValueError(
                f"emails and subjects must have the same length "
                f"(got {len(emails)} and {len(subjects)})"
            )
        self.emails = emails
        self.subjects = subjects
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_length if max_target_length is None else max_target_length

    def __len__(self):
        return len(self.emails)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        email_text = self.emails[idx]
        subject_text = self.subjects[idx]

        # Tokenize inputs and outputs
        inputs = self.tokenizer(email_text, return_tensors='pt', max_length=self.max_length, truncation=True,
                                padding='max_length')
        targets = self.tokenizer(subject_text, return_tensors='pt', max_length=self.max_target_length,
                                 truncation=True, padding='max_length')

        # Replace padding positions in the target with the ignore index so the
        # model is not trained to predict pad tokens. masked_fill returns a new
        # tensor, so the tokenizer output is left untouched.
        target_is_padding = targets.attention_mask.flatten() == 0
        labels = targets.input_ids.flatten().masked_fill(target_is_padding, self.IGNORE_INDEX)

        return {
            'input_ids': inputs.input_ids.flatten(),
            'attention_mask': inputs.attention_mask.flatten(),
            'labels': labels
        }


# Directory containing email files
emails_directory = '/kaggle/working/AESLC/enron_subject_line/train'

# Initialize tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')


# Load emails and subjects from files
emails = []
subjects = []

# The subject is the text following the "@subject" marker, up to the next line
# that starts with "@<word>" or the end of the file. The lazy match plus the
# lookahead keeps any later "@..." sections out of the subject.
# NOTE(review): presumably other splits carry annotation sections after the
# subject — confirm against the dataset layout.
subject_pattern = re.compile(r'@subject\s+(.*?)(?=\n@\w|\Z)', re.IGNORECASE | re.DOTALL)

# Sorted so the example order (and therefore any seeded split) is reproducible;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(emails_directory)):

    with open(os.path.join(emails_directory, filename), 'r', encoding='utf-8', errors='ignore') as file:
        email_text = file.read()

    match = subject_pattern.search(email_text)
    if not match:
        continue

    subject = match.group(1).strip()
    # Everything before the marker is the body. Slicing at the match start
    # guarantees the subject never leaks into the model input, even when the
    # file does not end with a newline.
    email_body = email_text[:match.start()].strip()

    # An empty body or subject gives the model nothing to learn from
    if not subject or not email_body:
        continue

    emails.append(preprocess_text(email_body))
    subjects.append(subject)

# Create dataset
if not emails:
    # Fail with a clear message instead of an IndexError on emails[0]
    raise RuntimeError(f"No emails with an '@subject' section were loaded from {emails_directory}")

# Show one (input, target) pair as a sanity check
print(emails[0])
print(subjects[0])
dataset = EmailSubjectDataset(emails, subjects, tokenizer, max_length=512)

# Split dataset into training and validation sets (80/20).
# A seeded generator makes the split identical on every run.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="epoch",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Fine-tuning the model
trainer.train()


LookupError: 
**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/root/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/share/nltk_data'
**********************************************************************

In [None]:
# Helpers for generating subject lines from email text and shortening them for display
def get_first_four_words_split(text):
    """Return the first four whitespace-separated words of ``text``, joined by single spaces."""
    leading_words = text.split()[:4]
    return ' '.join(leading_words)




def generate_subject(model, tokenizer, email_text, max_length=512, max_input_length=512):
    """Generate a subject line for a raw email with beam search.

    Args:
        model: Seq2seq model exposing ``parameters()``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        email_text: Raw email body; it is passed through ``preprocess_text`` first.
        max_length: Maximum length of the generated sequence.
        max_input_length: Encoded emails longer than this are truncated, so an
            over-long email cannot exceed what the model accepts. The default
            matches the ``max_length=512`` used to build the training dataset.

    Returns:
        The decoded subject string with special tokens removed.
    """
    # Run on whatever device the model already lives on
    device = next(model.parameters()).device

    # Make sure dropout is disabled regardless of what the caller did before
    model.eval()

    email_text = preprocess_text(email_text)

    input_ids = tokenizer.encode(
        email_text,
        return_tensors='pt',
        truncation=True,
        max_length=max_input_length,
    ).to(device)

    # Generate subject line
    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage of subject line generation: each sample email is run through
# the model and the first four words of the generated subject are printed.
example_emails = [
    "The following reports have been waiting for your approval for more than 4 days.Please review.Owner: James W Reitmeyer Report Name: JReitmeyer 10/24/01 Days In Mgr.Queue: 5",
    "All,  The below Analyst & Associate recruiting dates require ENA participation at Manager level at above.In order to resource each of your departments it is important to have ENA's involvement and participation in the interviews and debrief sessions on Fantastic Friday and Super Saturday events.These de-brief sessions will allow you the opportunity to select candidates you wish to join your groups.The target  is to assign potential candidates to business units and departments from the outset.As ENA has the highest percentage of A&A rotating in its business unit, the participation of ENA at interview should reflect this.Therefore, please encourage your direct reports and managers to participate in the below events in order to secure candidates for your business area.Associate Recruiting:		Saturday November 3						Total - 70 Candidates for Interview Analyst Recruiting:		Friday, November 16						Total - 70 Candidates for Interivew Associate Recruiting:		Saturday, December 1						Total - 70 Candidates for Interview  The above spreadsheet represents ENA's particpation today which I believe highlights the need for much additional support in these efforts.Please confirm by return participation of your respective groups.Regards,",
    "Late on October 25th, we received information about a nonspecific threat to the Enron Center.We communicated with law enforcement officials who found the threat unsubstantiated and without merit.Nonetheless we take all threats seriously and have increased the security presence at the Enron Center still further.Once again, if you observe suspicious behavior, please call security at 3-6200.",
    "Thanks in advance for agreeing to speak at the Global Operations Controller  Forum.There will be approximately 30 Enron business controllers present at  the meeting.All have responsibility for mid and back office operations for  the following Enron entities:  Enron North America, Enron Europe, Enron South  America, Enron Global Markets, Enron Industrial Markets, Enron Broadband  Services and Enron Energy Services.Attendees will be here from Houston,  Calgary, Tokyo, Sydney, London and New York (metals business).Attached for your reference is the agenda.There may be some slight changes  before the forum begins, but this will give you a good idea of the topics to  be covered and the other speakers who will address the group.You are scheduled to address the group as follows:",
    "To confirm:  Mark Thomas Mike Presley  both previously on the list for Netco's Market Risk/Research Group have resigned.I'm combing the Estate for replacements.DP",
    "Michelle;  This is the presentation which was provided to the HR VP's.The HR VPs were tasked with going off to met their BU OTCs and devise plans around the three components.We are currently collating those plans into one document.We'll send that along to you when completed.Pls call if you have any questions.",
]

for example_email in example_emails:
    generated_subject = generate_subject(model, tokenizer, example_email)
    print("Generated Subject:", get_first_four_words_split(generated_subject))


In [None]:
# Persist the fine-tuned model under a local directory via the Trainer
local_model_dir = './saved_model'
trainer.save_model(output_dir=local_model_dir)

In [None]:
# !zip -r /content/finalmodel.zip /content/saved_model

import shutil
import os

zip_filename = '/kaggle/working/finalmodel_new.zip'

# make_archive appends the '.zip' suffix itself, so it needs the target path
# without its extension. os.path.splitext removes only the final suffix,
# whereas split('.')[0] would cut the path at the first dot anywhere in it.
archive_base = os.path.splitext(zip_filename)[0]
shutil.make_archive(archive_base, 'zip', local_model_dir)

# Note: the archive is written to zip_filename and contains the contents of local_model_dir

# Clean up: Remove the original saved model directory now that it is archived
shutil.rmtree(local_model_dir)

In [None]:
# from google.colab import files
# files.download('/content/finalmodel.zip')

# Show what is currently present in the working directory
working_dir_entries = os.listdir('/kaggle/working')
print(working_dir_entries)

# Report the archive path that was used
print(f"ZIP file created successfully: {zip_filename}")

In [None]:
# Example: Saving a model and making it available for download in Kaggle

import joblib
import os
import zipfile

# Serialize the in-memory `model` object with joblib.
# NOTE: joblib files are pickles; only load such a file from a trusted source.
model_pickle_path = '/kaggle/working/model.pkl'
joblib.dump(model, model_pickle_path)

# Create a ZIP file containing the saved model. `arcname` stores the member
# under its bare file name instead of the absolute path, and ZIP_DEFLATED
# compresses it (the default ZIP_STORED does not compress).
with zipfile.ZipFile('/kaggle/working/finalmodel.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(model_pickle_path, arcname=os.path.basename(model_pickle_path))

# Remove the temporary model.pkl file now that it is inside the archive
os.remove(model_pickle_path)


In [None]:
!pip list

In [None]:
!pip install safetensors transformers evaluate -U

In [None]:
!pip install evaluate

In [None]:
import joblib
import zipfile
import os
import re
import nltk
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import evaluate

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
print(nltk.data.path)

In [None]:
!pip uninstall nltk -y
!pip install nltk

In [None]:
# # Function to load safetensor weights
# def load_model_weights(model, safetensor_path):
#     state_dict = load_safetensors(safetensor_path)
#     model.load_state_dict(state_dict)
from safetensors.torch import load_file as safetensors_load_file

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
# Imported here so this cell runs on a fresh kernel; the original relied on an
# import that only appears in a later cell.
from safetensors.torch import load_file as load_safetensors

# Location of the fine-tuned checkpoint
model_path = "/kaggle/input/bart_fb_email/transformers/default/1"

# Load the tokenizer stored with the checkpoint first, so every later size
# check uses the tokenizer that will actually be used for encoding.
tokenizer = BartTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    # Only add a pad token when the tokenizer does not define one already
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# `state_dict` is kept as a variable because a later cell inspects it
state_dict = load_safetensors(f"{model_path}/model.safetensors")
model = BartForConditionalGeneration.from_pretrained(model_path, state_dict=state_dict)

# Grow the embedding matrix only if the tokenizer knows more tokens than the
# model has rows for; otherwise a token id could index past the embeddings.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))


In [None]:
from safetensors.torch import load_file as load_safetensors

In [None]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for text preprocessing
def preprocess_text(text):
    """Clean ``text`` and return its remaining tokens joined by single spaces.

    Steps, in order: lower-case, strip ASCII punctuation, remove digit runs,
    tokenise, and drop English stop words. Lemmatisation is not applied in
    this version.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    cleaned = re.sub(r'\d+', '', cleaned)

    candidate_tokens = word_tokenize(cleaned)
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for token in candidate_tokens:
        if token in english_stopwords:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Load and preprocess the test data from a .txt file
def load_and_preprocess_file(file_path):
    """Read a test file and return preprocessed emails with their reference subjects.

    The file content is split on the literal ``<end>`` separator. Within each
    entry the email is the text between ``Email : `` and `` <Sep>`` and the
    subject is the text between ``Subject : `` and `` <Sep>``. Entries missing
    either part are reported on stdout and skipped; blank fragments (such as
    the one left after the final ``<end>``) are skipped silently.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(test_emails, reference_subjects)`` tuple of equally long lists:
        email bodies after ``preprocess_text`` and the stripped subjects.
    """
    test_emails = []
    reference_subjects = []

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        # Read the entire file content
        content = file.read()

    # Split data based on the '<end>' separator
    for entry in content.split("<end>"):
        if not entry.strip():
            # Nothing but whitespace between separators — not a malformed entry
            continue

        email_match = re.search(r"Email : (.*?) <Sep>", entry, re.DOTALL)
        subject_match = re.search(r"Subject : (.*?) <Sep>", entry, re.DOTALL)

        if email_match and subject_match:
            email_body = email_match.group(1).strip()
            reference_subject = subject_match.group(1).strip()

            # Preprocess the email body; the reference subject is kept verbatim
            test_emails.append(preprocess_text(email_body))
            reference_subjects.append(reference_subject)
        else:
            print("No match found in entry:", entry)

    return test_emails, reference_subjects

# Location of the test set text file
file_path = '/kaggle/input/subject/EmailSubjectTest.txt'

# Parse the file into preprocessed emails and their reference subjects
test_emails, reference_subjects = load_and_preprocess_file(file_path)

# Summarise how many pairs were loaded
email_count = len(test_emails)
subject_count = len(reference_subjects)
print(f"Number of preprocessed test emails: {email_count}")
print(f"Number of reference subjects: {subject_count}")

# Show every (email, reference subject) pair
for email, subject in zip(test_emails, reference_subjects):
    print(f"Email: {email}", f"Reference Subject: {subject}", sep='\n')


In [None]:
# "!export ..." runs in a throw-away subshell, so the variable never reaches
# the kernel process. Setting it through os.environ changes the kernel's own
# environment. It must be set before the first CUDA call in this process to
# have any effect.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# List the parameter names the model defines that the loaded state_dict lacks
expected_keys = model.state_dict().keys()
missing_keys = list(filter(lambda name: name not in state_dict, expected_keys))
print(f"Missing keys in state_dict: {missing_keys}")

In [None]:

# Function to generate subject lines
def generate_subject(model, tokenizer, email_text, max_length=512, max_input_length=512):
    """Generate a subject line for an email with beam search.

    Unlike the earlier definition of the same name in this notebook, this
    version does not call ``preprocess_text``; ``email_text`` is encoded as given.

    Args:
        model: Seq2seq model exposing ``to()``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        email_text: Email text to encode.
        max_length: Maximum length of the generated sequence.
        max_input_length: Encoded emails longer than this are truncated, so an
            over-long email cannot exceed what the model accepts.

    Returns:
        The decoded subject string with special tokens removed.

    Raises:
        Exception: Any error from encoding or generation is printed and re-raised.
    """
    try:
        # Use the GPU when one is available and make sure the model is there too
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # Disable dropout regardless of the mode the caller left the model in
        model.eval()
        input_ids = tokenizer.encode(
            email_text,
            return_tensors='pt',
            truncation=True,
            max_length=max_input_length,
        ).to(device)

        # Debugging print statements
        print(f"Generating subject for email: {email_text[:100]}...")  # Print part of email text
        print(f"Input IDs shape: {input_ids.shape}")

        with torch.no_grad():
            output = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)

        generated_subject = tokenizer.decode(output[0], skip_special_tokens=True)
        return generated_subject
    except Exception as e:
        print(f"Error generating subject: {e}")
        raise

# Produce one subject per preprocessed test email; a failure is reported and re-raised
generated_subjects = []
try:
    generated_subjects = list(
        generate_subject(model, tokenizer, email) for email in test_emails
    )
except Exception as error:
    print(f"Error generating subjects: {error}")
    raise

# Show every generated subject on its own line
print("Generated subjects:")
for subject in generated_subjects:
    print(subject)


In [None]:
!pip install rouge_score

In [None]:
# Load the BLEU and ROUGE metrics through the `evaluate` library
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# BLEU is given a one-element list of references per prediction;
# ROUGE is given one reference string per prediction.
bleu_references = [[reference] for reference in reference_subjects]
bleu_results = bleu_metric.compute(predictions=generated_subjects, references=bleu_references)
rouge_results = rouge_metric.compute(predictions=generated_subjects, references=reference_subjects)

print("BLEU score:", bleu_results)
print("ROUGE scores:", rouge_results)

In [None]:
!pip install --upgrade torch torchvision torchaudio evaluate
