In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import re
import os
from tqdm import tqdm
import pickle


# Load the CSV
df = pd.read_csv('datasets/genz_emails_final.csv')
# drop the id column and save this as genz_data_final.csv
df = df.drop(columns=['id'])
# Save the modified DataFrame to a new CSV file
df.to_csv('datasets/genz_data_final.csv', index=False)
print(df.head())
print(df.columns)
print(df.info())


2025-05-07 23:19:51.596891: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746640191.610485   17791 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746640191.614468   17791 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-07 23:19:51.631839: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


                                            original  \
0  Hello ,\n\nI need a computer and flat screen m...   
1  Please disregard to prior version and use this...   
2  \n\t [IMAGE] \t\n\tThanksgiving is just around...   
3  The PRELIMINARY Violation Memos for 11/16/01 h...   
4  Start Date: 3/30/01; HourAhead hour: 24;  No a...   

                                           generated   Hierarchy_Label  
0  Hello,\n\nI'm low-key in need of a computer an...     Sender higher  
1  Lowkey, forget the old one and vibe with this ...     Sender higher  
2  Thanksgiving is lowkey right around the corner...  Recipient higher  
3  The lowkey preliminary violation memos for 11/...  Recipient higher  
4  Start Date: 3/30/01; HourAhead hour: 24; No ex...     Similar level  
Index(['original', 'generated', 'Hierarchy_Label'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2503 entries, 0 to 2502
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  -

In [2]:

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check for CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")



Using device: cuda


In [3]:
# Load and preprocess the data
df = pd.read_csv('datasets/genz_data_final.csv')
# add all the rows present in datasets/genz_data.csv to the above df
df_additional = pd.read_csv('datasets/genz_data.csv')
df = pd.concat([df, df_additional], ignore_index=True)
# Basic text preprocessing
def preprocess_text(text):
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
        return text.strip()
    return ""



In [4]:
# Apply preprocessing
df['original_processed'] = df['original'].apply(preprocess_text)
df['generated_processed'] = df['generated'].apply(preprocess_text)

# Display sample data
print(f"Total examples: {len(df)}")
print("\nOriginal vs Gen Z Style (first 3 examples):")
for i in range(3):
    print(f"\nOriginal: {df['original_processed'].iloc[i][:100]}...")
    print(f"Gen Z: {df['generated_processed'].iloc[i][:100]}...")

# Split the data
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
print(f"\nTraining examples: {len(train_df)}")
print(f"Validation examples: {len(val_df)}")

Total examples: 4902

Original vs Gen Z Style (first 3 examples):

Original: hello , i need a computer and flat screen moved. the location is eb3240e. only- one computer and a f...
Gen Z: hello, i'm low-key in need of a computer and flat screen moved, tbh. the spot is eb3240e, fyi. just ...

Original: please disregard to prior version and use this one....
Gen Z: lowkey, forget the old one and vibe with this new version, aight?...

Original: image thanksgiving is just around the corner! thanksgiving special order now and save 5 off or 10 of...
Gen Z: thanksgiving is lowkey right around the corner! thanksgiving special order now and cop a 5 discount ...

Training examples: 4411
Validation examples: 491


In [5]:
class GenZStyleDataset(Dataset):
    def __init__(self, data, tokenizer, max_source_length=512, max_target_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        source_text = self.data['original_processed'].iloc[index]
        target_text = self.data['generated_processed'].iloc[index]
        
        # Prepare the inputs for the model
        source_encoding = self.tokenizer(
            "translate to GenZ style: " + source_text,
            max_length=self.max_source_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        input_ids = source_encoding["input_ids"].squeeze()
        attention_mask = source_encoding["attention_mask"].squeeze()
        labels = target_encoding["input_ids"].squeeze()
        # Replace padding token id with -100 so it's ignored in loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [6]:
# Load T5 model and tokenizer
model_name = "t5-small"  # You can use "t5-base" for better performance
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move model to device
model = model.to(device)

# Create datasets and dataloaders
train_dataset = GenZStyleDataset(train_df, tokenizer)
val_dataset = GenZStyleDataset(val_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-4)
total_steps = len(train_dataloader) * 5  # 5 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Training function
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        # Update progress bar
        progress_bar.set_postfix({"loss": loss.item()})
    
    return total_loss / len(dataloader)

# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

# Function to generate Gen Z style text
def generate_genz_text(text, model, tokenizer, device, max_length=512):
    model.eval()
    
    # Prepare input
    input_text = text
    input_ids = tokenizer(
        input_text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).input_ids.to(device)
    
    # Generate output
    outputs = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    
    # Decode output
    genz_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return genz_text

In [8]:


# # Training loop
# num_epochs = 5
# best_val_loss = float('inf')

# print("\nStarting training...")
# for epoch in range(num_epochs):
#     print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
#     # Train
#     train_loss = train(model, train_dataloader, optimizer, scheduler, device)
#     print(f"Training loss: {train_loss:.4f}")
    
#     # Evaluate
#     val_loss = evaluate(model, val_dataloader, device)
#     print(f"Validation loss: {val_loss:.4f}")
    
#     # Save the best model
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         # Create directory if it doesn't exist
#         os.makedirs('models', exist_ok=True)
#         torch.save(model.state_dict(), 'models/genz_t5_model_new.pt')
#         print("Best model saved!")

# # Load the best model
# model.load_state_dict(torch.load('models/genz_t5_model_new.pt'))

In [9]:
# # Test the model on some examples
# test_examples = [
#     "avoid majority despite stock mission idea action. rise adult dream way. job organization forget world guess off. property thank need manage later reason. pressure near lose organization. close admit popular option. understand reveal follow ask body. range final seven fall",
#     "Hello, I wanted to inform you about our upcoming meeting next Tuesday.",
#     "The project deadline has been extended until next Friday.",
#     "Thank you for your prompt response to my email.",
#     "religious region never happy main. piece us step wonder teach. management seek military alone environment budget. bit his but phone. whether of season road herself system. court better national tonight. state indicate house too test. ahead capital change."
# ]

# print("\nTesting the model with examples:")
# for text in test_examples:
#     genz_text = generate_genz_text(text, model, tokenizer, device)
#     print(f"\nOriginal: {text}")
#     print(f"Gen Z: {genz_text}")

# # Save the tokenizer for later use
# tokenizer.save_pretrained('models/genz_t5_tokenizer_new')

# print("\nTraining complete! Model and tokenizer saved.")

In [10]:
# # Create a simple function to use the model
# def translate_to_genz(input_text):
#     """
#     Function to translate normal text to Gen Z style using the trained model.
#     """
#     return generate_genz_text(input_text, model, tokenizer, device)

# # Example of how to use the function
# print("\nExample usage of the translation function:")
# example = "This is a formal business communication regarding your recent purchase."
# genz_result = translate_to_genz(example)
# print(f"Original: {example}")
# print(f"Gen Z: {genz_result}")



In [11]:
# Save both model and tokenizer in a single pickle file for easier loading
model_package = {
    'model_state_dict': model.state_dict(),
    'tokenizer_name': model_name
}

with open('models/genz_t5_complete.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print("\nSaved complete model package for easy loading!")

# Usage example for loading the saved model
def load_genz_model():
    """
    Load the trained Gen Z translator model and tokenizer.
    """
    with open('models/genz_t5_complete.pkl', 'rb') as f:
        model_package = pickle.load(f)
    
    # Load tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained(model_package['tokenizer_name'])
    model = T5ForConditionalGeneration.from_pretrained(model_package['tokenizer_name'])
    model.load_state_dict(model_package['model_state_dict'])
    model.to(device)
    model.eval()
    
    return model, tokenizer

# Example to show how to load and use the model
print("\nExample of loading and using the saved model:")
print("# Code to load and use the model:")
print("model, tokenizer = load_genz_model()")
print("input_text = 'Your formal text here'")
print("genz_text = generate_genz_text(input_text, model, tokenizer, device)")
print("print(genz_text)")


# load the model and tokenizer
model, tokenizer = load_genz_model()
# Example usage




Saved complete model package for easy loading!

Example of loading and using the saved model:
# Code to load and use the model:
model, tokenizer = load_genz_model()
input_text = 'Your formal text here'
genz_text = generate_genz_text(input_text, model, tokenizer, device)
print(genz_text)


In [17]:
input_text = "Dad, I was talking with Kathleen this weekend and she had some ideas and suggestions about Enron. I asked her to put them in writing so that I could share them with you."
genz_text = generate_genz_text(input_text, model, tokenizer, device)
print(genz_text)

Dad, I was talking with Kathleen this weekend and she had some ideas and suggestions about Enron.


In [13]:
from tqdm import tqdm
import pandas as pd

def replace_generated_column(input_csv, output_csv):
    """
    Replace the 'generated' column in the input CSV with Gen Z style translations,
    displaying a progress bar using tqdm.
    """
    df = pd.read_csv(input_csv)

    # Load the model and tokenizer
    model, tokenizer = load_genz_model()

    # Set up tqdm for pandas
    tqdm.pandas(desc="Translating rows")

    # Translate each row with progress bar
    df['generated'] = df['original'].progress_apply(lambda x: generate_genz_text(x, model, tokenizer, device))

    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Saved modified DataFrame to {output_csv}")

# Replace the generated column in the original CSV
replace_generated_column('datasets/genz_emails_final.csv', 'datasets/genz_emails_final_translated.csv')


Translating rows:   0%|          | 10/2503 [00:04<16:51,  2.46it/s]


KeyboardInterrupt: 

In [None]:
trans_df = pd.read_csv('datasets/genz_emails_final_translated.csv')


trans_df.head(20)

Unnamed: 0,original,generated,id,Hierarchy_Label
0,"Hello ,\n\nI need a computer and flat screen m...",the location is EB3240E. ONLY- one computer an...,6868,Sender higher
1,Please disregard to prior version and use this...,"slap the previous version and use this one, no...",24016,Sender higher
2,\n\t [IMAGE] \t\n\tThanksgiving is just around...,[IMAGE] Thanksgiving is just around the corner...,9668,Recipient higher
3,The PRELIMINARY Violation Memos for 11/16/01 h...,the PRELIMINARY Violation Memos for 11/16/01 h...,13640,Recipient higher
4,Start Date: 3/30/01; HourAhead hour: 24; No a...,no variances detected. LOG MESSAGES: PARSING F...,14018,Similar level
5,west- san juan\nplease change deal V96295 from...,change deal V96295 from gas daily to just a ny...,7488,Sender higher
6,I will be attending the Columbia Business Scho...,thanks for the invite. Ben Rogers,5804,Sender higher
7,"\n\tLinda, I went ahead and assigned Beverly M...","Linda, I went ahead and assigned Beverly Mille...",12909,Sender higher
8,A work order (J00136) has been set up on Gulf ...,a work order (J00136) has been set up on Gulf ...,3386,Recipient higher
9,"Teams, Danny Collier, Region IX, EPA in Sanfra...","teams, Danny Collier, region IX, EPA in Sanfra...",9567,Sender higher


In [None]:


# print the first 50 rows of original and generated columns whose length is less than 50


# Filter rows with 'original' length < 50
filtered_df = trans_df[trans_df['original'].str.len() > 50]
filtered_df = filtered_df[filtered_df['original'].str.len() < 150]

# Sample up to 50 such rows
sampled_df = filtered_df.sample(n=min(50, len(filtered_df)), random_state=42)

# Print each sample neatly
for i, row in sampled_df.iterrows():
    print(f"ID  : {row['id']}")
    print(f"Original  \n {row['original']}\n\n")
    print(f"Generated \n {row['generated']}\n\n")
    print('-' * 40)



ID  : 19494
Original  
 are you still partying tonight?  i think i am going to stay in, but i am 
definately up for going out tomorrow night.  


Generated 
 are you still ghosting tonight? i'm still sna party tonight, bruh? think u gonna stay in, but im definately up for going out tomorrow night.


----------------------------------------
ID  : 4104
Original  
 Hi, Vince,

Please see attached the updated Total Return Swap deals. 

All the best!

Li



Generated 
 yo, Vince, check attached the updated Total Return Swap deals. all the best! Li


----------------------------------------
ID  : 18182
Original  
 Please change the book coding on this log in to FT-Peoples.  Thanks.
PL


Generated 
 please change the book coding on this log in to FT-Peoples. thanks. PL


----------------------------------------
ID  : 5671
Original  
 Per your request . . .

 - #1127382 v10 - NEW AGENCY AGREEMENT.doc


Generated 
 . -1127382 v10 - NEW AGENCY AGREEMENT.doc


------------------------------------