In [1]:
pip install pandas transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [17]:
import pandas as pd

# Load all Excel files with the correct path
emails_work_permit = pd.read_excel("/content/sample_data/emailsWorkPermit.xlsx")
emails_faq = pd.read_excel("/content/sample_data/emailsFAQ.xlsx")
emails_academic = pd.read_excel("/content/sample_data/emailsAcademic.xlsx")
faqs_scraped = pd.read_excel("/content/sample_data/FAQs_scraped.xlsx")

# Print the first few rows to verify data is loaded
print("emailsWorkPermit:\n", emails_work_permit.head())
print("emailsFAQ:\n", emails_faq.head())
print("emailsAcademic:\n", emails_academic.head())
print("FAQs_scraped:\n", faqs_scraped.head())


emailsWorkPermit:
                                              Subject  \
0  Inquiry about the work permit application proc...   
1  Request for supporting documents required for ...   
2  Clarification on work hour restrictions for in...   
3  Assistance needed for renewing an expiring wor...   
4  Question regarding eligibility for a post-grad...   

                                             Content  \
0  I am writing to ask about the process of apply...   
1  I would like to confirm which supporting docum...   
2  I need clarification on the work hour restrict...   
3  My work permit is about to expire, and I need ...   
4  I am planning to apply for a post-graduation w...   

                                              Answer     Category  
0  To apply for a work permit as an international...  work permit  
1  The supporting documents required for a work p...  work permit  
2  International students holding a work permit a...  work permit  
3  To renew an expiring work permit

In [18]:
df = pd.concat([emails_work_permit, emails_faq, emails_academic, faqs_scraped], ignore_index=True)

# Merge Subject and Content into one input field
df["input_text"] = df["Subject"] + " " + df["Content"]

# Keep only the required columns: input_text (X) and Answer (y)
df = df[["input_text", "Answer"]]

# Print the first few rows
print(df.head())


                                          input_text  \
0  Inquiry about the work permit application proc...   
1  Request for supporting documents required for ...   
2  Clarification on work hour restrictions for in...   
3  Assistance needed for renewing an expiring wor...   
4  Question regarding eligibility for a post-grad...   

                                              Answer  
0  To apply for a work permit as an international...  
1  The supporting documents required for a work p...  
2  International students holding a work permit a...  
3  To renew an expiring work permit, you will nee...  
4  To be eligible for a post-graduation work perm...  


In [19]:
# Drop missing values if any
df = df.dropna()

# Check again
print("Missing values:", df.isnull().sum())


Missing values: input_text    0
Answer        0
dtype: int64


In [20]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["input_text"].tolist(),  # Features (input emails)
    df["Answer"].tolist(),      # Labels (email responses)
    test_size=0.2,              # 20% data for validation
    random_state=42             # Ensures reproducibility
)

print("Training Samples:", len(train_texts))
print("Validation Samples:", len(val_texts))


Training Samples: 154
Validation Samples: 39


In [21]:
from transformers import RobertaTokenizer

# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize text with truncation and padding
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [22]:
import torch

class EmailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(idx)  # Assigning index as label (temporary)
        return item

# Create dataset objects
train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")


Train Dataset Size: 154
Validation Dataset Size: 39


In [23]:
from transformers import RobertaForSequenceClassification

# Load RoBERTa model (for text generation, we need seq2seq, but for now, classification)
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(set(train_labels))  # Number of unique labels
)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,  # You can adjust this
    save_strategy="epoch",
    logging_dir="./logs",
)




In [25]:
from transformers import Trainer

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [None]:
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmaytezhou99[0m ([33mmaytezhou99-yuan-ze-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,5.059591
2,No log,5.048608
3,No log,5.058627
4,No log,5.092756
5,No log,5.092914
6,No log,5.178911
7,No log,5.238441
8,No log,5.326563
9,No log,5.387219
10,No log,5.369674
