In [29]:
# Install necessary libraries
!pip install transformers pandas scikit-learn torch numpy
!pip install accelerate -U
!pip install transformers[torch]



In [18]:
# import the libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [19]:
#Load & Prepare Email Classification Dataset

import os
import glob

# Path to the data folder
data_path = 'data_filled/'

# Get all Excel files in the folder
excel_files = glob.glob(os.path.join(data_path, '*.xlsx'))

# Initialize an empty list to store dataframes
dfs = []

# Read each Excel file and append to the list
for file in excel_files:
    df = pd.read_excel(file)
    dfs.append(df)

# Combine all dataframes into one
combined_df = pd.concat(dfs, ignore_index=True)

# Display the first few rows to check the data
print(f"Total number of emails: {len(combined_df)}")
combined_df.head()

Total number of emails: 194


Unnamed: 0,Subject,Content,Answer,Category
0,When does course selection open?,I need to plan my schedule for next semester. ...,Course selection for next semester opens on [s...,Academic
1,How do I withdraw from a course?,I want to drop a course this semester. Could y...,"To withdraw from a course, you will need to co...",Academic
2,How can I check my course schedule?,I would like to confirm my enrolled courses. W...,You can check your course schedule on the univ...,Academic
3,How can I know my course credits?,I need to check how many credits I have comple...,You can check your course credits by viewing y...,Academic
4,Where can I find my class timetable?,I want to see my class schedule for this semes...,You can access your class timetable for this s...,Academic


In [20]:
# Clean and preprocess the data


# Check for missing values
print("Missing values in each column:")
print(combined_df.isnull().sum())

# Drop rows with missing values in important columns
clean_df = combined_df.dropna(subset=['Subject', 'Content', 'Category'])

# Combine Subject and Content for classification
clean_df['Text'] = clean_df['Subject'] + " " + clean_df['Content']

# Check class distribution
print("\nClass distribution:")
print(clean_df['Category'].value_counts())

# Keep only the columns we need
clean_df = clean_df[['Text', 'Category']]

Missing values in each column:
Subject     0
Content     0
Answer      1
Category    0
dtype: int64

Class distribution:
Category
FAQ            127
Academic        47
work permit     20
Name: count, dtype: int64


In [21]:
# Encode the labels

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the categories
clean_df['encoded_category'] = label_encoder.fit_transform(clean_df['Category'])

# Display mapping between categories and encoded values
for i, category in enumerate(label_encoder.classes_):
    print(f"Category: {category} -> Encoded value: {i}")

# Save the label encoder for later use
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

Category: Academic -> Encoded value: 0
Category: FAQ -> Encoded value: 1
Category: work permit -> Encoded value: 2


In [None]:
# Split the data into training and testing sets

# Split the data into train and test sets (80% train, 20% test)
train_df, test_df = train_test_split(
    clean_df,
    test_size=0.2,
    random_state=42,
    stratify=clean_df['encoded_category']  # Ensure the class distribution is maintained
)

print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

Training set size: 155
Testing set size: 39


In [24]:
# Convert DataFrames to HuggingFace Dataset format

# Convert pandas DataFrames to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Check the dataset structure
print("Training dataset:", train_dataset)
print("Test dataset:", test_dataset)

Training dataset: Dataset({
    features: ['Text', 'Category', 'encoded_category', '__index_level_0__'],
    num_rows: 155
})
Test dataset: Dataset({
    features: ['Text', 'Category', 'encoded_category', '__index_level_0__'],
    num_rows: 39
})


In [25]:
# Initialize the DistilBERT tokenizer

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Example tokenization to verify it works
sample_text = train_df['Text'].iloc[0]
tokens = tokenizer(sample_text, padding='max_length', truncation=True, max_length=512)
print(f"Example tokenization:\nInput text: {sample_text[:50]}...\nTokens: {list(tokens.keys())}")

Example tokenization:
Input text: Are financial proofs mandatory? Dear [Recipient],
...
Tokens: ['input_ids', 'attention_mask']


In [26]:
# Tokenize the datasets

# Define tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples['Text'],
        padding='max_length',
        truncation=True,
        max_length=512
    )

# Apply tokenization to datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
tokenized_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'encoded_category'])
tokenized_test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'encoded_category'])

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [None]:
# Initialize the DistilBERT model for sequence classification

# Get the number of unique categories
num_labels = len(label_encoder.classes_)

# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)

print(f"Model initialized with {num_labels} output classes")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized with 3 output classes


In [30]:
# Define training arguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save model checkpoints
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,                # Log every X steps
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",           # Save model after each epoch
    load_best_model_at_end=True,     # Load the best model at the end of training
    metric_for_best_model="accuracy" # Metric to use for best model
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`