<a href="https://colab.research.google.com/github/ntatfff/bert-demo/blob/main/transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [14]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

In [None]:
# Build the training dataset from the first 50 spam and the first 50 ham emails.
data = 'https://raw.githubusercontent.com/ntatfff/bert-demo/main/spam_ham_dataset.csv'
df_all = pd.read_csv(data)
# Take 50 rows per class so the slices agree with the `df_50_*` variable names
# and with the comment above (head(n) returns fewer rows if a class is smaller).
df_50_spam = df_all[df_all['label'] == 'spam'].head(50)
df_50_ham = df_all[df_all['label'] == 'ham'].head(50)
frames = [df_50_spam, df_50_ham]
# concat keeps the original CSV row labels as the index; spam rows come first.
df_train = pd.concat(frames)

In [None]:
# Show the email stored under index label 0. This is a label-based lookup, not
# "first row": df_train keeps the row labels of the CSV it was sliced from.
df_train.loc[0, 'text']

"Subject: enron methanol ; meter # : 988291\r\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary\r\nflow data provided by daren } .\r\nplease override pop ' s daily volume { presently zero } to reflect daily\r\nactivity you can obtain from gas control .\r\nthis change is needed asap for economics purposes ."

In [16]:
# Assemble the training frame: up to 50 spam emails followed by up to 50 ham
# emails, taken in file order from the public CSV.
data = 'https://raw.githubusercontent.com/ntatfff/bert-demo/main/spam_ham_dataset.csv'
df_all = pd.read_csv(data)
df_50_spam = df_all.loc[df_all['label'].eq('spam')].iloc[:50]
df_50_ham = df_all.loc[df_all['label'].eq('ham')].iloc[:50]
frames = [df_50_spam, df_50_ham]
df_train = pd.concat(frames)

# Step 1: Turn the training frame into two parallel Python lists.
# 'text' is the raw email text and 'label_num' is the numeric class label.
texts = list(df_train['text'])
labels = list(df_train['label_num'])  # 1 for spam, 0 for not spam

# Split the data into training and testing sets (80% / 20%).
# Stratifying on the labels keeps the spam/ham ratio identical in both splits,
# which matters when the test set is only a handful of emails.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Tokenize both splits with the tokenizer that belongs to the DistilBERT checkpoint.
# padding=True pads every text in a split to that split's longest sequence;
# truncation=True cuts texts that exceed the tokenizer's maximum length.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, padding=True, truncation=True)


def _to_tensor_dataset(encodings, targets):
    """Pack token ids, attention masks and labels into a single TensorDataset."""
    fields = (encodings['input_ids'], encodings['attention_mask'], targets)
    return TensorDataset(*(torch.tensor(field) for field in fields))


# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = _to_tensor_dataset(train_encodings, train_labels)
test_dataset = _to_tensor_dataset(test_encodings, test_labels)

# Batches of two; only the training data is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=2, shuffle=False)

# Seed PyTorch's global RNG so the randomly initialised classification head and
# the shuffled batch order are repeatable from run to run (GPU kernels may still
# introduce small nondeterminism).
torch.manual_seed(42)

# Initialize the model: pretrained DistilBERT encoder plus a new 2-class head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define the optimizer. torch.optim.AdamW replaces the deprecated
# transformers.AdamW; eps and weight_decay are set to the deprecated class's
# defaults so the update rule stays the same. No separate loss function is
# needed: the model returns the cross-entropy loss itself when it is called
# with labels.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Training configuration
num_epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()  # switch to training mode at the start of every epoch
    total_loss = 0
    for batch in train_loader:
        # Each batch is an (input_ids, attention_mask, labels) triple. The labels
        # get a batch-local name so the dataset-level `labels` list is not
        # overwritten by a tensor.
        input_ids, attention_mask, batch_labels = [x.to(device) for x in batch]

        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # total_loss is the sum of the per-batch losses over the epoch, not a mean.
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}')

# Evaluation: predict a class for every held-out email and compare with the truth.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():  # inference only, no gradients needed
    for batch in test_loader:
        # Batch-local label name keeps the dataset-level `labels` list intact.
        input_ids, attention_mask, batch_labels = [x.to(device) for x in batch]

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # The predicted class is the index of the largest logit in each row.
        preds = torch.argmax(logits, dim=1).cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(batch_labels.cpu().tolist())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy}')

# Prediction example: classify two messages that are not part of the dataset.
new_texts = ["Special offer! Limited time discount!", "Meeting rescheduled to 3 PM."]
new_encodings = tokenizer(new_texts, truncation=True, padding=True, return_tensors='pt')

# Moving tensors between devices does not involve autograd, so only the
# forward pass needs to run under no_grad.
input_ids = new_encodings['input_ids'].to(device)
attention_mask = new_encodings['attention_mask'].to(device)

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# One row of logits per message; the arg-max over the class axis is the prediction.
logits = outputs.logits
predictions = logits.argmax(dim=1).cpu().numpy()

print("Predictions:", predictions)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 26.99244463443756
Epoch 2/3, Loss: 22.787923336029053
Epoch 3/3, Loss: 10.644275188446045
Accuracy: 1.0
Predictions: [1 0]
