In [None]:
!pip install transformers

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset

In [3]:
data = pd.read_csv('training_data_english.csv')
texts = data['Text'].tolist()
labels = data['Category'].astype('category').cat.codes.tolist()

In [4]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data['Category'].unique()))


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


In [7]:
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device) 
optimizer = AdamW(model.parameters(), lr=1e-5)



In [9]:
for epoch in range(3):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [10]:
# Save the model
model.save_pretrained('saved_model')

In [11]:
model.eval()
correct = 0
total = 0

In [12]:
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        total += labels.size(0)
        correct += (predicted_labels == labels).sum().item()

In [13]:
accuracy = correct / total
print(f'Validation Accuracy: {accuracy:.2f}')

Validation Accuracy: 0.59


In [None]:
loaded_model = BertForSequenceClassification.from_pretrained('saved_model')
loaded_model.to(device)  # Ensure the loaded model is on the same device

In [40]:
user_input = input("Enter a text: ")

Enter a text: Stay informed about the latest developments in the world of finance and economics. Our team of experts analyzes market trends, investment strategies, and economic indicators to provide you with valuable insights. From personal finance advice to global economic outlooks, our blog equips you with the knowledge to make informed financial decisions.


In [41]:
user_input_encodings = tokenizer(user_input, truncation=True, padding=True, return_tensors='pt')
user_input_ids = user_input_encodings['input_ids'].to(device)
user_input_attention_mask = user_input_encodings['attention_mask'].to(device)

In [44]:
with torch.no_grad():
    loaded_model.eval()
    outputs = loaded_model(input_ids=user_input_ids, attention_mask=user_input_attention_mask)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    predicted_probabilities = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]


In [45]:
predicted_category = data['Category'][predicted_label]

In [46]:
print(f'Predicted Category: {predicted_category}')
print('Predicted Probabilities:')
seen_categories = set()
for category, probability in zip(data['Category'], predicted_probabilities):
    if category not in seen_categories:
        print(f'{category}: {probability:.4f}')
        seen_categories.add(category)

Predicted Category: other
Predicted Probabilities:
blog: 0.1686
news: 0.0738
other: 0.3404
