In [1]:
import numpy as np 
import pandas as pd
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the labelled training corpus from disk and preview its first rows.
train_csv_path = 'Language_det_train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,Text,Language
0,στη Γαλλία νωρίτερα ραντεβού χρησιμοποιήθηκε α...,Greek
1,e con ciò lei salì nella sua carrozza e senza ...,Italian
2,buna değmez.,Turkish
3,Viktiga skillnader är att i en wiki lagras sid...,Sweedish
4,تعرف على ما إذا كان شخص ما يقول نكتة رائعة يمك...,Arabic


In [72]:
# Class-balance check: number of samples per language label.
language_counts = df['Language'].value_counts()
language_counts

Language
English       1316
French         963
Spanish        778
Portugeese     702
Italian        663
Russian        657
Sweedish       642
Malayalam      564
Dutch          519
Arabic         509
Turkish        450
German         446
Tamil          446
Danish         407
Kannada        351
Greek          347
Hindi           60
Name: count, dtype: int64

In [3]:
# Replace each language name in the 'Language' column with its integer
# category code so the labels can be used as class ids.
df['Language'] = df['Language'].astype('category').cat.codes

In [17]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of raw texts, accessed by position.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its result
            must expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (flattened
            to 1-D) and ``'label'`` (a scalar int64 tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            # The tokenizer returns a leading batch axis; flatten it away so the
            # DataLoader can stack samples into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force int64: labels may arrive in a narrower integer dtype (e.g.
            # pandas category codes), but nn.CrossEntropyLoss needs long targets.
            'label': torch.tensor(label, dtype=torch.long),
        }

In [18]:
class MultiLingualClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of output classes (size of the logits vector).
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_dim = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits) for a batch.

        The encoder's ``pooler_output`` is passed through dropout and then the
        linear head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.fc(regularized)

## Why CrossEntropyLoss?

- **Cross-entropy** measures the difference between the probability distribution predicted by a classification model and the true label distribution.
- Because this is a multiclass classification problem, cross-entropy loss is the most popular choice of loss function.
- It is also differentiable, so it can be minimized with gradient-based optimization.

In [76]:
def train(model, data_loader, optimizer, device, scheduler=None):
    """Run one training epoch over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Optional learning-rate scheduler; when given, its ``step()``
            is called once after every optimizer update.

    Returns:
        float: Mean cross-entropy loss over the batches of this epoch, or
        ``nan`` if ``data_loader`` yielded no batches.
    """
    model.train()
    # Build the loss module once instead of re-creating it for every batch.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    batch_count = 0
    for batch in data_loader:
        model.zero_grad()
        # Move the inputs to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # CrossEntropyLoss expects int64 class indices.
        labels = batch['label'].to(device).long()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        loss_sum += loss.item()
        batch_count += 1
    if batch_count == 0:
        return float('nan')
    return loss_sum / batch_count

In [77]:
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` on the collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # The predicted class is the index of the largest logit in each row.
            predicted += logits.argmax(dim=1).cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [78]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Predict the class id for ``text`` with ``model``.

    Args:
        text: Input passed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'``
            tensors for ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Length passed to the tokenizer for padding/truncation.

    Returns:
        Tensor holding the index of the highest-scoring class for each row of
        the model output.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    # Inference must run in eval mode; otherwise dropout stays active and the
    # prediction for the same text can change from call to call.
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return preds

In [20]:
# Run configuration: which checkpoint to fine-tune and how to train it.

# Pretrained checkpoint used for both the tokenizer and the encoder.
model_id = 'amberoad/bert-multilingual-passage-reranking-msmarco'
# Size of the classifier's output layer (number of language labels).
num_classes = 17

# Tokenization / batching.
max_length, batch_size = 128, 32

# Optimisation schedule.
num_epochs, learning_rate = 4, 2e-5

In [80]:
# Hold out 20% of the samples for validation. The split is stratified on the
# label because the classes are strongly imbalanced; without it a rare
# language could be under-represented in (or missing from) the validation set.
texts, labels = df['Text'].values, df['Language'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [82]:
# Load the tokenizer that belongs to the chosen checkpoint, wrap both splits
# in datasets, and create their loaders (only the training loader shuffles).
tokenizer = AutoTokenizer.from_pretrained(model_id)

train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = TextClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [21]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build the classifier and move its parameters to the selected device.
model = MultiLingualClassifier(model_id, num_classes)
model = model.to(device)

In [22]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# the defaults of the transformers implementation so the update rule keeps
# the same hyperparameters as before.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# Total number of optimizer updates across all epochs (batches per epoch
# times number of epochs), e.g. for configuring a learning-rate schedule.
total_steps = len(train_dataloader) * num_epochs



NameError: name 'train_dataloader' is not defined

In [None]:
# Fine-tune for the configured number of epochs and report validation
# metrics after each one.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(
        model=model,
        data_loader=train_dataloader,
        optimizer=optimizer,
        device=device,
    )
    accuracy, report = evaluate(
        model=model,
        data_loader=val_dataloader,
        device=device,
    )
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4


KeyboardInterrupt: 

In [128]:
# Lookup from integer class id to language name. The position of each name in
# the list is its class id. Spellings such as "Portugeese" and "Sweedish" are
# kept exactly as they appear in the dataset labels.
id2lang = dict(enumerate([
    "Arabic",      # 0
    "Danish",      # 1
    "Dutch",       # 2
    "English",     # 3
    "French",      # 4
    "German",      # 5
    "Greek",       # 6
    "Hindi",       # 7
    "Italian",     # 8
    "Kannada",     # 9
    "Malayalam",   # 10
    "Portugeese",  # 11
    "Russian",     # 12
    "Spanish",     # 13
    "Sweedish",    # 14
    "Tamil",       # 15
    "Turkish",     # 16
]))

In [None]:
# Smoke-test language prediction on an English sentence.
# predict_sentiment returns a tensor of class ids (despite its name), so the
# id is translated to a language name through id2lang before printing.
# NOTE: `model` must be the nn.Module instance here, not a loaded state dict
# (an OrderedDict is not callable).
test_text = "The movie was great and I really enjoyed the performances of the actors."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
predicted_language = id2lang[int(sentiment.item())]
print(test_text)
print(f"Predicted language: {predicted_language}")

TypeError: 'collections.OrderedDict' object is not callable

In [None]:
# Smoke-test language prediction on an Arabic sentence.
# predict_sentiment returns a tensor of class ids (despite its name), so the
# id is translated to a language name through id2lang before printing.
test_text = "تعرف على ما إذا كان شخص ما يقول نكتة رائعة يمكن"
sentiment = predict_sentiment(test_text, model, tokenizer, device)
predicted_language = id2lang[int(sentiment.item())]
# Echo the text that was actually classified (the previous version printed an
# unrelated English sentence here).
print(test_text)
print(f"Predicted language: {predicted_language}")

Unnamed: 0,Text,Language
4,تعرف على ما إذا كان شخص ما يقول نكتة رائعة يمك...,0
16,هذا سؤال جيد.,0
17,[23] عزا فريق في مركز بالو ألتو للأبحاث هذا ال...,0
25,إنها جيدة بالنسبة لك ، سيكون أفضل بدونك وهذا م...,0
34,"تم تدوين سياسة ويكيبيديا الخاصة بـ ""وجهة نظر م...",0
...,...,...
9784,طلب للحصول على معلومات.,0
9789,للإجابة على سؤال واحد في الاختبار أو واجه الرج...,0
9801,لا يهم مثل لا تقلق إذا أسقطت قهوتك فوقي.,0
9805,من دواعي سروري.,0
