In [None]:
#Step 7
#English
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load cleaned English sentiment dataset
english_df = pd.read_csv('IMDB Dataset.csv')

# Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

english_df['review'] = english_df['review'].apply(clean_text)
english_df = english_df.rename(columns={'review': 'text', 'sentiment': 'label'})
english_df['label'] = english_df['label'].map({'positive': 1, 'negative': 0})

# Check
print(english_df.head())

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'label': torch.tensor(label)}

# Create Dataset
dataset = SentimentDataset(english_df['text'].tolist(), english_df['label'].tolist())

# Split train-test
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Evaluation Function
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

    return accuracy, precision, recall, f1

# Training Loop
num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Evaluate after each epoch
    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f"\nEvaluation after Epoch {epoch+1}:")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1 Score: {f1:.4f}\n")

print("Training completed ‚úÖ")

# Save the model
save_path = './sentiment_model'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved at {save_path} ‚úÖ")

# -------------------
# Prediction on new sentences
def predict_sentiment(sentences):
    model.eval()
    inputs = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    preds = torch.argmax(probs, dim=1)
    return preds.cpu().numpy()

# Test Predictions
test_sentences = [
    "I absolutely loved the movie! It was fantastic.",
    "The film was too slow and boring.",
    "Amazing acting and a gripping story!",
    "I didn't enjoy the movie at all."
]

predictions = predict_sentiment(test_sentences)

for sentence, pred in zip(test_sentences, predictions):
    sentiment = "Positive" if pred == 1 else "Negative"
    print(f"Sentence: {sentence}\nPredicted Sentiment: {sentiment}\n")

In [None]:
#English testing
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import re
import random

# Load model and tokenizer
model_path = './sentiment_model'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Load and preprocess dataset
df = pd.read_csv('IMDB Dataset.csv')
df['review'] = df['review'].apply(clean_text)
df = df.rename(columns={'review': 'text', 'sentiment': 'label'})
df['label'] = df['label'].map({'positive': 1, 'negative': 0})

# Select 20 random samples
samples = df.sample(n=20, random_state=42)

# Prediction function
def predict_sentiment(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    preds = torch.argmax(probs, dim=1)
    return preds.cpu().numpy()

# Predict and print
texts = samples['text'].tolist()
true_labels = samples['label'].tolist()
predictions = predict_sentiment(texts)

for i in range(20):
    predicted = "Positive" if predictions[i] == 1 else "Negative"
    actual = "Positive" if true_labels[i] == 1 else "Negative"
    print(f"Sample {i+1}:")
    print(f"Text: {texts[i]}")
    print(f"Predicted Sentiment: {predicted} | Actual Sentiment: {actual}\n")


In [None]:
#Spanish
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset

# Load the dataset (which only has the 'test' split)
dataset = load_dataset("TheFinAI/flare-es-tsa")

# Convert to Pandas DataFrame (use 'test' split)
spanish_df = dataset['test'].to_pandas()

# Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z√°√©√≠√≥√∫√º√±√Å√â√ç√ì√ö√ú√ë0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

spanish_df['text'] = spanish_df['text'].apply(clean_text)

spanish_df = spanish_df.rename(columns={'answer': 'label'}) 
# Map sentiment strings to integers
label_mapping = {'negativo': 0, 'neutral': 1, 'positivo': 2}
spanish_df['label'] = spanish_df['label'].map(label_mapping)

# Check
print(spanish_df.head())

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'label': torch.tensor(label)}

# Split dataset into train and test (80% train, 20% test)
train_size = int(0.8 * len(spanish_df))
test_size = len(spanish_df) - train_size
train_df = spanish_df[:train_size]
test_df = spanish_df[train_size:]

# Create Dataset objects
train_dataset = SentimentDataset(train_df['text'].tolist(), train_df['label'].tolist())
test_dataset = SentimentDataset(test_df['text'].tolist(), test_df['label'].tolist())

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load multilingual BERT model for 3 classes
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Evaluation function (macro average for multiclass)
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')

    return accuracy, precision, recall, f1

# Training Loop
num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Evaluate after each epoch
    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f"\nEvaluation after Epoch {epoch+1}:")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1 Score: {f1:.4f}\n")

print("Training completed ‚úÖ")

# Save the model
save_path = './spanish_sentiment_model'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved at {save_path} ‚úÖ")

# -------------------
# Prediction on new Spanish sentences
def predict_sentiment(sentences):
    model.eval()
    inputs = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    preds = torch.argmax(probs, dim=1)
    return preds.cpu().numpy()

# Example Spanish sentences
test_sentences = [
    "El mercado burs√°til cerr√≥ con una subida inesperada.",
    "La empresa anunci√≥ p√©rdidas trimestrales significativas.",
    "Los resultados fueron neutrales, sin grandes sorpresas."
]

predictions = predict_sentiment(test_sentences)

label_reverse_map = {0: "Negativo", 1: "Neutral", 2: "Positivo"}

for sentence, pred in zip(test_sentences, predictions):
    sentiment = label_reverse_map[pred]
    print(f"Sentence: {sentence}\nPredicted Sentiment: {sentiment}\n")

In [3]:
#Spanish Testing 
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load saved model and tokenizer
model_path = './spanish_sentiment_model'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Prediction function
def predict_sentiment(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    return pred

# Reverse map for labels
label_reverse_map = {0: "Negativo", 1: "Neutral", 2: "Positivo"}

# Input sentence from user
user_input = input("Introduce una oraci√≥n en espa√±ol para analizar el sentimiento: ")
prediction = predict_sentiment(user_input)
print(f"Sentimiento predicho: {label_reverse_map[prediction]}")


Sentimiento predicho: Negativo


In [None]:
#French
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import numpy as np
import os

# Load the allocine dataset
dataset = load_dataset("allocine")
train_data = dataset["train"]
test_data = dataset["test"]

# Convert to pandas DataFrame for easier processing
train_df = pd.DataFrame({"text": train_data["review"], "label": train_data["label"]})
test_df = pd.DataFrame({"text": test_data["review"], "label": test_data["label"]})

# Preprocessing function for French text
def clean_text(text):
    text = text.lower()
    # Keep French accents and special characters
    text = re.sub(r"[^a-zA-Z√Ä-√ø0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning
train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

# Create balanced subset (25k positive + 25k negative = 50k total)
positive_samples = train_df[train_df['label'] == 1].sample(n=25000, random_state=42)
negative_samples = train_df[train_df['label'] == 0].sample(n=25000, random_state=42)
balanced_train_df = pd.concat([positive_samples, negative_samples]).sample(frac=1, random_state=42)  # Shuffle

print(f"Final training set size: {len(balanced_train_df)}")
print(balanced_train_df['label'].value_counts())

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Tokenizer and model (using multilingual BERT)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'label': torch.tensor(label)}

# Create Datasets
train_dataset = SentimentDataset(balanced_train_df['text'].tolist(), balanced_train_df['label'].tolist())
test_dataset = SentimentDataset(test_df['text'].tolist(), test_df['label'].tolist())

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Evaluation Function
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

    return accuracy, precision, recall, f1

# Training Loop
num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Evaluate after each epoch
    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f"\nEvaluation after Epoch {epoch+1}:")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1 Score: {f1:.4f}\n")

print("Training completed ‚úÖ")

# -------------------
# Prediction on first 20 test samples
def predict_sentiment(texts):
    model.eval()
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1)
    return preds.cpu().numpy()

# Get first 20 test samples
test_samples = test_df.head(20)
test_texts = test_samples['text'].tolist()
true_labels = test_samples['label'].tolist()

# Make predictions
predictions = predict_sentiment(test_texts)

# Print results
print("\nPredictions for first 20 test samples:")
print("-" * 60)
for i, (text, pred, true) in enumerate(zip(test_texts, predictions, true_labels), 1):
    print(f"Sample {i}:")
    print(f"Text: {text[:150]}...")  # Print first 150 chars
    print(f"Predicted: {pred} | Actual: {true}")
    print("-" * 60)

# -------------------
# Save the model to french_model folder
model_dir = "french_model"
os.makedirs(model_dir, exist_ok=True)  # Create directory if it doesn't exist

# Save model and tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

print(f"\nModel and tokenizer saved to: {model_dir}")

# Verify the saved model can be loaded
print("\nVerifying saved model can be loaded...")
loaded_model = BertForSequenceClassification.from_pretrained(model_dir).to(device)
loaded_tokenizer = BertTokenizer.from_pretrained(model_dir)

# Test prediction with loaded model
test_text = "Ce film √©tait incroyable, je l'ai ador√©!"
inputs = loaded_tokenizer(test_text, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = loaded_model(**inputs)
prediction = torch.argmax(outputs.logits).item()
print(f"\nTest prediction with loaded model on text: '{test_text}'")
print(f"Predicted sentiment: {'Positive' if prediction == 1 else 'Negative'}")

In [1]:
#French testing
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re
from datasets import load_dataset

# Preprocessing function (same as before)
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z√Ä-√ø0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Load test data from allocine
dataset = load_dataset("allocine")
test_data = dataset["test"]

# Create DataFrame and clean
test_df = pd.DataFrame({
    "text": test_data["review"],
    "label": test_data["label"]
})
test_df['text'] = test_df['text'].apply(clean_text)

# Use first 20 samples
test_samples = test_df.head(20)
test_texts = test_samples['text'].tolist()
true_labels = test_samples['label'].tolist()

# Load model and tokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained("french_model").to(device)
tokenizer = BertTokenizer.from_pretrained("french_model")

# Prediction function
def predict_sentiment(texts):
    model.eval()
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1)
    return preds.cpu().numpy()

# Run predictions
predictions = predict_sentiment(test_texts)

# Display results
print("\nPredictions for first 20 test samples using the saved model:")
print("-" * 60)
for i, (text, pred, true) in enumerate(zip(test_texts, predictions, true_labels), 1):
    print(f"Sample {i}:")
    print(f"Text: {text[:150]}...")  # Print truncated text
    print(f"Predicted: {'Positive' if pred == 1 else 'Negative'} | Actual: {'Positive' if true == 1 else 'Negative'}")
    print("-" * 60)


README.md:   0%|          | 0.00/9.31k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/60.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.58M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/7.58M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20000 [00:00<?, ? examples/s]


Predictions for first 20 test samples using the saved model:
------------------------------------------------------------
Sample 1:
Text: magnifique √©pop√©e une belle histoire touchante avec des acteurs qui interpr√®tent tr√®s bien leur r√¥les mel gibson heath ledger jason isaacs le genre de...
Predicted: Positive | Actual: Positive
------------------------------------------------------------
Sample 2:
Text: je nai pas aim√© mais pourtant je lui mets 2 √©toiles car lexp√©rience est louable rien de conventionnel ici une visite et mais jonch√©e did√©es originales...
Predicted: Negative | Actual: Negative
------------------------------------------------------------
Sample 3:
Text: un dessin anim√© qui brille par sa f√©erie et ses chansons...
Predicted: Positive | Actual: Positive
------------------------------------------------------------
Sample 4:
Text: si cest l√† le renouveau du cin√©ma fran√ßais cest tout de m√™me foutrement chiant si lobjet est tr√®s stylis√© et la tension palpabl

In [1]:
import pandas as pd
import re
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
from sklearn.model_selection import train_test_split

# Load data (already done in your case)
df = pd.read_excel("C:/Users/ushni/OneDrive/Desktop/NLP_CODES/hindidata.ods", engine="odf", header=None)
df.columns = ['text', 'label']

# Filter Hindi text only
def extract_hindi(text):
    return " ".join(re.findall(r'[\u0900-\u097F]+', str(text)))

df['text'] = df['text'].apply(extract_hindi)

# Label encoding
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['label'].map(label_map)

# Drop rows with missing values
df = df.dropna()

# Show label distribution
print("Label distribution:\n", df['label'].value_counts())

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

# Convert to HuggingFace dataset format
train_dataset = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
val_dataset = Dataset.from_dict({'text': val_texts.tolist(), 'label': val_labels.tolist()})

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

# Evaluation metrics
def compute_metrics(p):
    preds = torch.argmax(torch.tensor(p.predictions), axis=1)
    labels = torch.tensor(p.label_ids)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
        'precision': precision_score(labels, preds, average='weighted'),
        'recall': recall_score(labels, preds, average='weighted')
    }

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Train
trainer.train()
trainer.save_model("hindi-sentiment-bert")


KeyboardInterrupt: 

In [None]:
# -------------------------------
# Fine-Tune Hindi-Trained mBERT on Bengali Sentiment Data
# -------------------------------

from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch

# Step 1: Load Bengali sentiment dataset
bengali_dataset = load_dataset("Akash190104/bengali_sentiment_analysis")['train']

# Step 2: Load the same tokenizer used during Hindi training
tokenizer = BertTokenizer.from_pretrained("hindi_model")

# Step 3: Tokenize Bengali text
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

bengali_dataset = bengali_dataset.map(tokenize, batched=True)
bengali_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Optional: Split into train/eval (recommended)
dataset_split = bengali_dataset.train_test_split(test_size=0.2, seed=42)
bengali_train = dataset_split['train']
bengali_eval = dataset_split['test']

# Step 4: Load the Hindi-trained model
model = BertForSequenceClassification.from_pretrained("hindi_model")

# Step 5: Define evaluation metrics
def compute_metrics(p):
    preds = torch.argmax(torch.tensor(p.predictions), axis=1)
    labels = torch.tensor(p.label_ids)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
        'precision': precision_score(labels, preds, average='weighted'),
        'recall': recall_score(labels, preds, average='weighted')
    }

# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir='./bengali-finetuned-mbert',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

# Step 7: Fine-tune the model on Bengali data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=bengali_train,
    eval_dataset=bengali_eval,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

# Step 8: Save the fine-tuned Bengali model
trainer.save_model("bengali-sentiment-mbert")
tokenizer.save_pretrained("bengali-sentiment-mbert")


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [9]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import re
import os

# Load Afrikaans dataset
afrikaans_df = pd.read_csv(r"T:\nlp\afrikaans_sentiment_75.csv")  # Replace with the actual file path

# Map sentiment labels to integers
label_map = {"positive": 1, "negative": 0}
afrikaans_df["label"] = afrikaans_df["sentiment"].map(label_map)

# Clean Afrikaans text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z√Ä-√ø0-9\s]", "", text)  # Preserve accents and alphanumeric
    text = re.sub(r"\s+", " ", text).strip()
    return text

afrikaans_df["text"] = afrikaans_df["text"].apply(clean_text)

# Tokenizer and Model: Load from saved French multilingual BERT
model_dir = "spanish_sentiment_model"
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Custom Dataset
class AfrikaansDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx])
        }

# Split into train/test
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    afrikaans_df["text"].tolist(), afrikaans_df["label"].tolist(), test_size=0.2, random_state=42)

train_dataset = AfrikaansDataset(train_texts, train_labels)
test_dataset = AfrikaansDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Evaluation
def evaluate(model, dataloader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            label = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred = torch.argmax(outputs.logits, dim=1)

            preds.extend(pred.cpu().numpy())
            labels.extend(label.cpu().numpy())

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return acc, prec, rec, f1

# Training Loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f"\nEpoch {epoch+1} Evaluation: Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}\n")

# Save the fine-tuned Afrikaans model
afrikaans_model_dir = "afrikaans_finetuned_model"
model.save_pretrained(afrikaans_model_dir)
tokenizer.save_pretrained(afrikaans_model_dir)
print(f"Afrikaans model saved to: {afrikaans_model_dir}")

Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [05:54<00:00, 44.35s/it, loss=0.0947]



Epoch 1 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000



Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [01:27<00:00, 10.93s/it, loss=0.0207]



Epoch 2 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000



Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [01:25<00:00, 10.73s/it, loss=0.0156]



Epoch 3 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000

Afrikaans model saved to: afrikaans_finetuned_model


In [10]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned Afrikaans model and tokenizer
model_dir = "afrikaans_finetuned_model"
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to clean and predict sentiment
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z√Ä-√ø0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def predict_sentiment(sentence):
    cleaned_sentence = clean_text(sentence)
    encoding = tokenizer(cleaned_sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    sentiment = "Positive" if predicted_class == 1 else "Negative"
    return sentiment

# Example usage
example_sentence = "Hoy es un d√≠a muy lindo. El clima es muy agradable."  # Sample Afrikaans sentence
predicted_sentiment = predict_sentiment(example_sentence)
print(f"Sentence: {example_sentence}")
print(f"Predicted Sentiment: {predicted_sentiment}")


Sentence: Hoy es un d√≠a muy lindo. El clima es muy agradable.
Predicted Sentiment: Positive


In [None]:
# --- IMPORTS ---
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re

# --- Load English Sentiment Model ---
def load_english_model():
    model_path = './sentiment_model'
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device

english_model, english_tokenizer, english_device = load_english_model()

def clean_english_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# --- Load Spanish Model ---
def load_spanish_model():
    model_path = './spanish_sentiment_model'
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device

spanish_model, spanish_tokenizer, spanish_device = load_spanish_model()
spanish_label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}


# --- Load French Model ---
def load_french_model():
    model_path = './french_model'
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device

french_model, french_tokenizer, french_device = load_french_model()

def clean_french_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z√Ä-√ø0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# --- Load Hindi Model ---
def load_hindi_model():
    model_path = './hindi_model'
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device

hindi_model, hindi_tokenizer, hindi_device = load_hindi_model()
hindi_label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

def extract_hindi(text):
    return " ".join(re.findall(r'[\u0900-\u097F]+', str(text)))


# --- Load Bengali Model ---
def load_bengali_model():
    model_path = './bengali-sentiment-mbert'
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device

bengali_model, bengali_tokenizer, bengali_device = load_bengali_model()
bengali_label_map = {0: "Negative", 1: "Positive"}

def extract_bengali(text):
    return " ".join(re.findall(r'[\u0980-\u09FF]+', str(text)))


# --- Load Afrikaans Model ---
def load_afrikaans_model():
    model_path = './afrikaans_finetuned_model'  # using Spanish base model
    tokenizer = BertTokenizer.from_pretrained(model_path)
    
    # Load base model without classifier mismatch issues
    model = BertForSequenceClassification.from_pretrained(model_path)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device


afrikaans_model, afrikaans_tokenizer, afrikaans_device = load_afrikaans_model()

def clean_afrikaans_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z√Ä-√ø0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# --- Load Urdu Model ---
def load_urdu_model():
    model_path = './urdu_finetuned_model'
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device

urdu_model, urdu_tokenizer, urdu_device = load_urdu_model()

def extract_urdu(text):
    return " ".join(re.findall(r'[\u0600-\u06FF]+', str(text)))


# --- Load Malay Model ---
def load_malay_model():
    model_path = './malay_finetuned_model'
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    return model, tokenizer, device

malay_model, malay_tokenizer, malay_device = load_malay_model()

def clean_malay_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# --- Unified Sentiment Prediction Function ---
def predict_sentiment_language_specific(sentence, lang):
    lang = lang.lower()

    if lang == 'en':
        cleaned = clean_english_text(sentence)
        inputs = english_tokenizer(cleaned, return_tensors='pt', padding=True, truncation=True, max_length=128).to(english_device)
        with torch.no_grad():
            outputs = english_model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        sentiment = "Positive" if pred == 1 else "Negative"
        print(f"üß† Sentiment Prediction (English): {sentiment}")
        return sentiment

    elif lang == 'es':
        inputs = spanish_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128).to(spanish_device)
        with torch.no_grad():
            outputs = spanish_model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        sentiment = spanish_label_map[pred]
        print(f"üß† Sentiment Prediction (Spanish): {sentiment}")
        return sentiment

    elif lang == 'fr':
        cleaned = clean_french_text(sentence)
        inputs = french_tokenizer(cleaned, return_tensors='pt', padding=True, truncation=True, max_length=128).to(french_device)
        with torch.no_grad():
            outputs = french_model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        sentiment = "Positive" if pred == 1 else "Negative"
        print(f"üß† Sentiment Prediction (French): {sentiment}")
        return sentiment

    elif lang == 'hi' or 'or':
        cleaned = extract_hindi(sentence)
        inputs = hindi_tokenizer(cleaned, return_tensors='pt', padding=True, truncation=True, max_length=128).to(hindi_device)
        with torch.no_grad():
            outputs = hindi_model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        sentiment = hindi_label_map[pred]
        print(f"üß† Sentiment Prediction (Hindi): {sentiment}")
        return sentiment

    elif lang == 'bn':
        cleaned = extract_bengali(sentence)
        inputs = bengali_tokenizer(cleaned, return_tensors='pt', padding=True, truncation=True, max_length=128).to(bengali_device)
        with torch.no_grad():
            outputs = bengali_model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        sentiment = bengali_label_map.get(pred, "Positive")
        print(f"üß† Sentiment Prediction (Bengali): {sentiment}")
        return sentiment

    elif lang == 'af':
        cleaned_sentence = clean_afrikaans_text(sentence)
        encoding = afrikaans_tokenizer(cleaned_sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()

        sentiment = "Positive" if predicted_class == 1 else "Negative"
        return sentiment

    elif lang == 'ur':
        cleaned = extract_urdu(sentence)
        inputs = urdu_tokenizer(cleaned, return_tensors='pt', padding=True, truncation=True, max_length=128).to(urdu_device)
        with torch.no_grad():
            outputs = urdu_model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        sentiment = "Positive" if pred == 1 else "Negative"
        print(f"üß† Sentiment Prediction (Urdu): {sentiment}")
        return sentiment

    elif lang == 'ms':
        cleaned = clean_malay_text(sentence)
        inputs = malay_tokenizer(cleaned, return_tensors='pt', padding=True, truncation=True, max_length=128).to(malay_device)
        with torch.no_grad():
            outputs = malay_model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        sentiment = "Positive" if pred == 1 else "Negative"
        print(f"üß† Sentiment Prediction (Malay): {sentiment}")
        return sentiment

    else:
        print(f"‚ö†Ô∏è Language '{lang}' not supported for sentiment analysis.")
        return None



predict_sentiment_language_specific(model_input, lang)

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import re
import os

# Load Urdu CSV (no header)
urdu_df = pd.read_csv("urdu.csv", header=None, names=["text", "label"])

# Map P/N to 1/0
urdu_df["label"] = urdu_df["label"].map({"P": 1, "N": 0})

# Clean Urdu text
def clean_urdu(text):
    text = str(text)
    text = re.sub(r"[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

urdu_df["text"] = urdu_df["text"].apply(clean_urdu)

# Train/test split
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(urdu_df, test_size=0.2, random_state=42, stratify=urdu_df["label"])

# Load French fine-tuned model
model_dir = "french_model"
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Freeze all BERT layers (retain previous French learning)
for name, param in model.bert.named_parameters():
    param.requires_grad = False

# Dataset class
class UrduSentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx])
        }

# Create DataLoaders
train_dataset = UrduSentimentDataset(train_df["text"].tolist(), train_df["label"].tolist())
test_dataset = UrduSentimentDataset(test_df["text"].tolist(), test_df["label"].tolist())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer (only train classifier head)
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")
    return acc, prec, rec, f1

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f"\nEvaluation after Epoch {epoch+1}:")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1 Score: {f1:.4f}\n")

# Save fine-tuned Urdu model
urdu_model_dir = "urdu_finetuned_model"
os.makedirs(urdu_model_dir, exist_ok=True)
model.save_pretrained(urdu_model_dir)
tokenizer.save_pretrained(urdu_model_dir)
print(f"Urdu model saved to {urdu_model_dir}")


  0%|          | 0/49 [00:14<?, ?it/s]


KeyboardInterrupt: 

In [4]:
# -------------------------------------
# Fine-Tune Hindi-Trained mBERT on Odia Sentiment Data
# -------------------------------------

import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import torch

# Step 1: Load Odia sentiment dataset from CSV
df = pd.read_csv("odia_sentiment_100.csv")  # Replace with your actual CSV path

# Step 2: Encode 'positive' ‚Üí 1, 'negative' ‚Üí 0
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['sentiment'])  # Now we have a numeric 'label' column

# Optional: Check the encoding
print("Label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Step 3: Convert to HuggingFace Dataset format
dataset = Dataset.from_pandas(df[['text', 'label']])

# Step 4: Load the tokenizer (from your Hindi-trained model)
tokenizer = BertTokenizer.from_pretrained("hindi_model")

# Step 5: Tokenize Odia text
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Step 6: Split into train and eval sets
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

# Step 7: Load the pre-trained Hindi model
model = BertForSequenceClassification.from_pretrained("hindi_model")

# Step 8: Define evaluation metrics
def compute_metrics(p):
    preds = torch.argmax(torch.tensor(p.predictions), axis=1)
    labels = torch.tensor(p.label_ids)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
        'precision': precision_score(labels, preds, average='weighted'),
        'recall': recall_score(labels, preds, average='weighted')
    }

# Step 9: Define training arguments
training_args = TrainingArguments(
    output_dir='./odia-finetuned-mbert',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

# Step 10: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Step 11: Fine-tune the model
trainer.train()

# Step 12: Save final model and tokenizer
trainer.save_model("odia-sentiment-mbert")
tokenizer.save_pretrained("odia-sentiment-mbert")


Label mapping: {'negative': 0, 'positive': 1}


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re

# Load Urdu dataset
urdu_df = pd.read_csv("urdu_sentiment_100.csv", encoding="utf-8")  # Use utf-8 or try encoding="ISO-8859-1" if needed

# Map sentiment labels to integers
label_map = {"positive": 1, "negative": 0}
urdu_df["label"] = urdu_df["sentiment"].map(label_map)

# Clean Urdu text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\s\u0600-\u06FF]", "", text)  # Keep Urdu and digits
    text = re.sub(r"\s+", " ", text).strip()
    return text

urdu_df["text"] = urdu_df["text"].apply(clean_text)

# Load tokenizer and model
model_dir = "french_model"  # French mBERT checkpoint path
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Custom Dataset
class UrduDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx])
        }

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    urdu_df["text"].tolist(), urdu_df["label"].tolist(), test_size=0.2, random_state=42)

train_dataset = UrduDataset(train_texts, train_labels)
test_dataset = UrduDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            label = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred = torch.argmax(outputs.logits, dim=1)

            preds.extend(pred.cpu().numpy())
            labels.extend(label.cpu().numpy())

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return acc, prec, rec, f1

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f"\nEpoch {epoch+1} Evaluation: Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}\n")

# Save fine-tuned Urdu model
urdu_model_dir = "urdu_finetuned_model"
model.save_pretrained(urdu_model_dir)
tokenizer.save_pretrained(urdu_model_dir)
print(f"Urdu model saved to: {urdu_model_dir}")


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [05:02<00:00, 30.23s/it, loss=0.0633]



Epoch 1 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000



Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [01:51<00:00, 11.14s/it, loss=0.00185]



Epoch 2 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000



Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [01:51<00:00, 11.12s/it, loss=0.00149]



Epoch 3 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000

Urdu model saved to: urdu_finetuned_model


In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import re

# Load Malay dataset
malay_df = pd.read_csv("malay_sentiment_75.csv", encoding="utf-8")  # Change filename if needed

# Map sentiment labels
label_map = {"positive": 1, "negative": 0}
malay_df["label"] = malay_df["sentiment"].map(label_map)

# Text preprocessing
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

malay_df["text"] = malay_df["text"].apply(clean_text)

# Load tokenizer and model (French multilingual BERT)
model_dir = "french_model"  # Update if model checkpoint is in another folder
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Custom dataset
class MalayDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx])
        }

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    malay_df["text"].tolist(), malay_df["label"].tolist(), test_size=0.2, random_state=42)

train_dataset = MalayDataset(train_texts, train_labels)
test_dataset = MalayDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            label = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred = torch.argmax(outputs.logits, dim=1)

            preds.extend(pred.cpu().numpy())
            labels.extend(label.cpu().numpy())

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return acc, prec, rec, f1

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f"\nEpoch {epoch+1} Evaluation: Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}\n")

# Save fine-tuned model
malay_model_dir = "malay_finetuned_model"
model.save_pretrained(malay_model_dir)
tokenizer.save_pretrained(malay_model_dir)
print(f"Malay model saved to: {malay_model_dir}")


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [02:42<00:00, 20.37s/it, loss=0.00789]



Epoch 1 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000



Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [01:32<00:00, 11.50s/it, loss=0.00249]



Epoch 2 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000



Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [01:20<00:00, 10.08s/it, loss=0.00188]



Epoch 3 Evaluation: Acc=1.0000, Prec=1.0000, Rec=1.0000, F1=1.0000

Malay model saved to: malay_finetuned_model
