In [None]:
from google.colab import files
uploaded = files.upload()

Saving normalized_text.csv to normalized_text.csv


In [None]:
import pandas as pd
import io

# Assuming the uploaded file is a CSV and its name is stored in uploaded.keys()
# If you uploaded multiple files, you might need to adjust this to select the correct one.
for filename in uploaded.keys():
  df = pd.read_csv(io.StringIO(uploaded[filename].decode('utf-8')))
  print(f"Successfully loaded {filename} into a DataFrame.")
  display(df.head())

Successfully loaded normalized_text.csv into a DataFrame.


Unnamed: 0,tweet_normalized,sentiment
0,உன்னைத்தொட்டால் உன்னுள்ளத்தை நொருக்கமாட்டியோ எ...,Positive
1,நதியா நதியா நயில் நதியா இடை தான் கொடியா கொடி ம...,Positive
2,உறக்கம் விற்று கனவுகள் வாங்கலையா கனவுகள் விற்ற...,Negative
3,மீண்டும் உன்னை காணும் மனமே வேண்டும் எனக்கே மனம...,Negative
4,உயிரை தொலைத்தேன் அது உன்னில் தானோ இது நான் காண...,Negative


In [None]:
df.rename(columns={'tweet_normalized': 'text'}, inplace=True)

In [None]:
df.head()

Unnamed: 0,text,sentiment
0,உன்னைத்தொட்டால் உன்னுள்ளத்தை நொருக்கமாட்டியோ எ...,Positive
1,நதியா நதியா நயில் நதியா இடை தான் கொடியா கொடி ம...,Positive
2,உறக்கம் விற்று கனவுகள் வாங்கலையா கனவுகள் விற்ற...,Negative
3,மீண்டும் உன்னை காணும் மனமே வேண்டும் எனக்கே மனம...,Negative
4,உயிரை தொலைத்தேன் அது உன்னில் தானோ இது நான் காண...,Negative


In [None]:
!pip install transformers datasets torch accelerate




In [2]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"GPU Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")

Using device: cuda
GPU Name: Tesla T4


In [None]:
df = pd.read_csv('normalized_text.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Sample data:")
print(df.head())
print(f"\nSentiment distribution:")
print(df['sentiment'].value_counts())

In [None]:
label_mapping = {'positive': 1, 'negative': 0, 'neutral': 2}

if df['sentiment'].dtype == 'object':
    df['labels'] = df['sentiment'].map(label_mapping)
    df['labels'] = df['labels'].fillna(0)
else:
    df['labels'] = df['sentiment']

df['labels'] = df['labels'].astype(int)

original_size = len(df)
df = df.dropna(subset=['text', 'labels'])
print(f"Original size: {original_size}, After cleaning: {len(df)}")
print(f"Label distribution after mapping:")
print(df['labels'].value_counts().sort_index())

In [None]:
train_texts = df['text'].tolist()
train_labels = df['labels'].tolist()

print(f"TRAINING ON ENTIRE DATASET")
print(f"Total records for training: {len(train_texts)}")
print(f"Label distribution:")

unique_labels = sorted(set(train_labels))
for label in unique_labels:
    count = train_labels.count(label)
    label_name = [k for k, v in label_mapping.items() if v == label]
    label_name = label_name[0] if label_name else f"unknown_{label}"
    print(f"  {label_name} ({label}): {count} samples")

print(f"\n Will use ALL {len(train_texts)} records for training")
print(" No train/test split - maximizing training data")

In [1]:
model_name = "xlm-roberta-base"
print(f" Loading {model_name} for Tamil text...")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    problem_type="single_label_classification"
)

# Move model to GPU if available
model.to(device)
print(f" Model loaded and moved to {device}")
print(f" Model parameters: {model.num_parameters():,}")
print(f"  Number of labels: {len(unique_labels)}")

🤖 Loading xlm-roberta-base for Tamil text...


NameError: name 'AutoTokenizer' is not defined

In [None]:
def tokenize_function(examples):
    """Tokenize Tamil text with XLM-RoBERTa tokenizer"""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=512
    )

print(" Creating dataset from entire data...")

# Create single training dataset from ALL data
train_dataset = Dataset.from_dict({
    'text': train_texts,
    'labels': train_labels
})

print(f" Tokenizing {len(train_dataset)} Tamil text samples...")

# Apply tokenization
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    desc="Tokenizing texts"
)

# Set format for PyTorch
train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels']
)

print(" Tokenization complete!")
print(f" Training dataset: {len(train_dataset)} samples")
print(f" Using 100% of available data")

# Verify tokenization
sample = train_dataset[0]
print(f"\n🔍 Sample verification:")
print(f"  Input IDs shape: {sample['input_ids'].shape}")
print(f"  Attention mask shape: {sample['attention_mask'].shape}")
print(f"  Label: {sample['labels']}")


In [None]:
print(" Setting up training configuration...")
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,                    # Single epoch as specified
    per_device_train_batch_size=8,         # Small batch size for T4 GPU
    learning_rate=2e-5,                    # Standard learning rate
    weight_decay=0.01,                     # Regularization
    logging_steps=10,                      # Frequent logging
    save_strategy="epoch",                 # Save after epoch
    fp16=True,                            # Mixed precision for T4 GPU
    dataloader_drop_last=False,           # Use ALL data - no dropping
    dataloader_pin_memory=False,          # Memory optimization
    remove_unused_columns=False,          # Keep all columns
    report_to=None,                       # Disable external logging
    push_to_hub=False,                    # Don't push to hub
    disable_tqdm=False,                   # Show progress bars
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

total_samples = len(train_dataset)
batch_size = training_args.per_device_train_batch_size
total_steps = (total_samples + batch_size - 1) // batch_size  # Ceiling division

print(" Training setup complete!")
print(f" Training statistics:")
print(f"  Total samples: {total_samples}")
print(f"  Batch size: {batch_size}")
print(f"  Total training steps: {total_steps}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Drop last batch: {training_args.dataloader_drop_last}")
print(f"  ALL {total_samples} samples will be used!")

In [None]:
training_output = trainer.train()

print("="*60)
print(" TRAINING COMPLETED SUCCESSFULLY!")
print(f"Model trained on ALL {len(train_dataset)} samples")
print(f" Training loss: {training_output.training_loss:.4f}")
print(f"Training time: {training_output.metrics.get('train_runtime', 'N/A'):.2f} seconds")

In [None]:
print(" TRAINING SUMMARY:")
print("="*50)
print(f" Dataset: normalized_text.csv")
print(f" Model: xlm-roberta-base")
print(f" Total samples trained: {len(train_dataset)}")
print(f" Data utilization: 100% (no train/test split)")
print(f" Epochs completed: {training_args.num_train_epochs}")
print(f" Final training loss: {training_output.training_loss:.4f}")
print(" Model ready for Tamil sentiment prediction!")


In [None]:
def predict_tamil_sentiment(text):
    """
    Predict sentiment for Tamil text using trained model
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )

    # Move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()

    # Map back to sentiment labels
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    predicted_sentiment = reverse_label_mapping.get(predicted_class, f'label_{predicted_class}')

    return predicted_sentiment, confidence

# Test with Tamil sentences
test_sentences = [
    "இது மிகவும் அருமையான திரைப்படம்",      # "This is a very wonderful movie"
    "இந்த உணவு மிகவும் கெட்டது",               # "This food is very bad"
    "நல்ல நாள்",                                # "Good day"
    "இது சாதாரணமான படம்",                      # "This is an average movie"
    "மிகவும் மோசமான அனுபவம்",                  # "Very bad experience"
    "அற்புதமான சேவை",                          # "Wonderful service"
]

print(" TESTING TRAINED MODEL:")
print("="*50)

for i, sentence in enumerate(test_sentences, 1):
    predicted_sentiment, confidence = predict_tamil_sentiment(sentence)
    print(f"{i}. Tamil: {sentence}")
    print(f"   Prediction: {predicted_sentiment.upper()}")
    print(f"   Confidence: {confidence:.4f} ({confidence*100:.1f}%)")
    print("-" * 40)


In [None]:
print(" SAVING TRAINED MODEL...")

# Save model and tokenizer
save_directory = './tamil_sentiment_xlm_roberta'
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Save label mapping for future use
import json
with open(f'{save_directory}/label_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(label_mapping, f, ensure_ascii=False, indent=2)

print(" MODEL SAVED SUCCESSFULLY!")
print(f" Location: {save_directory}")
print(f" Files saved:")
print(f"   - config.json")
print(f"   - pytorch_model.bin")
print(f"   - tokenizer.json")
print(f"   - tokenizer_config.json")
print(f"   - label_mapping.json")

# =============================================================================
# CELL 13: Final Verification and Usage Instructions
# =============================================================================
print("🎯 FINAL TRAINING VERIFICATION:")
print("="*60)
print(f" Original CSV records: {original_size}")
print(f" Records after cleaning: {len(df)}")
print(f" Records used in training: {len(train_dataset)}")
print(f" Training efficiency: 100% (entire dataset)")
print(f" Data wasted: 0 records (no train/test split)")
print(f" Model: XLM-RoBERTa Base fine-tuned for Tamil sentiment")

print(f"\n HOW TO USE THE TRAINED MODEL:")
print("="*40)
print("# Load the model")
print("from transformers import AutoTokenizer, AutoModelForSequenceClassification")
print("import torch")
print("")
print(f"tokenizer = AutoTokenizer.from_pretrained('{save_directory}')")
print(f"model = AutoModelForSequenceClassification.from_pretrained('{save_directory}')")
print("")
print("# Predict sentiment")
print("text = 'உங்கள் தமிழ் வாக்கியம்'  # Your Tamil sentence")
print("prediction, confidence = predict_tamil_sentiment(text)")
print("print(f'Sentiment: {prediction}, Confidence: {confidence:.3f}')")

print(f"\n🎉 TAMIL SENTIMENT ANALYSIS MODEL READY!")
print(f"Trained on {len(train_dataset)} Tamil text samples using XLM-RoBERTa Base")