<a href="https://colab.research.google.com/github/lizagarg/hindi-sentiment-analysis/blob/main/Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed,
)

# Step 2: Create a Mock Dataset
def create_mock_dataset():
    """Build the tiny hand-written Hindi sentiment corpus used by this notebook.

    Returns:
        DatasetDict: a ``"train"`` split (14 examples) and a ``"validation"``
        split (5 examples), each with a ``"text"`` column of sentences and a
        ``"label"`` column of integer class ids
        (0 = negative, 1 = positive, 2 = neutral).
    """
    negative, positive, neutral = 0, 1, 2

    # Each entry pairs a sentence with its class id so the two cannot drift apart.
    train_pairs = [
        ("यह शानदार है", positive),
        ("यह बहुत खराब है", negative),
        ("यह औसत है", neutral),
        ("यह अद्भुत है", positive),
        ("यह सबसे खराब है", negative),
        ("यह ठीक-ठाक है", neutral),
        ("मुझे यह पसंद आया", positive),
        ("मुझे यह नफरत है", negative),
        ("यह अच्छा अनुभव था", positive),
        ("यह बेकार था", negative),
        ("यह संतोषजनक था", neutral),
        ("यह अप्रत्याशित रूप से बढ़िया था", positive),
        ("यह पूरी तरह से निराशाजनक था", negative),
        ("यह ठीक है लेकिन महान नहीं", neutral),
    ]
    validation_pairs = [
        ("यह बहुत अच्छा था", positive),
        ("यह बहुत बुरा था", negative),
        ("यह औसत दर्जे का था", neutral),
        ("यह भयानक था", negative),
        ("यह अद्वितीय था", positive),
    ]

    def to_dataset(pairs):
        # Split the (sentence, class id) pairs back into the two parallel columns.
        return Dataset.from_dict(
            {
                "text": [sentence for sentence, _ in pairs],
                "label": [class_id for _, class_id in pairs],
            }
        )

    return DatasetDict(
        {
            "train": to_dataset(train_pairs),
            "validation": to_dataset(validation_pairs),
        }
    )


In [None]:
# Step 3: Tokenize the Dataset
def tokenize_dataset(dataset, tokenizer, max_length=128):
    """Tokenize the ``"text"`` entry of every batch in ``dataset``.

    The earlier call passed ``padding="max_length"`` and ``truncation=True``
    without any ``max_length``; the recorded run shows the tokenizer warning
    that it therefore applied neither padding nor truncation. An explicit
    ``max_length`` now makes truncation effective, and the ineffective padding
    request is dropped so examples keep their natural length, as they
    effectively did before.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches contain
            a ``"text"`` entry (for example the ``DatasetDict`` returned by
            ``create_mock_dataset``).
        tokenizer: Callable taking a list of strings plus ``truncation`` and
            ``max_length`` keyword arguments.
        max_length: Maximum number of tokens kept per example. Defaults to 128.

    Returns:
        The result of ``dataset.map`` applied with the tokenizing function.
    """
    def preprocess_function(examples):
        # NOTE(review): padding is left to batch time; presumably the Trainer's
        # default collator pads each batch when it is given the tokenizer —
        # confirm if these datasets are consumed elsewhere.
        return tokenizer(examples["text"], truncation=True, max_length=max_length)

    return dataset.map(preprocess_function, batched=True)

In [None]:
# Step 4: Load the Pre-Trained Model
def load_model(num_labels=3, model_name="ai4bharat/indic-bert"):
    """Load a pre-trained checkpoint with a sequence-classification head.

    Args:
        num_labels: Number of output classes for the classification head;
            adjust it to match your task. Defaults to 3.
        model_name: Checkpoint identifier or local path passed to
            ``from_pretrained``. Defaults to ``"ai4bharat/indic-bert"``, the
            checkpoint this notebook was written for.

    Returns:
        The model returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``;
            ``logits`` must support ``argmax(axis=-1)``.

    Returns:
        dict: ``{"accuracy": <score>}`` where the score is
        ``accuracy_score(labels, argmax(logits))``.
    """
    scores, references = eval_pred
    # The highest-scoring class along the last axis is the predicted label.
    predicted_ids = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(references, predicted_ids)}


In [None]:
# Step 5: Fine-Tune the Model
def train_model(model, tokenizer, tokenized_datasets):
    """Fine-tune ``model`` on the ``"train"`` split and report validation metrics.

    Uses the notebook-level ``compute_metrics`` function rather than
    re-defining an identical copy here. After training, the model is evaluated
    once on the ``"validation"`` split and the metrics are printed; the
    recorded run of the earlier version logged only training loss.

    Args:
        model: Model to fine-tune; handed to ``Trainer``.
        tokenizer: Tokenizer handed to ``Trainer`` alongside the model.
        tokenized_datasets: Mapping with ``"train"`` and ``"validation"``
            entries holding the tokenized splits.

    Returns:
        The same ``model`` object that was passed in, after training.
    """
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        num_train_epochs=30,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )

    # NOTE(review): recent transformers releases appear to deprecate the
    # `tokenizer` argument in favour of `processing_class` — confirm against the
    # installed version before switching.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,  # Compute accuracy
    )

    trainer.train()

    # Run one evaluation pass so the validation split and accuracy metric
    # configured above are actually exercised.
    eval_metrics = trainer.evaluate()
    print(eval_metrics)

    return model



In [None]:
# Step 6: Save the Fine-Tuned Model
def save_model(model, tokenizer, output_dir="./fine_tuned_model"):
    """Persist the model and its tokenizer into the same directory.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        output_dir: Destination directory for both artefacts. Defaults to
            ``"./fine_tuned_model"``, the location used so far.
    """
    # Both artefacts go to one directory so they can be reloaded together.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


In [None]:
# Step 7: Test the Fine-Tuned Model
def test_model(model_dir="./fine_tuned_model"):
    """Run the saved model on three sample sentences and print each prediction.

    Args:
        model_dir: Directory (or model identifier) passed to ``pipeline`` as
            ``model``. Defaults to ``"./fine_tuned_model"``, where
            ``save_model`` writes by default.
    """
    fine_tuned_pipeline = pipeline("sentiment-analysis", model=model_dir)
    test_inputs = [
        "यह अद्भुत है",  # Expected: Positive
        "यह बहुत बुरा है",  # Expected: Negative
        "यह ठीक-ठाक है",  # Expected: Neutral
    ]
    # The expectations above are informational only; nothing is asserted.
    for sentence in test_inputs:
        print(fine_tuned_pipeline(sentence))



In [None]:
# Main Execution
def main(seed=42):
    """Run the whole workflow: build data, tokenize, fine-tune, save, and test.

    Args:
        seed: Random seed applied before the model is loaded. The recorded run
            reports that the classifier head is newly initialised, so seeding
            first makes repeated runs start from the same weights. Defaults
            to 42.
    """
    # Seed before any model weights are created.
    set_seed(seed)

    print("Creating mock dataset...")
    dataset = create_mock_dataset()

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

    print("Tokenizing dataset...")
    tokenized_datasets = tokenize_dataset(dataset, tokenizer)

    print("Loading pre-trained model...")
    model = load_model()

    print("Fine-tuning the model...")
    model = train_model(model, tokenizer, tokenized_datasets)

    print("Saving the fine-tuned model...")
    save_model(model, tokenizer)

    print("Testing the fine-tuned model...")
    test_model()

In [None]:
# Legend for the integer class ids used in the mock dataset's "label" lists
# (0 = negative, 1 = positive, 2 = neutral).
print("0: Negative, 1: Positive, 2: Neutral")

0: Negative, 1: Positive, 2: Neutral


In [None]:
# Entry point: runs the full workflow when this code is executed directly
# (the recorded output below shows it running from this cell).
if __name__ == "__main__":
    main()

Creating mock dataset...
Loading tokenizer...
Tokenizing dataset...


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Loading pre-trained model...


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Fine-tuning the model...


Step,Training Loss
10,1.0971
20,1.0984
30,1.0929
40,1.0903
50,1.0662
60,1.0439
70,0.9723
80,0.9602
90,0.8652
100,0.766


Saving the fine-tuned model...
Testing the fine-tuned model...


Device set to use cpu


[{'label': 'LABEL_1', 'score': 0.6361380219459534}]
[{'label': 'LABEL_0', 'score': 0.459554523229599}]
[{'label': 'LABEL_2', 'score': 0.3610880374908447}]


In [None]:
pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.3
    Uninstalling transformers-4.52.3:
      Successfully uninstalled transformers-4.52.3
Successfully installed transformers-4.52.4
