# Sentiment Analysis in Nepali Language

This Google Colab notebook performs sentiment analysis on Nepali-language text by fine-tuning a BERT-derived model. The dataset used in this notebook comes mainly from [here](https://github.com/oya163/nepali-sentiment-analysis/blob/master/data/nepcls/csv/ss_ac_at_txt_unbal.csv).

## Installation

In [None]:
# Install or upgrade the Hugging Face libraries (and helpers) used in this notebook.
# NOTE(review): `-U` with no version pins means every run picks up the latest
# releases, so API changes in these packages can affect later cells.
!python3 -m pip install -U huggingface_hub
!python3 -m pip install -U transformers
!python3 -m pip install -U datasets evaluate
!python3 -m pip install -U accelerate
!python3 -m pip install -U seqeval

In [None]:
# Wrap the text in ipython notebook
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a CSS rule that makes `<pre>` output wrap long lines.

  Registered below as a `pre_run_cell` callback, so the style is re-emitted
  before every cell execution.

  Args:
    info: Execution-info object that IPython hands to `pre_run_cell`
      callbacks. It is not used; it is optional so the function also works
      when called with no arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Re-apply the wrapping style before each cell runs.
get_ipython().events.register('pre_run_cell', set_css)

# Data Preprocessing

## Prepare NepSA dataset


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# filepath = '/content/drive/MyDrive/nepsa_data/nepsa'

In [None]:
# Download the raw NepSA CSV from GitHub into the current working directory.
!wget https://raw.githubusercontent.com/oya163/nepali-sentiment-analysis/master/data/nepcls/csv/ss_ac_at_txt_unbal.csv

In [None]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import torch
import numpy as np
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# `wget` saved the file into the current working directory, so resolve the path
# from there instead of hard-coding Kaggle's "/kaggle/working" directory.
filepath = os.path.abspath("ss_ac_at_txt_unbal.csv")
# Column names are supplied explicitly, so every line of the file is read as data.
df = pd.read_csv(filepath,
                 names=["Severity", "Category", "Aspect Word", "text"])

In [None]:
# Inspect the rows whose category is PROFANITY.
df.loc[df['Category'] == 'PROFANITY']

In [None]:
# List the distinct category values present in the raw data.
df['Category'].unique()

In [None]:
# Remove FEEDBACK rows. `.copy()` makes the result an independent frame, so
# adding columns to `df` afterwards does not raise pandas' SettingWithCopyWarning.
df = df[~df['Category'].isin(['FEEDBACK'])].copy()
# Confirm which categories remain.
df['Category'].unique()

In [None]:
def create_label(row):
    """Map a row's Category/Severity pair to an integer class id.

    Mapping:
        GENERAL with Severity 0 -> 0
        GENERAL with Severity 1 -> 1
        PROFANITY               -> 2
        VIOLENCE                -> 3

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'Category' key
            and, for GENERAL rows, a 'Severity' key.

    Returns:
        int: The class id in the range 0-3.

    Raises:
        ValueError: If the row matches none of the four known classes. Failing
            here names the offending values instead of returning None and
            breaking later during the integer cast.
    """
    category = row['Category']
    if category == "GENERAL":
        # Severity is only consulted for GENERAL rows.
        severity = row['Severity']
        if severity == 0:
            return 0
        if severity == 1:
            return 1
        raise ValueError(
            f"Cannot label row: Category='GENERAL' with unexpected Severity={severity!r}"
        )
    if category == "PROFANITY":
        return 2
    if category == "VIOLENCE":
        return 3
    raise ValueError(f"Cannot label row: unexpected Category={category!r}")

# Compute the integer class id for every row and store it as a new column.
labels = df.apply(create_label, axis=1).astype(int)
df['label'] = labels

# Only the text and its label are needed from here on.
df = df.drop(columns=['Severity', 'Category', 'Aspect Word'])
df.head()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, shuffle=True, random_state=21
)

# Take 25% of the remaining 80% as validation (i.e. 20% of all rows),
# leaving 60% of the data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, shuffle=True, random_state=21
)

In [None]:
def create_csv(X, y, filename):
    """Write texts and labels to `<filename>.txt` as tab-separated rows.

    The file has two columns (text, then label), no header row and no index
    column.

    Args:
        X: Text values for the 'text' column.
        y: Label values for the 'label' column.
        filename: Output path without the '.txt' extension.
    """
    output_path = f"{filename}.txt"
    # Columns are added one after the other to an empty frame.
    frame = pd.DataFrame().assign(text=X, label=y)
    frame.to_csv(output_path, sep='\t', header=False, index=False)

# Write one tab-separated file per split: train.txt, valid.txt and test.txt.
for split_name, split_texts, split_labels in (
    ('train', X_train, y_train),
    ('valid', X_val, y_val),
    ('test', X_test, y_test),
):
    create_csv(split_texts, split_labels, split_name)

## Load NepSA dataset

In [None]:
from datasets import load_dataset

# Directory holding train.txt / valid.txt / test.txt. `create_csv` writes them
# into the current working directory, so look there instead of assuming
# Kaggle's "/kaggle/working".
filepath1 = os.getcwd()
# Directory of the Kaggle input dataset that provides the `load_sa.py` loading
# script (alternative mount point kept for reference).
filepath="/kaggle/input/d/merishnasuwal/nepsa-data/nepsa" #"/kaggle/input/nepsa-data/nepsa"

# Split name -> path of the tab-separated file for that split.
data_files = {
    "train": os.path.join(filepath1, "train.txt"),
    "validation": os.path.join(filepath1, "valid.txt"),
    "test": os.path.join(filepath1, "test.txt"),
}

# NOTE(review): this depends on the external `load_sa.py` script, which is not
# part of this notebook — confirm that the installed `datasets` version still
# supports loading from a local script.
raw_datasets = load_dataset(os.path.join(filepath, "load_sa.py"), data_files=data_files)

Check the basic information on the loaded dataset

In [None]:
import csv
# Path of the test split written earlier into the current working directory.
fp = os.path.abspath("test.txt")
# The split files are tab-separated, so parse them with '\t'. `newline=''` is
# the mode the csv module expects for files it reads.
with open(fp, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    # Read the first few rows while the file is still open; the reader cannot
    # be used once the `with` block has closed the file.
    preview_rows = [row for _, row in zip(range(5), reader)]
preview_rows


In [None]:
# test.txt is tab-separated and has no header row, so state both explicitly;
# the defaults (comma separator, first line as header) would misparse it.
reader = pd.read_csv(fp, sep='\t', header=None, names=['text', 'label'])

In [None]:
# Bind each split of the loaded dataset to a shorter name.
train_data, test_data, valid_data = (
    raw_datasets[split] for split in ('train', 'test', 'validation')
)

Check a sample text from the train dataset

In [None]:
# Show the text of the training example at index 10.
print(train_data[10]["text"])

Check the sentiment label (its ID) of the corresponding sample

In [None]:
# Show the label id of the same training example (index 10).
print(raw_datasets["train"][10]["label"])

In [None]:
# Inspect the feature definition of the "label" column.
# NOTE(review): the name `ner_feature` looks like a leftover from an NER
# notebook; here the column holds sentiment class ids.
ner_feature = raw_datasets["train"].features["label"]
ner_feature

## Tokenization

In [None]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune. The commented-out lines are alternative checkpoints;
# uncomment exactly one assignment to switch models.
# model_checkpoint = "NepBERTa/NepBERTa"
# model_checkpoint = "Rajan/NepaliBERT"
# model_checkpoint = "Rajan/nepbertaTorch"
# model_checkpoint = "Sakonii/distilbert-base-nepali"
model_checkpoint = "Sakonii/deberta-base-nepali"
# model_checkpoint = "mrm8488/bert-multi-cased-finetuned-xquadv1"
# model_checkpoint = "xlm-roberta-large"
# model_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
# model_checkpoint = "bert-base-multilingual-uncased"
# model_checkpoint = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## Data Preprocessing

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Truncation is enabled; padding is not applied here.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize every split in batches (same order as before: train, test, validation).
tokenized_train, tokenized_test, tokenized_val = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data, valid_data)
)

# Fine Tuning

## Data Collation

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads the examples of each batch using the tokenizer's padding
# settings; tokenization above deliberately leaves sequences unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Load pre-trained model

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the checkpoint with a classification head; num_labels=4 matches the
# four class ids (0-3) produced by `create_label`.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4)

## Setup Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a set of predictions.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict: Keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    # Some models return extra outputs next to the logits; `predictions` is
    # then a tuple with the logits as its first element.
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = logits.argmax(-1)
    # zero_division=0 gives the same scores as the default setting but does not
    # emit an UndefinedMetricWarning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
        }

In [None]:
# Sanity check: number of output classes configured on the model.
model.config.num_labels

## Training

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases API key from Kaggle's secret store (secret named
# "wandb") instead of hard-coding it in the notebook.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")
wandb.login(key=secret_value_0)

In [None]:
# from google.colab import userdata
# from huggingface_hub import login, notebook_login

# login(token=userdata.get('hugging_face'))

In [None]:
from transformers import TrainingArguments, Trainer

import inspect

# Output directory for checkpoints and logs.
model_name = "nepsa"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and Trainer's `tokenizer` argument to `processing_class`. The install cell
# upgrades transformers to the latest release, so use whichever keyword the
# installed version actually accepts.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

args = TrainingArguments(
    model_name,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=6,
    weight_decay=0.01,
    push_to_hub=False,
    # Evaluate on the validation split at the end of every epoch.
    **{eval_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the validation split given to the Trainer.
trainer.evaluate()

## Save the model

In [None]:
# Save the fine-tuned model to the local 'nepsa' directory; the inference
# pipeline later loads it from this path.
saved_model_path='nepsa'
trainer.save_model(saved_model_path)

## Evaluation

In [None]:
# Run the model on the held-out test split.
predictions = trainer.predict(tokenized_test)

In [None]:
# True label ids of the test examples, as returned with the predictions.
predictions.label_ids

In [None]:
# Column/feature definitions of the test split.
raw_datasets['test'].features

In [None]:
from tabulate import tabulate

# Substrings identifying the score entries to report (other entries in
# `predictions.metrics` are skipped).
metrics = ['precision', 'recall', 'f1', 'accuracy']

# Format each score as a percentage with exactly two decimals. Formatting the
# product directly avoids float artifacts such as "7.000000000000001%" that
# `round(val, 4) * 100` can produce.
prediction_results = [
    [key, f"{val * 100:.2f}%"]
    for key, val in predictions.metrics.items()
    if any(item in key for item in metrics)
]

print(tabulate(prediction_results, headers=['Metric', 'Score']))

## Inference

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the model saved at `saved_model_path`.
text_classifier = pipeline("text-classification", model=saved_model_path)


In [None]:
# Hand-written Nepali sentences used to spot-check the fine-tuned classifier.
inference_sentences = ["वा जैनु जि तपाइलाइ धन्यबाद छ गगन्या चोर लाई मुख भरी जबाब दिएको मा",
                       "ओली दलाल मुर्दाबाद",
                       "यो गोविन्दे लाई देश निकाला गर्नु पर्छ",
                       "यो मुला गोबिन्द ठिक छैन",
                       "यो पुण्य गौतम जड्या हो जस्तो कस कस लाई लाग्छ ।",
                      "तपाईं कुवा मा दुबेर मरे हुन्छ ।",
                      "अनुहार हेर्दा ठमेल को भालू हो ।"]

# One prediction per sentence, in the same order as the input list.
results = text_classifier(inference_sentences)


In [None]:
# Human-readable name for each class id.
label_map = dict(enumerate([
    'GENERAL POSITIVE',
    'GENERAL NEGATIVE',
    'PROFANITY',
    'VIOLENCE',
]))

# The predicted label string is split on '_' and its second piece is used as
# the class id (kept as a string for display, converted to int for the lookup).
predicted_ids = [result['label'].split('_')[1] for result in results]
prediction_results = [
    [sent, pred, label_map[int(pred)]]
    for sent, pred in zip(inference_sentences, predicted_ids)
]

print(tabulate(prediction_results, headers=['Sentences', 'Labels', 'Remarks'], tablefmt='orgtbl'))
