A code to train sentiment analysis for NusaX dataset.

Simply `runtime > run all` to train and test.
Modify the language on the bottom part of this code.

# Training code

In [5]:
# grab the data first
!git clone https://github.com/IndoNLP/nusax.git

Cloning into 'nusax'...
remote: Enumerating objects: 301, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 301 (delta 4), reused 2 (delta 2), pack-reused 296 (from 1)[K
Receiving objects: 100% (301/301), 3.74 MiB | 14.85 MiB/s, done.
Resolving deltas: 100% (136/136), done.


In [2]:
import pandas as pd
from nltk import word_tokenize
import nltk
nltk.download('punkt')

# read csv data
# return a pair of (list of data, list of label)
# also tokenize the input first
def load_data(filedir):
    df = pd.read_csv(filedir)
    data = list(df['text'])
    data = [" ".join(word_tokenize(sent)) for sent in data]
    print(list(df['label']))
    return (data, list(df['label']))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from scipy.sparse import vstack
nltk.download('punkt_tab')

import numpy as np

def hyperparam_tuning(xtrain, ytrain, xvalid, yvalid, classifier, param_grid):
    # combine train and valid
    x = vstack([xtrain, xvalid])
    y = ytrain + yvalid

    # create predefined split
    # -1 for all training and 0 for all validation
    ps = PredefinedSplit([-1] * len(ytrain) + [0] * len(yvalid))
    clf = GridSearchCV(classifier, param_grid, cv = ps)
    clf = clf.fit(x, y)

    return clf


def train_and_test(lang, directory="/content/nusax/datasets/sentiment/", feature="BoW", classifier="nb"):
    xtrain, ytrain = load_data(directory + lang +"/train.csv")
    xvalid, yvalid = load_data(directory + lang + "/valid.csv")
    xtest, ytest = load_data(directory + lang + "/test.csv")

    # train feature on train data
    if feature == "bow":
        vectorizer = CountVectorizer()
    elif feature == "tfidf":
        vectorizer = TfidfVectorizer()
    else:
        raise Exception('Vectorizer unknown. Use "BoW" or "tfidf"')
    vectorizer.fit(xtrain)

    # transform
    xtrain = vectorizer.transform(xtrain)
    xvalid = vectorizer.transform(xvalid)
    xtest = vectorizer.transform(xtest)

    # all classifiers
    classifier_model = {"nb" : MultinomialNB(),
                        "svm": SVC(),
                        "lr" : LogisticRegression(),
                       }
    # all params for grid-search
    param_grids = {"nb" : {"alpha": np.linspace(0.001,1,50)},
                   "svm": {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
                   "lr" : {'C': np.linspace(0.001,10,100)},
                  }

    clf = hyperparam_tuning(xtrain, ytrain, xvalid, yvalid,
                            classifier=classifier_model[classifier],
                            param_grid=param_grids[classifier])

    pred = clf.predict(xtest.toarray())
    f1score = f1_score(ytest,pred, average='macro')

    return f1score, clf, vectorizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Testing

In [6]:
#@title Sentiment analysis demo
language = "indonesian" #@param ["indonesian", "english", "javanese", "sundanese", "balinese", "madurese", "minangkabau", "toba_batak", "acehnese", "buginese", "ngaju", "banjarese"]
input_sentiment = "abang saya keterima kerja di kamboja" #@param {type:"string"}


print(f"Training for sentiment analysis classifier {language}")
f1, clf, vectorizer = train_and_test(language, feature="bow")
print(f"Training done. F1 on test set is {f1}")

input_sentiment = " ".join(word_tokenize(input_sentiment))
sent = clf.predict(vectorizer.transform([input_sentiment]).toarray())
print(f"\nSentiment on the input text is {sent}")





Training for sentiment analysis classifier indonesian
['neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'positive', 'positive', 'negative', 'neutral', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'neutral', 'positive', 'positive', 'positive', 'neutral', 'positive', 'positive', 'negative', 'neutral', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'neutral', 'neutral', 'negative', 'positive', 'neutral', 'positive', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'neutral', 'positive', 'neutral', 'positive', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'negative', 'positive', 'positive', 'negative', 'positive', 'positive', 'posit