#### Importing Modules

In [34]:
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import plot_tree
import numpy as np
from tqdm import tqdm
from spellchecker import SpellChecker
import spacy
import re
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from typing import List

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/mbchavez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Setting up Dataset

In [14]:
print("Dataset: ")
language = pd.read_csv("../data/final_annotations.csv")

language.head()

Dataset: 


Unnamed: 0,word_id,sentence_id,word,label,is_ne,is_spelling_correct
0,45,1,Gusto,FIL,,True
1,46,1,kong,FIL,,True
2,47,1,intindihin,FIL,,True
3,48,1,pero,FIL,,True
4,49,1,hindi,FIL,,True


In [15]:
# Check Empty Tokens
language.isna().sum()

word_id                    0
sentence_id                0
word                      16
label                      0
is_ne                  21454
is_spelling_correct        0
dtype: int64

In [16]:
# Drop Empty Tokens 
language = language.dropna(subset=['word'])
language.isna().sum()

word_id                    0
sentence_id                0
word                       0
label                      0
is_ne                  21438
is_spelling_correct        0
dtype: int64

#### Join Sentences

In [17]:
# Ensure words are strings
language["word"] = language["word"].astype(str)

# Group by sentence_id and combine words
sentences = language.groupby("sentence_id")["word"].apply(lambda x: " ".join(x))

# Map the combined sentence back to the original dataframe
language["sentence"] = language["sentence_id"].map(sentences)


language.head()

Unnamed: 0,word_id,sentence_id,word,label,is_ne,is_spelling_correct,sentence
0,45,1,Gusto,FIL,,True,Gusto kong intindihin pero hindi ko maintindih...
1,46,1,kong,FIL,,True,Gusto kong intindihin pero hindi ko maintindih...
2,47,1,intindihin,FIL,,True,Gusto kong intindihin pero hindi ko maintindih...
3,48,1,pero,FIL,,True,Gusto kong intindihin pero hindi ko maintindih...
4,49,1,hindi,FIL,,True,Gusto kong intindihin pero hindi ko maintindih...


In [None]:
# Map Labels to Numbers
label_map = {label: idx for idx, label in enumerate(language['label'].unique())}
language["label_id"] = language['label'].map(label_map)

language["label_id"].head()

0    0
1    0
2    0
3    0
4    0
Name: label_id, dtype: int64

In [None]:
# Group Sentences, Labels to one Dataframe
sentences = language.groupby("sentence_id")['word'].apply(list).tolist()
labels = language.groupby("sentence_id")['label'].apply(list).tolist()
label_ids = language.groupby("sentence_id")['label_id'].apply(list).tolist()  

sentences_df = pd.DataFrame({
    "sentence_id": language['sentence_id'].unique(),
    "words": sentences,
    "labels": labels,
    "label_ids": label_ids
})

sentences_df

Unnamed: 0,sentence_id,words,labels,label_ids
0,1,"[Gusto, kong, intindihin, pero, hindi, ko, mai...","[FIL, FIL, FIL, FIL, FIL, FIL, FIL, OTH, FIL, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, ..."
1,2,"[Kaya, kayong, mga, babae, wag, kayong, basta,...","[FIL, FIL, FIL, FIL, FIL, FIL, FIL, FIL, FIL, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,3,"[Kasalan, naman, nila, bakit, hindi, nila, sin...","[FIL, FIL, FIL, FIL, FIL, FIL, FIL, FIL, FIL, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,4,"[At, sila-sila, rin, ang, umuunlad, ?]","[FIL, FIL, FIL, FIL, FIL, OTH]","[0, 0, 0, 0, 0, 1]"
4,5,"[Nakakamiss, ung, mga, gantong, content, ni, k...","[FIL, FIL, FIL, FIL, ENG, FIL, FIL, OTH, OTH]","[0, 0, 0, 0, 2, 0, 0, 1, 1]"
...,...,...,...,...
1305,2601,"[Wala, naman, po, akong, nararamdamang, sintom...","[FIL, FIL, FIL, FIL, FIL, FIL, FIL, FIL, OTH]","[0, 0, 0, 0, 0, 0, 0, 0, 1]"
1306,2602,"[Mabigat, na, rin, naman, ang, nabubuhat, ko, .]","[FIL, FIL, FIL, FIL, FIL, FIL, FIL, OTH]","[0, 0, 0, 0, 0, 0, 0, 1]"
1307,2603,"[Sana, hindi, po, hindi, masarap, ulam, nila, ...","[FIL, FIL, FIL, FIL, FIL, FIL, FIL, FIL, FIL, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1308,2604,"[Hello, po, mag, ask, po, ako, sa, inyo, ng, h...","[ENG, FIL, FIL, ENG, FIL, FIL, FIL, FIL, FIL, ...","[2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, ..."


#### Load BERT

In [35]:
mbert = "bert-base-multilingual-cased"

# Fast tokenizer for word_ids()
mbert_tokenizer = BertTokenizerFast.from_pretrained(mbert)

# Token classification model with 3 labels: FIL=0, OTH=1, ENG=2
mbert_model = BertForTokenClassification.from_pretrained(mbert, num_labels=3)
mbert_model.eval()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [36]:
tokenized_inputs = mbert_tokenizer(sentences,is_split_into_words=True,return_tensors="pt", padding=True, truncation=True)

aligned_labels = []

for i, label in enumerate(label_ids):
    word_ids = tokenized_inputs.word_ids(batch_index=i)
    aligned_label = []

    for word_id in word_ids:
        if word_id is None:
            aligned_label.append(-100)
        else:
            aligned_label.append(label[word_id])
    aligned_labels.append(aligned_label)

tokenized_inputs["labels"] = torch.tensor(aligned_labels)