#### Importing Modules

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import plot_tree
import numpy as np
from tqdm import tqdm
from spellchecker import SpellChecker
import spacy
import re
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/mbchavez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Download NLTK's "punkt" package; the recorded output shows it is a no-op when
# the package is already up to date. Presumably needed for the `word_tokenize`
# import above — TODO confirm it is still used further down the notebook.
nltk.download("punkt")

#### Setting up Dataset

In [2]:
# Read the annotated word-level dataset (one row per word token) and preview it.
print("Dataset: ")
annotations_csv = "../data/final_annotations.csv"
language = pd.read_csv(annotations_csv)

# Last expression of the cell -> rendered as the first five rows.
language.head()

Dataset: 


Unnamed: 0,word_id,sentence_id,word,label,is_ne,is_spelling_correct
0,45,1,Gusto,FIL,,True
1,46,1,kong,FIL,,True
2,47,1,intindihin,FIL,,True
3,48,1,pero,FIL,,True
4,49,1,hindi,FIL,,True


#### Convert the Words into Embeddings

In [27]:
# Pretrained sentence-embedding model, fetched by name via sentence-transformers.
model = SentenceTransformer('all-mpnet-base-v2')

# Coerce every entry of the "word" column to str so the encoder only sees strings.
# NOTE(review): missing values would become their string form here — confirm
# that is acceptable for this dataset.
language['word'] = language['word'].astype(str)

# Encode all words in one call, then store one embedding vector per row.
word_list = language["word"].tolist()
word_vectors = model.encode(
    word_list,
    convert_to_tensor=False,
    show_progress_bar=True,
)
language['embeddings'] = list(word_vectors)

# Display the new column.
language['embeddings']

Batches:  14%|█▍        | 102/724 [00:20<02:05,  4.98it/s]


KeyboardInterrupt: 

### Extracting Other Features

In [None]:
# Extract named-entity labels as one-hot features.
# Rows without an annotation are given the explicit category 'NONE' so that
# they get their own indicator column instead of being dropped.
language['is_ne'] = language['is_ne'].fillna('NONE')

# One indicator column per category, prefixed with 'is_ne_'.
is_ne = pd.get_dummies(language['is_ne'], prefix='is_ne')

# Only the last expression of a cell is displayed, so the preview goes here.
is_ne.head()


Unnamed: 0,is_ne_ABB,is_ne_ABB_EXPR,is_ne_ABB_NE,is_ne_EXPR,is_ne_NE,is_ne_NONE
0,False,False,False,False,False,True
1,False,False,False,False,False,True
2,False,False,False,False,False,True
3,False,False,False,False,False,True
4,False,False,False,False,False,True


In [None]:
# Extract the spelling-correctness labels as an integer column vector
# (one row per word, a single column) so it can be stacked next to other features.
spelling_flags = language["is_spelling_correct"].astype(int)
is_spelling_correct = spelling_flags.to_numpy().reshape(-1, 1)

is_spelling_correct

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], shape=(23150, 1))

In [None]:
# Shared SpellChecker instance, created lazily on the first call and then reused
# so that the checker is not rebuilt for every single token.
_SPELL_CHECKER = None


def classify_if_is_spelling_correct(token: str) -> int:
    """
    Classifies if a token is spelled correctly.

    The token is looked up in a `SpellChecker` built with its default settings,
    so any word that is not in that default dictionary is reported as
    misspelled (e.g. the Filipino word "kasalanan" yields 0).

    Args:
        token: The word token (string).
    Returns:
        is_correct: 1 if the token is found in the checker's dictionary,
        0 otherwise.
    """
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        # Construct the checker once; later calls reuse the same instance.
        _SPELL_CHECKER = SpellChecker()

    return 1 if token in _SPELL_CHECKER else 0

In [None]:
# Quick manual check on a Filipino word; the recorded output for this cell is 0.
classify_if_is_spelling_correct("kasalanan")

0

In [None]:
# spaCy pipeline used for entity lookup; loaded lazily on the first call and
# reused afterwards so the model is not reloaded for every token.
_NE_MODEL = None

# A token counts as an abbreviation when it consists only of uppercase letters
# or digits, each optionally followed by a period.
_ABBR_PATTERN = re.compile(r'^([A-Z0-9]\.?)+$')


def classify_if_is_ne(token: str) -> str:
    """
    Classifies if a token is a named entity and/or an abbreviation.

    Only the first token produced by the spaCy pipeline is inspected for an
    entity type. If the pipeline produces no tokens at all (e.g. for an empty
    string), the token is treated as not being a named entity.

    Args:
        token: The word token (string).
    Returns:
        "ABB_NE" -> abbreviation that is also a named entity
        "NE" -> named entity
        "ABB" -> abbreviation that is not a named entity
        "NONE" -> neither
    """
    global _NE_MODEL
    if _NE_MODEL is None:
        # Load the model once; later calls reuse the same pipeline.
        _NE_MODEL = spacy.load("xx_ent_wiki_sm")

    doc = _NE_MODEL(token)

    # Guard against an empty doc before indexing its first token.
    is_ne = len(doc) > 0 and bool(doc[0].ent_type_)
    is_abbr = bool(_ABBR_PATTERN.match(token))

    if is_ne and is_abbr:
        return "ABB_NE"
    elif is_ne:
        return "NE"
    elif is_abbr:
        return "ABB"
    else:
        return "NONE"

In [None]:
# Quick manual check; the recorded output for this cell is 'NONE'.
classify_if_is_ne("January")

'NONE'

#### Training the Model From the Extracted Features

In [None]:
# Build the feature matrix and the label vector used for the splits below.
# Features = stacked word embeddings + one-hot named-entity indicators.
# (is_spelling_correct is currently left out of the feature set.)
embedding_matrix = np.vstack(language["embeddings"])
X = np.hstack([embedding_matrix, is_ne.values])
y = language["label"]

# Stage 1: keep 70% for training and hold out 30%, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

# Stage 2: cut the held-out part in half
# (50% of 30% = 15% validation, 15% test), again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 16205, Validation: 3472, Test: 3473


In [None]:
# Reduce dimensionality, keeping enough components for 95% of the variance.
# NOTE: this cell overwrites X_train / X_val / X_test with their projected
# versions, so re-run the split cell first before running this cell again.
print("Apply PCA...")
pca = PCA(n_components=0.95, random_state=42)

# Fit on the training split only; validation and test are merely projected.
X_train = pca.fit_transform(X_train)
X_val, X_test = (pca.transform(split) for split in (X_val, X_test))

print(f"PCA retained {np.sum(pca.explained_variance_ratio_):.2%} of variance")

# Train the random forest on the PCA-projected training data.
print("Training the model...")
clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    verbose=1,
)
clf.fit(X_train, y_train)

Apply PCA...
PCA retained 95.05% of variance
Training the model...


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    6.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   25.0s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   39.0s finished


0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### Visualize the Trees from the Random Forest

In [None]:
# Draw only the first few trees of the forest (300 would be too many to read).
n_trees_to_plot = 3

# Axis/class labels are the same for every tree, so build them once up front.
# Features are the PCA components, hence the "PC<n>" names.
pc_names = [f"PC{i+1}" for i in range(X_train.shape[1])]
label_names = [str(c) for c in clf.classes_]

for i, estimator in enumerate(clf.estimators_[:n_trees_to_plot]):
    plt.figure(figsize=(20, 10))
    plot_tree(
        estimator,
        max_depth=3,  # only the top levels, to keep the figure readable
        feature_names=pc_names,
        class_names=label_names,
        filled=True,
        rounded=True,
    )
    plt.title(f"Decision Tree {i+1}")
    plt.show()

In [None]:
# Score the fitted classifier on the validation and test splits.
print("Evaluating model...\n")

# Validation split: per-class precision / recall / F1.
y_val_pred = clf.predict(X_val)
val_report = classification_report(y_val, y_val_pred)
print("Validation Performance:")
print(val_report)

# Test split: same report plus overall accuracy.
y_test_pred = clf.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
print("\nTest Performance:")
print(test_report)
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.1s


Validation Performance:
              precision    recall  f1-score   support

         ENG       0.96      0.89      0.92       288
         FIL       0.99      0.99      0.99      2692
         OTH       0.95      0.96      0.95       492

    accuracy                           0.98      3472
   macro avg       0.97      0.95      0.96      3472
weighted avg       0.98      0.98      0.98      3472


Test Performance:
              precision    recall  f1-score   support

         ENG       0.94      0.90      0.92       288
         FIL       0.99      1.00      0.99      2693
         OTH       0.97      0.95      0.96       492

    accuracy                           0.98      3473
   macro avg       0.97      0.95      0.96      3473
weighted avg       0.98      0.98      0.98      3473

Test Accuracy: 0.9813


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished
