#### Importing Modules

In [53]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

#### Setting up Dataset

In [54]:
# Read the word-level annotation CSV into a DataFrame and preview its first rows
print("Dataset: ")
annotations_csv = "../data/final_annotations.csv"
language = pd.read_csv(annotations_csv)

language.head()

Dataset: 


Unnamed: 0,word_id,sentence_id,word,label,is_ne,is_spelling_correct
0,45,1,Gusto,FIL,,True
1,46,1,kong,FIL,,True
2,47,1,intindihin,FIL,,True
3,48,1,pero,FIL,,True
4,49,1,hindi,FIL,,True


In [55]:
# Load the pretrained 'all-MiniLM-L6-v2' Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Cast every entry of "word" to str so the encoder only ever receives strings.
# NOTE(review): astype(str) turns missing values into the literal "nan" -- confirm
# that no real words were parsed as NaN when the CSV was read.
language['word'] = language['word'].astype(str)

# Encode all words in a single call, then store one embedding vector per row
word_list = language['word'].tolist()
word_vectors = model.encode(word_list)
language['embeddings'] = list(word_vectors)

language['embeddings']

0        [-0.049598854, 0.012619668, 0.015824769, 0.035...
1        [-0.03217564, 0.029607292, 0.024639964, 0.0776...
2        [-0.010012416, 0.0578119, 0.021821635, -0.0052...
3        [-0.038002297, 0.05431241, -0.027921673, -0.02...
4        [-0.046178948, -0.0077228756, -0.05874152, 0.0...
                               ...                        
23145    [-0.07845599, 0.028494067, -0.11666181, 0.0732...
23146    [-0.013240754, 0.08617054, -0.06811582, -0.004...
23147    [-0.06914428, 0.016087493, 0.05119962, 0.01648...
23148    [0.0041266084, 0.17076877, -0.039219838, 0.023...
23149    [-0.13382289, 0.014150828, -0.016216075, -0.02...
Name: embeddings, Length: 23150, dtype: object

In [56]:
# Features are the per-word embedding vectors; targets are the language labels
X, y = language["embeddings"], language["label"]

# Stage 1: keep 70% for training and hold out 30%, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Stage 2: halve the 30% hold-out (50% of 30% = 15% validation, 15% test)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

split_sizes = f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}"
print(split_sizes)

Train: 16205, Validation: 3472, Test: 3473


In [57]:
# Stack each split's per-word vectors into a single 2-D array (one row per word)
X_train, X_val, X_test = (
    np.vstack(split) for split in (X_train, X_val, X_test)
)

# Fit a 300-tree random forest on the training split with a fixed seed
clf = RandomForestClassifier(random_state=42, n_estimators=300)
clf.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [58]:
# Predict both held-out splits with the fitted classifier
y_val_pred, y_test_pred = clf.predict(X_val), clf.predict(X_test)

# Validation split: per-class precision / recall / F1
val_report = classification_report(y_val, y_val_pred)
print("Validation Performance:")
print(val_report)

# Test split: the same report plus overall accuracy
test_report = classification_report(y_test, y_test_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("\nTest Performance:")
print(test_report)
print(f"Test Accuracy: {test_accuracy:.4f}")

Validation Performance:
              precision    recall  f1-score   support

         ENG       0.88      0.89      0.89       288
         FIL       0.96      1.00      0.98      2692
         OTH       0.98      0.77      0.86       492

    accuracy                           0.96      3472
   macro avg       0.94      0.89      0.91      3472
weighted avg       0.96      0.96      0.95      3472


Test Performance:
              precision    recall  f1-score   support

         ENG       0.89      0.87      0.88       288
         FIL       0.96      1.00      0.98      2693
         OTH       0.98      0.81      0.89       492

    accuracy                           0.96      3473
   macro avg       0.94      0.89      0.91      3473
weighted avg       0.96      0.96      0.96      3473

Test Accuracy: 0.9588


In [75]:
# Create a function to predict the language of a sentence
def predict_language(sentence: str) -> None:
    """Print a predicted language label for every word of ``sentence``.

    The sentence is split on whitespace, all words are embedded with the
    notebook-level ``model`` in one batched call, and each embedding is
    classified by the notebook-level ``clf``. Results are printed as
    ``word: label`` lines under a header; nothing is returned.

    Parameters
    ----------
    sentence : str
        Text to label. An empty or whitespace-only string prints only the
        header instead of raising.
    """
    words = sentence.split()

    print("\nLanguage Predictions:")
    if not words:
        # np.vstack([]) / predict on zero rows would raise; nothing to label.
        return

    # Encode the whole word list at once, the same way the training
    # embeddings were produced, instead of one encode() call per word.
    # NOTE(review): tokens keep any attached punctuation (e.g. a trailing
    # "." or ","); confirm whether the training words were tokenized the same way.
    embeddings = model.encode(words)

    predictions = clf.predict(embeddings)

    for word, pred in zip(words, predictions):
        print(f"{word}: {pred}")

In [74]:
# Run the word-level language predictor on a sample sentence
sample_sentence = "Gusto kong mag-chill sa coffee shop after class kasi sobrang stressful ng exams. hahaahah gago tangina, gusto ko magjabol pota"
predict_language(sample_sentence)


Language Predictions:
Gusto: FIL
kong: FIL
mag-chill: FIL
sa: FIL
coffee: ENG
shop: ENG
after: ENG
class: ENG
kasi: FIL
sobrang: FIL
stressful: ENG
ng: FIL
exams.: ENG
hahaahah: OTH
gago: FIL
tangina,: FIL
gusto: FIL
ko: FIL
magjabol: FIL
pota: FIL
