#### Importing Modules

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from tqdm import tqdm

#### Setting up Dataset

In [22]:
print("Dataset: ")
language = pd.read_csv("../data/final_annotations.csv")

language.head()

Dataset: 


Unnamed: 0,word_id,sentence_id,word,label,is_ne,is_spelling_correct
0,45,1,Gusto,FIL,,True
1,46,1,kong,FIL,,True
2,47,1,intindihin,FIL,,True
3,48,1,pero,FIL,,True
4,49,1,hindi,FIL,,True


In [None]:
# Load HuggingFace Sentence Transformer Model
model = SentenceTransformer('all-mpnet-base-v2')

# Convert "words" that arent string to string
language['word'] = language['word'].astype(str)

#Generate embeddings for words
language['embeddings'] = list(model.encode(language['word'].tolist(), convert_to_tensor=False), show_progress_bar=True)


language['embeddings']

0        [-0.015572627, 0.05882696, -0.02446479, -0.032...
1        [0.07406978, 0.035357483, -0.027739374, -0.028...
2        [-0.052020285, -0.033879783, -0.04971939, 0.01...
3        [-0.0506149, 0.026319128, -0.0010626945, -0.02...
4        [0.030711884, -0.00059579074, -0.017216908, 0....
                               ...                        
23145    [-0.0093519455, -0.06878516, -0.044534426, -0....
23146    [0.063009344, 0.020623386, 0.0049647195, -0.03...
23147    [0.045219544, 0.008649993, -0.026441053, 0.015...
23148    [-0.021650821, 0.03991619, 0.0031559265, -0.05...
23149    [-0.011788345, 0.061924104, -0.0068219206, 0.0...
Name: embeddings, Length: 23150, dtype: object

In [24]:
#Prepare the Features and Labels to Split the Data
X = language["embeddings"]
y = language["label"]

# 70% Train, 15% Validation, 15% Test Split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# 30% of 30% = 15% Validation, 15% Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 16205, Validation: 3472, Test: 3473


In [None]:
X_train = np.vstack(X_train)
X_val = np.vstack(X_val)
X_test = np.vstack(X_test)

print("Apply PCA...")
pca = PCA(n_components=0.95, random_state=42)
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)
X_test = pca.transform(X_test)

print(f"PCA retained {np.sum(pca.explained_variance_ratio_):.2%} of variance")

print("Training the model...")
clf = RandomForestClassifier(n_estimators=300, random_state=42)
clf.fit(X_train, y_train)

Apply PCA...


0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
y_val_pred = clf.predict(X_val)
print("Validation Performance:")
print(classification_report(y_val, y_val_pred))

y_test_pred = clf.predict(X_test)
print("\nTest Performance:")
print(classification_report(y_test, y_test_pred))
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")

Validation Performance:
              precision    recall  f1-score   support

         ENG       0.93      0.93      0.93       288
         FIL       0.96      1.00      0.98      2692
         OTH       0.98      0.78      0.87       492

    accuracy                           0.96      3472
   macro avg       0.96      0.90      0.93      3472
weighted avg       0.96      0.96      0.96      3472


Test Performance:
              precision    recall  f1-score   support

         ENG       0.89      0.93      0.91       288
         FIL       0.97      1.00      0.98      2693
         OTH       0.99      0.80      0.88       492

    accuracy                           0.96      3473
   macro avg       0.95      0.91      0.93      3473
weighted avg       0.96      0.96      0.96      3473

Test Accuracy: 0.9631


In [27]:
# Create a function to predict the language of a sentence
def predict_language(sentence):
    words = sentence.split()

    embeddings = np.vstack([model.encode(word) for word in words])

    predictions = clf.predict(embeddings)

    print("\nLanguage Predictions:")
    for word, pred in zip(words, predictions):
        print(f"{word:}: {pred}")

In [28]:
# Call Prediction Function
predict_language("speed's stream was fun to watch. maybe its just me, but watching the stream, i realized most Filipinos are acting like a fucking animal.")

ValueError: X has 768 features, but RandomForestClassifier is expecting 229 features as input.