In [None]:
import os
import pandas as pd
import random
import re
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text, remove_stopwords=True):
    #text IDs
    text = re.sub(r"##\d+", "", text)

    #Repeated '@'
    text = re.sub(r"(@\s?)+", "", text)

    #HTML-like tags
    text = re.sub(r"<[^>]+>", " ", text)

    #Punctuation and numbers
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.lower()

    if remove_stopwords:
        text = " ".join([word for word in text.split() if word not in STOPWORDS])
    text = re.sub(r"\s+", " ", text).strip()
    return text

def keep_stop_words():

    text = [word for word in text.split() if word in STOPWORDS]

    return text

doc_dict = {}
country_texts = {}
word_count_dict = {}
country_set = set()

sources_df = pd.read_excel("/content/drive/MyDrive/text/sampleSources.xlsx", sheet_name="texts")

for text_id, (country_code, doc_type), word_count in [(l[0], tuple(l[1].split()), l[2]) for l in sources_df[["textID", "country|genre", "# words"]].values.tolist()]:
    file_path = f"/content/drive/MyDrive/text/w_{country_code.lower()}_{doc_type.lower()}.txt"
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding="utf-8") as file:
            IS_DOC = False
            lines = file.readlines()
            for i, line in enumerate(lines):
                if line.strip().startswith(f"##{text_id}"):
                    IS_DOC = True
                elif line.strip().startswith("##"):
                    IS_DOC = False
                if IS_DOC:
                    if text_id not in doc_dict:
                        doc_dict[text_id] = [w.lower() for w in line.split()]
                    else:
                        doc_dict[text_id] += [w.lower() for w in line.split()]
            word_count_dict[text_id] = word_count
            country_set.add(country_code)
            if country_code not in country_texts:
                country_texts[country_code] = []

            full_text = " ".join(doc_dict[text_id])
            cleaned_text = preprocess_text(full_text)
            country_texts[country_code].append(cleaned_text)

min_samples = min(len(texts) for texts in country_texts.values())
balanced_country_texts = {
    country: random.sample(texts, min_samples)
    for country, texts in country_texts.items()
}

combined_texts = {country: " ".join(texts) for country, texts in balanced_country_texts.items()}

data = []
for country, texts in balanced_country_texts.items():
    for text in texts:
        data.append({"country": country, "text": text})

df = pd.DataFrame(data)

df.to_excel("/content/drive/MyDrive/text/CVTFIDFbalanced_dataset.xlsx", index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


GLOVE DATASET

In [None]:
import os
import pandas as pd
import random
import re

doc_dict = {}
country_texts = {}
word_count_dict = {}
country_set = set()

sources_df = pd.read_excel("/content/drive/MyDrive/text/sampleSources.xlsx", sheet_name="texts")

for text_id, (country_code, doc_type), word_count in [(l[0], tuple(l[1].split()), l[2]) for l in sources_df[["textID", "country|genre", "# words"]].values.tolist()]:
    file_path = f"/content/drive/MyDrive/text/w_{country_code.lower()}_{doc_type.lower()}.txt"
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding="utf-8") as file:
            IS_DOC = False
            lines = file.readlines()
            for i, line in enumerate(lines):
                if line.strip().startswith(f"##{text_id}"):
                    IS_DOC = True
                elif line.strip().startswith("##"):
                    IS_DOC = False
                if IS_DOC:
                    if text_id not in doc_dict:
                        doc_dict[text_id] = [w.lower() for w in line.split()]
                    else:
                        doc_dict[text_id] += [w.lower() for w in line.split()]
            word_count_dict[text_id] = word_count
            country_set.add(country_code)
            if country_code not in country_texts:
                country_texts[country_code] = []

            full_text = " ".join(doc_dict[text_id])
            cleaned_text = preprocess_text(full_text, remove_stopwords=False)
            country_texts[country_code].append(cleaned_text)

min_samples = min(len(texts) for texts in country_texts.values())
balanced_country_texts = {
    country: random.sample(texts, min_samples)
    for country, texts in country_texts.items()
}

combined_texts = {country: " ".join(texts) for country, texts in balanced_country_texts.items()}

data = []
for country, texts in balanced_country_texts.items():
    for text in texts:
        data.append({"country": country, "text": text})

df = pd.DataFrame(data)

df.to_excel("/content/drive/MyDrive/text/GLOVEDATSASET.xlsx", index=False)

Count vectorizer

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_excel("/content/drive/MyDrive/text/CVTFIDFbalanced_dataset.xlsx")

X = df["text"]
y = df["country"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
count_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)

CVX_train_count = count_vectorizer.fit_transform(X_train)
CVX_test_count = count_vectorizer.transform(X_test)

In [None]:
CV_LR_model = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=3)

CV_LR_model.fit(CVX_train_count, y_train)

CVy_pred_count = CV_LR_model.predict(CVX_test_count)

print("CountVectorizer Results:")
print(f"Accuracy: {accuracy_score(y_test, CVy_pred_count)}")
print(classification_report(y_test, CVy_pred_count))


CountVectorizer Results:
Accuracy: 0.22580645161290322
              precision    recall  f1-score   support

          AU       0.10      0.17      0.12         6
          BD       0.50      0.33      0.40         6
          CA       0.00      0.00      0.00         6
          GB       0.06      0.17      0.09         6
          GH       0.33      0.17      0.22         6
          HK       0.67      0.33      0.44         6
          IE       0.00      0.00      0.00         7
          IN       0.17      0.14      0.15         7
          JM       0.00      0.00      0.00         6
          KE       0.25      0.17      0.20         6
          LK       0.57      0.57      0.57         7
          MY       0.29      0.33      0.31         6
          NG       0.25      0.33      0.29         6
          NZ       0.33      0.33      0.33         6
          PH       0.22      0.29      0.25         7
          PK       0.40      0.33      0.36         6
          SG       0.17   

In [None]:
from sklearn.ensemble import RandomForestClassifier

CV_RF_model = RandomForestClassifier(random_state=3, class_weight='balanced', n_estimators=100)

CV_RF_model.fit(CVX_train_count, y_train)

CVy_pred_count_rf = CV_RF_model.predict(CVX_test_count)

print("Random Forest with CountVectorizer Results:")
print(f"Accuracy: {accuracy_score(y_test, CVy_pred_count_rf)}")
print(classification_report(y_test, CVy_pred_count_rf))

Random Forest with CountVectorizer Results:
Accuracy: 0.3387096774193548
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00         6
          BD       0.44      0.67      0.53         6
          CA       0.00      0.00      0.00         6
          GB       0.14      0.33      0.20         6
          GH       0.75      0.50      0.60         6
          HK       0.25      0.17      0.20         6
          IE       0.50      0.29      0.36         7
          IN       0.50      0.14      0.22         7
          JM       0.57      0.67      0.62         6
          KE       0.50      0.17      0.25         6
          LK       1.00      0.71      0.83         7
          MY       1.00      0.17      0.29         6
          NG       0.71      0.83      0.77         6
          NZ       0.43      0.50      0.46         6
          PH       0.10      0.14      0.12         7
          PK       0.33      0.33      0.33         6
        

TFIDF

In [None]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
TF_LR_model = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=3)

TF_LR_model.fit(X_train_tfidf, y_train)

y_pred_tfidf = TF_LR_model.predict(X_test_tfidf)

print("TF-IDF LogReg Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf)}")
print(classification_report(y_test, y_pred_tfidf))


TF-IDF LogReg Results:
Accuracy: 0.3387096774193548
              precision    recall  f1-score   support

          AU       0.14      0.17      0.15         6
          BD       0.43      0.50      0.46         6
          CA       0.50      0.17      0.25         6
          GB       0.17      0.17      0.17         6
          GH       0.29      0.33      0.31         6
          HK       0.33      0.33      0.33         6
          IE       0.43      0.43      0.43         7
          IN       0.40      0.29      0.33         7
          JM       0.50      0.33      0.40         6
          KE       0.22      0.33      0.27         6
          LK       0.80      0.57      0.67         7
          MY       0.50      0.33      0.40         6
          NG       0.44      0.67      0.53         6
          NZ       0.38      0.50      0.43         6
          PH       0.33      0.14      0.20         7
          PK       0.29      0.33      0.31         6
          SG       0.29      

In [None]:
TF_RF_model = RandomForestClassifier(random_state=3, class_weight='balanced', n_estimators=100)

TF_RF_model.fit(X_train_tfidf, y_train)

y_pred_tfidf_rf = TF_RF_model.predict(X_test_tfidf)

print("TF-IDF Random Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf_rf)}")
print(classification_report(y_test, y_pred_tfidf_rf))

TF-IDF Random Forest Results:
Accuracy: 0.3387096774193548
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00         6
          BD       0.56      0.83      0.67         6
          CA       0.00      0.00      0.00         6
          GB       0.14      0.17      0.15         6
          GH       0.67      0.33      0.44         6
          HK       0.40      0.33      0.36         6
          IE       0.80      0.57      0.67         7
          IN       1.00      0.14      0.25         7
          JM       0.33      0.33      0.33         6
          KE       0.25      0.17      0.20         6
          LK       0.71      0.71      0.71         7
          MY       0.33      0.17      0.22         6
          NG       1.00      0.83      0.91         6
          NZ       0.50      0.50      0.50         6
          PH       0.11      0.14      0.12         7
          PK       0.33      0.33      0.33         6
          SG       0.1

GLOVE!!

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def load_glove_embeddings(glove_file):
    embeddings_index = {}
    with open(glove_file, "r", encoding="utf-8") as file:
        for line in tqdm(file, desc="Loading GloVe"):
            try:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = vector
            except ValueError:
                print(f"Skipping line: {line[:50]}...")
    return embeddings_index

glove_file_path = "/content/drive/MyDrive/text/glove.840B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Loading GloVe: 54813it [00:09, 12370.89it/s]

Skipping line: . . . -0.1573 -0.29517 0.30453 -0.54773 0.098293 -...


Loading GloVe: 129116it [00:18, 6789.22it/s]

Skipping line: at name@domain.com 0.0061218 0.39595 -0.22079 0.78...


Loading GloVe: 153601it [00:21, 12391.43it/s]

Skipping line: . . . . . -0.23773 -0.82788 0.82326 -0.91878 0.358...


Loading GloVe: 201014it [00:29, 6546.06it/s]

Skipping line: to name@domain.com 0.33865 0.12698 -0.16885 0.5547...


Loading GloVe: 210531it [00:31, 4373.45it/s]

Skipping line: . . 0.035974 -0.024421 0.71402 -0.61127 0.012771 -...


Loading GloVe: 221795it [00:33, 5885.24it/s]

Skipping line: . . . . 0.033459 -0.085658 0.27155 -0.56132 0.6041...


Loading GloVe: 254742it [00:38, 10897.89it/s]

Skipping line: email name@domain.com 0.33529 0.32949 0.2646 0.642...


Loading GloVe: 366392it [00:51, 6868.04it/s]

Skipping line: or name@domain.com 0.48374 0.49669 -0.25089 0.9038...


Loading GloVe: 533885it [01:13, 12000.62it/s]

Skipping line: contact name@domain.com 0.016426 0.13728 0.18781 0...


Loading GloVe: 718887it [01:35, 10944.53it/s]

Skipping line: Email name@domain.com 0.37344 0.024573 -0.12583 0....


Loading GloVe: 835450it [01:48, 7714.17it/s] 


In [None]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
def text_to_embedding(text, embeddings_index, embedding_dim=300):
    tokens = word_tokenize(text.lower())
    word_vectors = [embeddings_index.get(word) for word in tokens if word in embeddings_index]
    if len(word_vectors) == 0:
        return np.zeros(embedding_dim)
    return np.mean(word_vectors, axis=0)

df = pd.read_excel("/content/drive/MyDrive/text/GLOVEDATSASET.xlsx")

embedding_dim = 300  # size of GloVe vectors
GloVe_X = np.array([text_to_embedding(text, glove_embeddings, embedding_dim) for text in tqdm(df["text"], desc="processing")])
y = df["country"]

# Train-test split
GloVe_X_train, GloVe_X_test, y_train, y_test = train_test_split(GloVe_X, y, test_size=0.2, random_state=3, stratify=y)

processing: 100%|██████████| 620/620 [00:06<00:00, 93.13it/s] 


In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

In [None]:
y_train

Unnamed: 0,country
587,US
289,KE
84,CA
154,GH
93,GB
...,...
549,TZ
387,NG
305,KE
503,SG


GloVe testing

In [None]:
print("Logistic Regression:")
GloVe_lr_model = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=3)
evaluate_model(GloVe_lr_model, GloVe_X_train, GloVe_X_test, y_train, y_test)

Logistic Regression:
Accuracy: 0.1532258064516129
              precision    recall  f1-score   support

          AU       1.00      0.14      0.25         7
          BD       0.00      0.00      0.00         6
          CA       0.00      0.00      0.00         6
          GB       0.25      0.29      0.27         7
          GH       0.33      0.33      0.33         6
          HK       0.00      0.00      0.00         6
          IE       0.75      0.50      0.60         6
          IN       0.00      0.00      0.00         6
          JM       0.17      0.14      0.15         7
          KE       0.17      0.17      0.17         6
          LK       0.00      0.00      0.00         7
          MY       0.50      0.17      0.25         6
          NG       0.00      0.00      0.00         6
          NZ       0.25      0.17      0.20         6
          PH       0.00      0.00      0.00         6
          PK       0.23      0.50      0.32         6
          SG       0.25      0.

In [None]:
print("Random Forest Classifier:")
GloVe_rf_model = RandomForestClassifier(random_state=3, class_weight='balanced')
evaluate_model(GloVe_rf_model, GloVe_X_train, GloVe_X_test, y_train, y_test)

Random Forest Classifier:
Accuracy: 0.22580645161290322
              precision    recall  f1-score   support

          AU       0.25      0.29      0.27         7
          BD       0.17      0.17      0.17         6
          CA       0.20      0.17      0.18         6
          GB       0.00      0.00      0.00         7
          GH       0.31      0.67      0.42         6
          HK       0.29      0.33      0.31         6
          IE       0.50      0.67      0.57         6
          IN       0.25      0.17      0.20         6
          JM       0.00      0.00      0.00         7
          KE       0.33      0.50      0.40         6
          LK       0.20      0.14      0.17         7
          MY       0.00      0.00      0.00         6
          NG       0.00      0.00      0.00         6
          NZ       0.30      0.50      0.38         6
          PH       0.00      0.00      0.00         6
          PK       0.33      0.50      0.40         6
          SG       0.40  

In [None]:
print("Multi-Layer Perceptron:")
GloVe_mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=3)
evaluate_model(GloVe_mlp_model, GloVe_X_train, GloVe_X_test, y_train, y_test)

Multi-Layer Perceptron:
Accuracy: 0.2661290322580645
              precision    recall  f1-score   support

          AU       0.17      0.14      0.15         7
          BD       0.33      0.33      0.33         6
          CA       0.00      0.00      0.00         6
          GB       0.00      0.00      0.00         7
          GH       0.20      0.17      0.18         6
          HK       0.40      0.33      0.36         6
          IE       0.45      0.83      0.59         6
          IN       0.12      0.17      0.14         6
          JM       0.40      0.29      0.33         7
          KE       0.36      0.67      0.47         6
          LK       0.50      0.14      0.22         7
          MY       0.11      0.17      0.13         6
          NG       0.33      0.17      0.22         6
          NZ       0.40      0.33      0.36         6
          PH       0.00      0.00      0.00         6
          PK       0.27      0.50      0.35         6
          SG       0.50     



BERT EMBEDDINGS!!

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from tqdm import tqdm
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def text_to_bert_embedding(text, model, tokenizer, max_length=512):
    tokens = tokenizer(text, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
    tokens = {key: val.to(device) for key, val in tokens.items()}
    with torch.no_grad():
        outputs = model(**tokens)
    # CLS token's embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

df = pd.read_excel("/content/drive/MyDrive/text/balanced_dataset.xlsx")

In [None]:
# BERT embeddings for the dataset
BERT_X = np.array([text_to_bert_embedding(text, bert_model, tokenizer) for text in tqdm(df["text"], desc="processing")])
y = df["country"]

# Train-test split
BERT_X_train, BERT_X_test, y_train, y_test = train_test_split(BERT_X, y, test_size=0.2, random_state=3, stratify=y)


processing:   0%|          | 0/620 [00:00<?, ?it/s][A
processing:   0%|          | 1/620 [00:06<1:01:54,  6.00s/it][A
processing:   0%|          | 2/620 [00:09<49:14,  4.78s/it]  [A
processing:   0%|          | 3/620 [00:13<44:48,  4.36s/it][A
processing:   1%|          | 4/620 [00:19<48:29,  4.72s/it][A
processing:   1%|          | 5/620 [00:22<43:35,  4.25s/it][A
processing:   1%|          | 6/620 [00:26<41:44,  4.08s/it][A
processing:   1%|          | 7/620 [00:29<38:40,  3.78s/it][A
processing:   1%|▏         | 8/620 [00:33<39:29,  3.87s/it][A
processing:   1%|▏         | 9/620 [00:36<37:17,  3.66s/it][A
processing:   2%|▏         | 10/620 [00:38<31:52,  3.13s/it][A
processing:   2%|▏         | 11/620 [00:40<27:56,  2.75s/it][A
processing:   2%|▏         | 12/620 [00:42<25:17,  2.50s/it][A
processing:   2%|▏         | 13/620 [00:44<23:24,  2.31s/it][A
processing:   2%|▏         | 14/620 [00:46<22:02,  2.18s/it][A
processing:   2%|▏         | 15/620 [00:49<24:20,  2.

In [None]:
print("Logistic Regression:")
BERT_lr_model = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=3)
evaluate_model(BERT_lr_model, BERT_X_train, BERT_X_test, y_train, y_test)

Logistic Regression:
Accuracy: 0.33064516129032256
              precision    recall  f1-score   support

          AU       0.29      0.29      0.29         7
          BD       0.50      0.50      0.50         6
          CA       0.25      0.33      0.29         6
          GB       0.29      0.29      0.29         7
          GH       0.38      0.50      0.43         6
          HK       0.43      0.50      0.46         6
          IE       0.40      0.33      0.36         6
          IN       0.25      0.33      0.29         6
          JM       0.33      0.14      0.20         7
          KE       0.12      0.17      0.14         6
          LK       0.33      0.29      0.31         7
          MY       0.25      0.17      0.20         6
          NG       0.67      0.33      0.44         6
          NZ       0.11      0.17      0.13         6
          PH       0.29      0.33      0.31         6
          PK       0.20      0.17      0.18         6
          SG       0.43      0

In [None]:
print("Random Forest Classifier:")
BERT_rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
evaluate_model(BERT_rf_model, BERT_X_train, BERT_X_test, y_train, y_test)

Random Forest Classifier:
Accuracy: 0.20967741935483872
              precision    recall  f1-score   support

          AU       0.25      0.29      0.27         7
          BD       0.11      0.17      0.13         6
          CA       0.18      0.33      0.24         6
          GB       0.12      0.14      0.13         7
          GH       0.43      0.50      0.46         6
          HK       0.12      0.17      0.14         6
          IE       0.38      0.50      0.43         6
          IN       0.00      0.00      0.00         6
          JM       0.00      0.00      0.00         7
          KE       0.25      0.33      0.29         6
          LK       0.00      0.00      0.00         7
          MY       0.20      0.17      0.18         6
          NG       0.40      0.33      0.36         6
          NZ       0.25      0.33      0.29         6
          PH       0.00      0.00      0.00         6
          PK       1.00      0.17      0.29         6
          SG       0.43  

In [None]:
print("Multi-Layer Perceptron:")
BERT_mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=3)
evaluate_model(BERT_mlp_model, BERT_X_train, BERT_X_test, y_train, y_test)

Multi-Layer Perceptron:
Accuracy: 0.22580645161290322
              precision    recall  f1-score   support

          AU       0.29      0.29      0.29         7
          BD       0.33      0.17      0.22         6
          CA       0.25      0.33      0.29         6
          GB       0.33      0.29      0.31         7
          GH       0.43      0.50      0.46         6
          HK       0.11      0.17      0.13         6
          IE       0.43      0.50      0.46         6
          IN       0.14      0.17      0.15         6
          JM       0.25      0.14      0.18         7
          KE       0.00      0.00      0.00         6
          LK       0.33      0.43      0.38         7
          MY       0.00      0.00      0.00         6
          NG       0.67      0.33      0.44         6
          NZ       0.00      0.00      0.00         6
          PH       0.00      0.00      0.00         6
          PK       0.17      0.17      0.17         6
          SG       0.20    

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
from tqdm import tqdm

In [4]:
class DialectDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }


df = pd.read_excel("/content/drive/MyDrive/text/balanced_dataset.xlsx")

label_mapping = {label: idx for idx, label in enumerate(df["country"].unique())}
df["label"] = df["country"].map(label_mapping)

BERTT_X_train, BERTT_X_test, BERTT_y_train, BERTT_y_test = train_test_split(df["text"], df["label"], test_size=0.2, stratify=df["label"], random_state=3)

In [5]:
def preprocess_texts(texts, tokenizer, max_length):
    inputs = tokenizer(
        texts.tolist(), max_length=max_length, truncation=True, padding="max_length", return_tensors="pt"
    )
    return inputs["input_ids"], inputs["attention_mask"]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_inputs, train_masks = preprocess_texts(BERTT_X_train, tokenizer, max_length=256)
test_inputs, test_masks = preprocess_texts(BERTT_X_test, tokenizer, max_length=256)

In [6]:
# Create datasets
train_dataset = DialectDataset(train_inputs, train_masks, BERTT_y_train.tolist())
test_dataset = DialectDataset(test_inputs, test_masks, BERTT_y_test.tolist())

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping))
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def train_model(model, train_loader, optimizer, criterion, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            _, preds = torch.max(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}: Loss = {total_loss / len(train_loader)}, Accuracy = {correct / total}")

In [8]:
train_model(model, train_loader, optimizer, criterion, device, epochs=3)

Epoch 1/3: 100%|██████████| 31/31 [26:26<00:00, 51.16s/it]


Epoch 1: Loss = 3.04086047603238, Accuracy = 0.04435483870967742


Epoch 2/3: 100%|██████████| 31/31 [26:14<00:00, 50.79s/it]


Epoch 2: Loss = 2.8819619763282036, Accuracy = 0.14717741935483872


Epoch 3/3: 100%|██████████| 31/31 [26:04<00:00, 50.47s/it]

Epoch 3: Loss = 2.4922544879298054, Accuracy = 0.3387096774193548





In [9]:
def predict(model, tokenizer, text, max_length=512):
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(
            text, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt"
        )
        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, prediction = torch.max(logits, dim=1)
    return prediction.item()

In [27]:
# Hong Kong example vv
text = "Still I admit that Plato 's world was not ours , that his scorn of trade and handicraft is fantastic , that he had no conception of a great industrial community such as that of the United States , and that such a community must and will shape its education to suit its own needs . If the usual education handed down to it from the past does not suit it , it will certainly before long drop this and try another ."
predicted_label = predict(model, tokenizer, text)
predicted_country = [country for country, label in label_mapping.items() if label == predicted_label][0]

print(f"Predicted country: {predicted_country}")

Predicted country: NZ


In [28]:
def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy}")
    print(classification_report(all_labels, all_preds, target_names=label_mapping.keys()))

In [29]:
evaluate_model(model, test_loader, device)

Accuracy: 0.3225806451612903
              precision    recall  f1-score   support

          AU       1.00      0.43      0.60         7
          BD       0.50      0.83      0.62         6
          CA       0.75      0.50      0.60         6
          GB       0.33      0.29      0.31         7
          GH       1.00      0.17      0.29         6
          HK       0.25      0.17      0.20         6
          IE       1.00      0.50      0.67         6
          IN       0.25      0.17      0.20         6
          JM       0.25      0.14      0.18         7
          KE       0.00      0.00      0.00         6
          LK       0.33      0.29      0.31         7
          MY       0.15      0.33      0.21         6
          NG       0.18      0.33      0.24         6
          NZ       0.20      0.50      0.29         6
          PH       0.00      0.00      0.00         6
          PK       0.25      0.17      0.20         6
          SG       0.40      0.67      0.50         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
