# Language classification analysis

In [1]:
import os
import csv
import numpy as np
from collections import Counter
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
# Load the data from flores200
dev_dir = "data/flores200_dataset/dev"
dev_lines = []
dev_labels = []
for filename in os.listdir(dev_dir):
    if not filename.endswith(".dev"):
        continue
    label = filename[:-4]
    with open(os.path.join(dev_dir, filename), "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            dev_lines.append(line)
            dev_labels.append(label)
trg_langs = list(set(dev_labels))

In [3]:
# Create a vectorizer.
vectorizer = CountVectorizer(ngram_range=(1, 5), analyzer='char', min_df=2)

# Train the classifier.
print(dev_lines[:5])
print("Vectorizing data...")
vectorized_data = vectorizer.fit_transform(dev_lines)


['Tɛnɛndo, Stanford kɛnɛya sanfɛkalanso donnikɛbaw ye baana sƐgƐ sƐgƐli minan kura do dilani kumalase min ni a be se ka seliliw suguya woloma: pisi fitifitinin do min  bese ka dilan ni Ɛnpirmanti ankirima ye US wari tama ɲɔgɔnna a kelen o kelen songɔ ye .', 'Ɲinninikɛla jɔnjɔnw  ko ko o be se ka to boɔ bana, Sɔgɔsɔgɔnijɛ, SIDA bana ani Sumaya sƐgƐsƐgƐli joona ka se ka kƐ banabatɔw dɛsɛbato jamana la, yɔrɔ minw na ni boɔbanatɔ inafɔ sin na boɔ balota hakƐ bese ka dɔgɔya ni setigi jamanaw ta tila.', 'Pankunru Gripen JAS 39C binna pankunru jigi kɛnɛ dɔ kan sɔkɔma nɛkɛ kan ɲɛ 9:30 (0230 UTC) ni ka mɛnɛ, min naara kɛra sabu ye ka pankunru jiginkɛnɛ datugu jakokɛ pankunru ɲɛ. pankuluw ɲɛ.', 'Pankulu boli ba lakodɔnna ka kɛ Esekadɔron Leader Dilokrit Pattavee ye.', "Yen kunafoni dilaw y' a lase ko pankunruw sow tasuma fagamobili dɔ binna u tasuma fagatɔ."]
Vectorizing data...


In [4]:
# Create a classifier.
classifier = LogisticRegression(C=0.1, penalty='l2', solver='saga', multi_class='multinomial', verbose=1)
# classifier = LinearSVC(C=1.0, penalty='l2', multi_class='crammer_singer', verbose=1)

print("Training classifier...")
classifier.fit(vectorized_data, dev_labels)

Training classifier...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.28748756
Epoch 3, change: 0.21249547
Epoch 4, change: 0.17033078
Epoch 5, change: 0.12905634
Epoch 6, change: 0.10072557
Epoch 7, change: 0.08010951
Epoch 8, change: 0.06647310
Epoch 9, change: 0.05760336
Epoch 10, change: 0.05024778
Epoch 11, change: 0.04476511
Epoch 12, change: 0.04037381
Epoch 13, change: 0.03647386
Epoch 14, change: 0.03323122
Epoch 15, change: 0.03050382
Epoch 16, change: 0.02822779
Epoch 17, change: 0.02622157
Epoch 18, change: 0.02442056
Epoch 19, change: 0.02298158
Epoch 20, change: 0.02145350
Epoch 21, change: 0.02029932
Epoch 22, change: 0.01916717
Epoch 23, change: 0.01809062
Epoch 24, change: 0.01717335
Epoch 25, change: 0.01618069
Epoch 26, change: 0.01539998
Epoch 27, change: 0.01460681
Epoch 28, change: 0.01390642
Epoch 29, change: 0.01333829
Epoch 30, change: 0.01271897
Epoch 31, change: 0.01217977
Epoch 32, change: 0.01166206
Epoch 33, change: 0.01118856
Epoch 34, change: 0.01076730
Epoch 35, change: 0.010

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 175.8min finished


In [7]:
def load_hyp_from_tsv(filename: str) -> list[str]:
    with open(filename, "r") as f:
        # Load the third column from the csv reader after the header
        reader = csv.reader(f, delimiter="\t")
        next(reader)
        return [x[2] for x in reader]

# get all the files in the system_outputs folder
all_results = {}
for subdir in os.listdir("system_outputs"):
    # The sub_directory must be a directory
    subdir_path = os.path.join("system_outputs", subdir)
    if not os.path.isdir(subdir_path):
        raise ValueError(f"{subdir_path=} is not a directory")
    # get all the files in the sub_directory
    all_files = os.listdir(subdir_path)
    for trg_lang in trg_langs:
        # find the tsv file
        trg_file = [f for f in all_files if f.endswith(f"{trg_lang}.tsv")]
        if len(trg_file) == 1:
            all_results[f"{trg_lang} {subdir}"] = load_hyp_from_tsv(os.path.join(subdir_path, trg_file[0]))
            continue
        # find the hyp file
        trg_file = [f for f in all_files if f.endswith(f"{trg_lang}-devtest.hyp")]
        if len(trg_file) == 1:
            with open(os.path.join(subdir_path, trg_file[0]), "r") as f:
                all_results[f"{trg_lang} {subdir}"] = [x.strip() for x in f.readlines()]
            continue

In [8]:
all_langs = {}
for lang_and_model in sorted(all_results.keys()):
    trg_lang, model = lang_and_model.split()
    # Do classification
    data = all_results[f"{trg_lang} {model}"]
    encoded_data = vectorizer.transform(data)
    predicted_langs = list(classifier.predict(encoded_data))
    all_langs[f"{trg_lang} {model}"] = predicted_langs
    counts = Counter(predicted_langs)
    # Print top 5 counts
    top_5 = ", ".join(f"{lang}:{count}" for lang, count in counts.most_common(5))
    print(f"{trg_lang}, {model}: acc={counts[trg_lang]/len(predicted_langs):.2f}, top 5: {top_5}")

ace_Arab, nllb_moe: acc=0.11, top 5: min_Arab:492, bjn_Arab:338, ace_Arab:107, knc_Arab:15, ind_Latn:15
ace_Arab, tt-five: acc=0.58, top 5: ace_Arab:585, bjn_Arab:217, ind_Latn:61, min_Arab:46, ace_Latn:26
ace_Arab, tt-zero: acc=0.10, top 5: bjn_Arab:289, arb_Arab:119, ace_Arab:102, ace_Latn:81, ars_Arab:66
ace_Latn, nllb_moe: acc=0.68, top 5: ace_Latn:688, ind_Latn:208, zsm_Latn:58, sun_Latn:28, ban_Latn:9
ace_Latn, tt-five: acc=0.81, top 5: ace_Latn:823, ind_Latn:129, zsm_Latn:24, sun_Latn:11, luo_Latn:10
ace_Latn, tt-zero: acc=0.62, top 5: ace_Latn:626, ind_Latn:237, zsm_Latn:56, sun_Latn:38, min_Latn:17
acm_Arab, gpt4_tt-five: acc=0.19, top 5: arb_Arab:266, acm_Arab:189, ars_Arab:155, acq_Arab:118, aeb_Arab:75
acm_Arab, nllb_moe: acc=0.12, top 5: knc_Arab:297, arb_Arab:173, acm_Arab:120, ars_Arab:119, acq_Arab:102
acm_Arab, tt-five: acc=0.07, top 5: arb_Arab:339, ars_Arab:250, acq_Arab:107, aeb_Arab:78, acm_Arab:69
acm_Arab, tt-zero: acc=0.08, top 5: arb_Arab:318, ars_Arab:211, acq

In [10]:
all_models = sorted(list(set([x.split()[1] for x in all_results.keys()])))
lang_id_accuracies = {x: [] for x in all_models}
for lang_and_model, langid_results in all_langs.items():
    lang, model = lang_and_model.split()
    lang_id_accuracies[model].extend(1.0 if x == lang else 0 for x in langid_results)
for lang, accuracies in lang_id_accuracies.items():
    lang_id_accuracies[lang] = sum(accuracies) / len(accuracies)
    print(f"{lang} lang_id_accuracy: {lang_id_accuracies[lang]}")

gpt4_tt-five lang_id_accuracy: 0.8963932806324111
nllb_moe lang_id_accuracy: 0.9097447544884274
tt-five lang_id_accuracy: 0.8337097684923772
tt-zero lang_id_accuracy: 0.723943223193598


In [14]:
# Make the langid_data folder if it doesn't exist
if not os.path.exists("langid_data"):
    os.makedirs("langid_data")

with open("langid_data/all_results.json", "w") as f:
    json.dump(all_results, f)
with open("langid_data/all_langs.json", "w") as f:
    print(all_langs)
    json.dump(all_langs, f)
with open("langid_data/langid_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)

{'ace_Arab nllb_moe': array(['bjn_Arab', 'eng_Latn', 'ace_Arab', ..., 'min_Arab', 'bjn_Arab',
       'min_Arab'], dtype='<U8'), 'ace_Arab tt-five': array(['ace_Latn', 'ace_Arab', 'bjn_Arab', ..., 'ace_Arab', 'ace_Arab',
       'ace_Arab'], dtype='<U8'), 'ace_Arab tt-zero': array(['ace_Latn', 'arb_Arab', 'bjn_Arab', ..., 'arb_Arab', 'min_Arab',
       'acq_Arab'], dtype='<U8'), 'ace_Latn nllb_moe': array(['ace_Latn', 'ind_Latn', 'ace_Latn', ..., 'ind_Latn', 'zsm_Latn',
       'ace_Latn'], dtype='<U8'), 'ace_Latn tt-five': array(['ace_Latn', 'ind_Latn', 'ace_Latn', ..., 'ace_Latn', 'zsm_Latn',
       'ace_Latn'], dtype='<U8'), 'ace_Latn tt-zero': array(['ace_Latn', 'ind_Latn', 'ace_Latn', ..., 'zsm_Latn', 'ace_Latn',
       'ace_Latn'], dtype='<U8'), 'acm_Arab gpt4_tt-five': array(['ajp_Arab', 'arb_Arab', 'arb_Arab', ..., 'acm_Arab', 'arb_Arab',
       'ars_Arab'], dtype='<U8'), 'acm_Arab nllb_moe': array(['acm_Arab', 'knc_Arab', 'knc_Arab', ..., 'acq_Arab', 'ars_Arab',
       'acq_Arab'

TypeError: Object of type ndarray is not JSON serializable