# Test CLD
Notebook for various tests of the pycld2 library

In [None]:
# Imports
import os
import random
import re
from collections import Counter

import pandas as pd
from pycld2 import detect
from sklearn.metrics import confusion_matrix, classification_report

Load data from Wiki files

In [None]:
# Build one class-balanced, shuffled list of (text, label) pairs per split.
# Labels are "no" for lines from the nowiki-* files and "nn" for lines from the
# nnwiki-* files (presumably Norwegian Wikipedia text; confirm against the
# code that produced ../res/wiki).
data = {}
for n in ["train", "val", "test"]:
    # Context managers close each file as soon as it has been read instead of
    # leaving the handle open until garbage collection. The encoding is pinned
    # so the result does not depend on the platform default.
    # NOTE(review): assumes the wiki files are UTF-8 - confirm.
    with open(f"../res/wiki/nowiki-{n}.txt", encoding="utf-8") as no_file:
        nowiki = [(line.strip(), "no") for line in no_file]
    with open(f"../res/wiki/nnwiki-{n}.txt", encoding="utf-8") as nn_file:
        nnwiki = [(line.strip(), "nn") for line in nn_file]

    # Downsample the larger class so both labels contribute equally many lines
    m = min(len(nowiki), len(nnwiki))
    sample = random.sample(nowiki, m)
    sample += random.sample(nnwiki, m)

    # Interleave the two classes (random is not seeded here, so the order and
    # the drawn subset differ between runs)
    random.shuffle(sample)

    data[n] = sample

# Split each list of pairs into parallel tuples of texts (x_*) and labels (y_*)
x_train, y_train = zip(*data["train"])
x_val, y_val = zip(*data["val"])
x_test, y_test = zip(*data["test"])

Load data from WiLI dataset

In [None]:
# Each WiLI split is kept as a (texts, labels) pair of line lists. The x_* and
# y_* files are expected to be line-aligned (line i of y labels line i of x).
# Reading inside `with` closes the files right away, and lists (unlike the raw
# file iterators used before) can be iterated more than once when a cell is
# re-run. Lines keep their trailing newline, exactly as file iteration yields.
# NOTE(review): assumes the WiLI-2018 files are UTF-8 - confirm.
with open("../res/wili-2018/x_train.txt", encoding="utf-8") as x_file, \
        open("../res/wili-2018/y_train.txt", encoding="utf-8") as y_file:
    wili_train = x_file.readlines(), y_file.readlines()

with open("../res/wili-2018/x_test.txt", encoding="utf-8") as x_file, \
        open("../res/wili-2018/y_test.txt", encoding="utf-8") as y_file:
    wili_test = x_file.readlines(), y_file.readlines()

In [None]:
# Run cld2 over the WiLI test split and collect comparable (pred, true) codes.
df = pd.read_csv("../res/wili-2018/labels.csv", delimiter=";")

# WiLI label -> wiki code, built once instead of filtering the DataFrame for
# every test line. setdefault keeps the first row for a duplicated label,
# matching the previous `.values[0]` behaviour.
wiki_code_by_label = {}
for wili_label, wiki_code in zip(df["Label"], df["Wiki Code"]):
    wiki_code_by_label.setdefault(wili_label, wiki_code)

# A few more adjustments so wiki codes line up with the codes cld reports
conv = {"arz": "ar", "tcy": "kn", "he": "iw", "xmf": "ka", "be-tarask": "be", "zh-classical": "zh",
        "zh-yue": "zh", "jv": "jw"}

# Longer-than-two-letter codes that are still accepted as ground truth
extra_codes = {"chr", "sco", "war", "zh-Hant"}

# Clean text so cld doesn't crash. The previous pattern "[]" is not a valid
# regular expression (unterminated character set).
# NOTE(review): the original bracket contents were lost; this strips C0/C1
# control characters except tab/newline/CR - confirm this is the intended set.
bad_chars = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")

pred = []
true = []
# wili_test is a (texts, labels) pair; zip(*...) walks the two in lockstep.
# Iterating the pair directly would yield the two containers themselves
# rather than one (text, label) per line.
for text, label in zip(*wili_test):
    text = bad_chars.sub("", text).strip()
    label = label.strip()
    try:
        d = detect(text, returnVectors=True, bestEffort=True)
        p = d[2][0][1]  # Number one prediction

        # Use wiki code from WiLI (matches better with cld codes); labels
        # missing from labels.csv fall back to "other"
        t = wiki_code_by_label.get(label, "other")
        t = conv.get(t, t)

        # Keep the sample only if its true code is two letters long, equals
        # the prediction, or is one of the explicitly allowed longer codes
        if len(t) == 2 or t == p or t in extra_codes:
            pred.append(p if p != "zh-Hant" else "zh")
            true.append(t)
    except Exception as e:
        # Best effort: report the offending text and carry on with the rest
        print(text)
        print(e)


Compute statistics: the most common misclassifications, the confusion matrix, and the classification report.

In [None]:
# Tally every (predicted, expected) pair where the guess was wrong, leaving out
# predictions of "un" (presumably cld2's code for "unknown" - confirm).
counter = Counter()
for guess, gold in zip(pred, true):
    if guess != gold and guess != "un":
        counter[(guess, gold)] += 1

# Most frequent confusions first
print(counter.most_common())

# Alternative view: print each matrix row as a list to see every cell.
matrix = confusion_matrix(true, pred)
print(matrix)

report = classification_report(true, pred)
print(report)

Language identification for the additional speaker texts

In [None]:
# Detect the language of every file in ../res/speaker_texts and write one CSV
# row per file to speakers.csv.
files = os.listdir("../res/speaker_texts")
# Case-insensitive sort in which æ, ø, å are mapped to "{", "|", "}" - the
# ASCII characters right after "z" - so those letters sort after z, in that order.
files = sorted(files, key=lambda x: x.lower().replace("æ", "{").replace("ø", "|").replace("å", "}"))

# Removes everything from the start of a line up to and including the first
# tab, together with the preceding newline (presumably a speaker-name column -
# confirm against the input format). Compiled once, outside the loop.
speaker_prefix = re.compile(r"(^|\n)[^\t]+\t")

header = "file,is_reliable,bytes," + ",".join(
    f"lang{i}_name,lang{i}_code,lang{i}_percentage,lang{i}_score" for i in range(3)) + "\n"

# The with-block guarantees the CSV is flushed and closed even if detection
# raises part-way through; previously the handle was never closed.
# NOTE(review): input and output use the platform default encoding, as before.
with open("speakers.csv", "w") as out:
    out.write(header)

    for f in files:
        print(f)
        with open(f"../res/speaker_texts/{f}") as src:
            txt = src.read()
        txt = speaker_prefix.sub("", txt)
        det = detect(txt, hintTopLevelDomain="no")

        # Flatten the nested result tuple into CSV fields by stripping all
        # whitespace and parentheses from its repr (string quotes are kept)
        out.write(f + "," + re.sub(r"[\s()]+", "", str(det)) + "\n")
