# Test CLD
Notebook for various tests of the pycld2 library

In [1]:
# Install dependencies
%pip install pandas pycld2 sklearn

Collecting pandas
  Downloading pandas-1.0.4-cp36-cp36m-manylinux1_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 1.6 MB/s eta 0:00:01
[?25hCollecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[K     |████████████████████████████████| 41.4 MB 152 kB/s  eta 0:00:01█▊                         | 8.7 MB 51.1 MB/s eta 0:00:01    |███████████                     | 14.1 MB 51.1 MB/s eta 0:00:011.1 MB/s eta 0:00:01███████▋                | 20.3 MB 51.1 MB/s eta 0:00:0122.5 MB 51.1 MB/s eta 0:00:01B 51.1 MB/s eta 0:00:01�█████████████████████▏         | 28.7 MB 51.1 MB/s eta 0:00:01   |█████████████████████████▏      | 32.6 MB 51.1 MB/s eta 0:00:01��███████████████████▍   | 36.7 MB 51.1 MB/s eta 0:00:01███████▍| 40.6 MB 51.1 MB/s eta 0:00:01
[?25hCollecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 52.8 MB/s eta 0:00:01
[?2

In [2]:
# Imports
import os
import random
import re
from collections import Counter

import pandas as pd
from pycld2 import detect
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

Load data from Wiki files

In [3]:
data = {}
for n in ["train", "val", "test"]:
    nowiki = [(line.strip(), "no") for line in open(f"res/wiki/nowiki-{n}.txt")]
    nnwiki = [(line.strip(), "nn") for line in open(f"res/wiki/nnwiki-{n}.txt")]

    m = min(len(nowiki), len(nnwiki))
    sample = random.sample(nowiki, m)
    sample += random.sample(nnwiki, m)

    random.shuffle(sample)

    data[n] = sample

x_train, y_train = zip(*data["train"])
x_val, y_val = zip(*data["val"])
x_test, y_test = zip(*data["test"])

In [22]:
pred = []
true = []
for x, y in [(x_train, y_train), (x_val, y_val), (x_test, y_test)]:
    for text, label in zip(x, y):
        # print(x[:10])
        try:
            d = detect(text, returnVectors=True, bestEffort=True)
            p = d[2][0][1]  # Number one prediction
            pred.append(p if p in {"no", "nn"} else "un")
            true.append(label)
        except Exception as e:
            # print(text)
            print(e)

counter = Counter((a, b) for a, b in zip(pred, true) if a != b and a != "un")

print(counter.most_common(100))

# for y in confusion_matrix(true, pred):
#     print(list(y))
print(confusion_matrix(true, pred))
print(classification_report(true, pred))

input contains invalid UTF-8 around byte 0 (of 32)
input contains invalid UTF-8 around byte 55 (of 85)
input contains invalid UTF-8 around byte 26 (of 131)
input contains invalid UTF-8 around byte 123 (of 173)
input contains invalid UTF-8 around byte 3 (of 113)
input contains invalid UTF-8 around byte 59 (of 127)
input contains invalid UTF-8 around byte 44 (of 163)
input contains invalid UTF-8 around byte 10 (of 128)
input contains invalid UTF-8 around byte 111 (of 159)
input contains invalid UTF-8 around byte 103 (of 154)
input contains invalid UTF-8 around byte 0 (of 54)
input contains invalid UTF-8 around byte 8 (of 19)
input contains invalid UTF-8 around byte 8 (of 39)
input contains invalid UTF-8 around byte 8 (of 38)
input contains invalid UTF-8 around byte 23 (of 41)
input contains invalid UTF-8 around byte 8 (of 23)
input contains invalid UTF-8 around byte 23 (of 219)
input contains invalid UTF-8 around byte 75 (of 94)
input contains invalid UTF-8 around byte 68 (of 225)
[(('nn

Load data from WiLI dataset

In [18]:
wili_train = zip(open("res/wili-2018/x_train.txt"), open("res/wili-2018/y_train.txt"))
wili_test = zip(open("res/wili-2018/x_test.txt"), open("res/wili-2018/y_test.txt"))

In [19]:
df = pd.read_csv("res/wili-2018/labels.csv", delimiter=";")

pred = []
true = []
for text, label in wili_test:
    text = re.sub("[]", "", text)  # Clean text so cld doesn't crash
    text = text.strip()
    label = label.strip()
    try:
        d = detect(text, returnVectors=True, bestEffort=True)
        p = d[2][0][1]  # Number one prediction
        
        # Use wiki code from WiLI (matches better with cld codes)
        t = df[df["Label"] == label]["Wiki Code"].values
        
        # a few more adjustments for matching
        t = t[0] if len(t) > 0 else "other"
        conv = {"arz": "ar", "tcy": "kn", "he": "iw", "xmf": "ka", "be-tarask": "be", "zh-classical": "zh",
                "zh-yue": "zh", "jv": "jw"}
        t = conv[t] if t in conv else t
        
        if len(t) == 2 or t == p or t in {"chr", "sco", "war", "zh-Hant"}:
            pred.append(p if p != "zh-Hant" else "zh")
            true.append(t)
    except Exception as e:
        # print(text)
        print(e)


input contains invalid UTF-8 around byte 1402 (of 46994)
input contains invalid UTF-8 around byte 104396 (of 124230)


Get stats.

In [20]:
counter = Counter((a, b) for a, b in zip(pred, true) if a != b and a != "un")

print(counter.most_common(100))

# for y in confusion_matrix(true, pred):
#     print(list(y))
print(confusion_matrix(true, pred))
print(classification_report(true, pred))

[(('en', 'sco'), 378), (('eo', 'io'), 347), (('nl', 'li'), 346), (('en', 'ia'), 290), (('en', 'ie'), 270), (('fr', 'wa'), 250), (('hr', 'bs'), 233), (('en', 'sc'), 225), (('hr', 'sh'), 198), (('cy', 'kw'), 197), (('en', 'wa'), 197), (('es', 'an'), 183), (('bs', 'sh'), 178), (('fi', 'se'), 171), (('en', 'kw'), 168), (('en', 'an'), 168), (('bs', 'hr'), 130), (('qu', 'ay'), 124), (('da', 'nv'), 115), (('ru', 'kv'), 112), (('en', 'av'), 111), (('uz', 'av'), 108), (('hi', 'bh'), 107), (('en', 'yo'), 107), (('sr', 'kv'), 105), (('sr', 'sh'), 94), (('fr', 'oc'), 92), (('en', 'ig'), 86), (('en', 'ku'), 82), (('en', 'li'), 79), (('co', 'sc'), 75), (('en', 'sn'), 71), (('en', 'la'), 69), (('en', 'io'), 65), (('sm', 'se'), 64), (('ru', 'os'), 63), (('af', 'li'), 63), (('gl', 'an'), 61), (('br', 'kw'), 59), (('en', 'km'), 58), (('en', 'bn'), 56), (('cs', 'ku'), 54), (('aa', 'nv'), 51), (('rw', 'io'), 48), (('ro', 'av'), 48), (('id', 'ms'), 48), (('tr', 'ku'), 47), (('ru', 'av'), 45), (('mn', 'os')

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          aa       0.00      0.00      0.00         0
          ab       0.00      0.00      0.00         0
          af       0.87      0.98      0.92       500
          ak       0.00      0.00      0.00         0
          am       1.00      0.99      0.99       500
          an       0.00      0.00      0.00       500
          ar       0.99      0.99      0.99      1000
          as       1.00      0.97      0.98       500
          av       0.00      0.00      0.00       500
          ay       1.00      0.68      0.81       500
          az       0.96      0.98      0.97       500
          ba       0.95      0.98      0.96       500
          be       0.92      0.98      0.95      1000
          bg       0.96      0.93      0.94       500
          bh       0.99      0.75      0.85       500
          bi       0.00      0.00      0.00         0
          bn       1.00      0.88      0.94       500
          bo       1.00    

Additional speaker text identification

In [None]:
files = os.listdir("res/speaker_texts")
files = sorted(files, key=lambda x: x.lower().replace("æ", "{").replace("ø", "|").replace("å", "}"))

d = open("speakers.csv", "w")
d.write("file,is_reliable,bytes," + ",".join(
    (f"lang{i}_name,lang{i}_code,lang{i}_percentage,lang{i}_score" for i in range(3))) + "\n")

for f in files:
    print(f)
    txt = open(f"res/speaker_texts/{f}").read()
    txt = re.sub(r"(^|\n)[^\t]+\t", "", txt)
    det = detect(txt, hintTopLevelDomain="no")

    d.write(f + "," + re.sub(r"[\s()]+", "", str(det)) + "\n")
