# Data Analysis

- Read the `.tsv` files inside each `*.conll` sub-directory and write out the unique lists of entities, entity IDs, and relations, together with their occurrence counts

In [2]:
import re
import os
from collections import Counter


def write_unique_keys(values, file_path):
    """Write every distinct value and its occurrence count to a text file.

    One line is written per distinct value, in ascending sorted order, as the
    value followed by a tab character and the number of times it occurred.

    Args:
        values: Iterable of hashable, mutually sortable items.
        file_path: Destination path; an existing file is overwritten.
    """
    counts = Counter(values)
    # Encode explicitly as UTF-8 so non-ASCII values are written identically
    # on every platform instead of depending on the locale's default encoding.
    with open(file_path, "w", encoding="utf-8") as fo:
        for key in sorted(counts):
            print(f"{key}\t{counts[key]}", file=fo)
    

input_dir = "./data/VLSP2020_RE_training_fixed"
# Use this path instead to analyse the other data directory:
# input_dir = "./data/VLSP2020_RE_dev_fixed"

# Keep only sub-directories whose name contains ".conll". Sorting makes the
# processing order (and therefore the error report) reproducible, because
# os.listdir() returns entries in arbitrary order.
subdirs = sorted(d for d in os.listdir(input_dir)
                 if os.path.isdir(os.path.join(input_dir, d))
                 and re.search(r"\.conll", d))
print("Number of sub-directories: {}".format(len(subdirs)))

# All outputs of this cell go to ./tmp; create it so a fresh checkout can run.
os.makedirs("./tmp", exist_ok=True)

# Raw values of tab-separated columns 3..6 collected over all matching rows.
field3, field4, field5, field6 = [], [], [], []

# The context manager guarantees the error report is flushed and closed even
# if parsing raises midway. UTF-8 is explicit because the report echoes the
# raw data lines, which must not depend on the locale's default encoding.
with open("./tmp/errors.txt", "w", encoding="utf-8") as error_file:
    for subdir in subdirs:
        subdir_ = os.path.join(input_dir, subdir)
        files = sorted(f for f in os.listdir(subdir_) if re.search(r"\.tsv", f))
        for file in files:
            file_path = os.path.join(subdir_, file)
            with open(file_path, 'r', encoding="utf-8") as fi:
                # i is the 1-based line number used in the error report.
                for i, line in enumerate(fi, start=1):
                    line = line.rstrip()
                    # Skip blank lines and "#Text=..." lines.
                    if line == "" or re.search(r"#Text=(.+)$", line):
                        continue
                    # NOTE(review): only rows whose id starts with "1-" are
                    # analysed; confirm that ids such as "2-1" never occur.
                    if not re.search(r"^1-\d+", line):
                        continue

                    fields = line.split("\t")
                    # Common suffix locating the offending row in the report.
                    location = f"{subdir}/{file}\tLine: {i}\t{line}"

                    if len(fields) > 3:
                        field3.append(fields[3])
                    if len(fields) > 4:
                        field4.append(fields[4])
                        # Column 4 must not start with "*".
                        if fields[4].startswith("*"):
                            print(location, file=error_file)
                    if len(fields) > 5:
                        field5.append(fields[5])
                        # Column 5 must not contain "*" anywhere.
                        if "*" in fields[5]:
                            print(f"{fields[5]}\t{location}", file=error_file)
                    if len(fields) > 6:
                        field6.append(fields[6])
                        # Column 6 may only contain digits, "-", "[", "]",
                        # "_", ".", "|" and whitespace.
                        if not re.search(r"^[0-9-\[\]_\.|\s]*$", fields[6]):
                            print(f"{fields[6]}\t{location}", file=error_file)
                        # Columns 5 and 6 are "|"-separated lists that must
                        # have the same number of items.
                        entity1_ids = fields[6].split("|")
                        rel_types = fields[5].split("|")
                        if len(entity1_ids) != len(rel_types):
                            print(f"{fields[6]}\t{location}", file=error_file)

write_unique_keys(field3, "tmp/field3.txt")
write_unique_keys(field4, "tmp/field4.txt")
write_unique_keys(field5, "tmp/field5.txt")
write_unique_keys(field6, "tmp/field6.txt")

Number of sub-directories: 506


## List the data files that contain `PERSONAL - SOCIAL`


In [5]:
# Collect the path of every .tsv file that has at least one line containing
# the text "PERSONAL - SOCIAL".
res = set()
for subdir in subdirs:
    subdir_ = os.path.join(input_dir, subdir)
    files = [f for f in os.listdir(subdir_) if re.search(r"\.tsv", f)]
    for file in files:
        file_path = os.path.join(subdir_, file)
        # Read as UTF-8 explicitly so decoding does not depend on the
        # locale's default encoding.
        with open(file_path, 'r', encoding="utf-8") as fi:
            # any() stops reading the file at the first matching line; a
            # plain substring test is enough because the pattern is literal.
            if any("PERSONAL - SOCIAL" in line for line in fi):
                res.add(file_path)
# Last expression of the cell: display the matching paths in sorted order.
sorted(res)

['./data/VLSP2020_RE_training_fixed/23351260.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351416.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351426.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351493.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351515.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351516.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351519.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351563.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351564.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351569.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351612.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351615.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351627.conll/CURATION_USER.tsv',
 './data/VLSP2020_RE_training_fixed/23351649.conll/CURATION_USER.tsv',
 './da