# Import Modules

In [44]:
import os

from collections import Counter

In [98]:
import pandas as pd
import missingno as msgn

from tqdm import tqdm 

# Register tqdm's progress-aware helpers on pandas objects (e.g. `progress_apply`).
# NOTE(review): none of the cells visible in this notebook call them - confirm this is still needed.
tqdm.pandas()

# Load Data

In [6]:
# Root directory of the NER data tree.
# Set the NER_DATA_DIR environment variable to run the notebook on another
# machine; the original location is kept as the default so existing runs are
# unaffected.
abs_path = os.environ.get(
    'NER_DATA_DIR',
    '/home/alem/Alem_Sagandykov_Documents/Alem_Social/Location_Identifier/Named_Entity_Recognition/data/',
)

In [20]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are skipped
# and a warning is emitted for each of them.
# NOTE(review): the recorded output shows token and tag together in a single
# column, which suggests the file is not actually tab-separated - confirm `sep`.
data = pd.read_csv(os.path.join(abs_path, 'Russian/Alem_Tagged_Complaints/akerke_tagged/txt/train_all_data.txt'),
                   sep = '\t', on_bad_lines = 'warn')

In [14]:
# Show a bounded preview instead of dumping the whole frame into the output.
data.head(10)

Unnamed: 0,<DOCSTART>
0,Хотела O
1,бы O
2,выразить O
3,свое O
4,недовольство O
5,по O
6,поводу O
7,маршрута B-ORG
8,№ I-ORG
9,22 I-ORG


# Load: Parse Token/Tag File and Audit Tags

In [29]:
def parse_ner_file(path, pos_exist = False):
    """Parse a whitespace-separated token/tag file into parallel lists.

    Each non-empty line is split on whitespace. The first field is taken as
    the token; the tag is the second field, or the third field when
    ``pos_exist`` is true. Blank lines and lines that consist solely of
    ``<DOCSTART>`` are dropped, so sentence and document boundaries are not
    preserved in the result.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded file to read.
    pos_exist : bool, default False
        When true the tag is read from the third field (index 2) instead of
        the second (index 1).

    Returns
    -------
    tuple(list[str], list[str])
        ``(tags, sentences)`` - note the order. ``sentences`` holds one token
        per entry (not whole sentences) and both lists always have the same
        length. A line that has no field at the tag position gets the
        placeholder tag ``'ERROR'``.
    """
    tag_index = 2 if pos_exist else 1

    tags = []
    sentences = []

    with open(path, 'r', encoding = 'UTF-8') as file:
        for line in file:
            line = line.strip()

            # Skip blank separator lines and the document marker.
            if not line or line == '<DOCSTART>':
                continue

            fields = line.split()
            sentences.append(fields[0])

            # Compare against tag_index (not a fixed 1) so that a short line
            # is reported as 'ERROR' instead of raising IndexError when
            # pos_exist is true.
            tags.append(fields[tag_index] if len(fields) > tag_index else 'ERROR')

    return (tags, sentences)

In [37]:
# Parse the raw annotated file into parallel tag / token lists.
tags, sentences = parse_ner_file(
    path = os.path.join(
        abs_path,
        'Russian/Alem_Tagged_Complaints/akerke_tagged/txt/data.txt',
    )
)

In [35]:
# Number of parsed tags; parse_ner_file emits one tag per token, so this should equal len(sentences).
len(tags)

162114

In [38]:
# Number of parsed tokens (one entry per token, despite the variable name).
len(sentences)

162114

In [41]:
# One row per token with its tag.
# Note: this rebinds `data`, replacing the frame loaded earlier with read_csv.
data = pd.DataFrame({'sentences': sentences, 'tags': tags})

In [42]:
# Preview the first rows of the token/tag frame.
data.head()

Unnamed: 0,sentences,tags
0,Хотела,O
1,бы,O
2,выразить,O
3,свое,O
4,недовольство,O


In [46]:
# Count how often each distinct tag value occurs, to surface malformed labels.
counter_tags_error = Counter(data['tags'])

In [52]:
# Display the tag frequencies. In the recorded output, values other than O and
# B-/I- ORG, PER, LOC (e.g. 'B-0RG', 'BPER') look like annotation typos, and
# 'ERROR' is the placeholder parse_ner_file assigns to lines without a tag field.
counter_tags_error

Counter({'O': 149829,
         'B-ORG': 2683,
         'I-ORG': 3011,
         'B-PER': 1658,
         'B-LOC': 1935,
         'I-LOC': 2340,
         'I-PER': 634,
         'BI-ORG': 1,
         'B-PRE': 2,
         'B-PEP': 1,
         'BPER': 1,
         'ERROR': 10,
         'ORG': 1,
         'B-RG': 1,
         'B-EPR': 2,
         'B-0RG': 5})

In [70]:
# Sanity check: the counts should add up to the number of parsed tags, len(tags).
sum(counter_tags_error.values())

162114

# Normalized

In [90]:
# Re-parse the annotated file after tag normalization.
# NOTE(review): this reads the same data.txt path as the earlier load, yet the
# recorded tag counts differ - presumably the file was corrected on disk
# between the two runs; confirm, since a fresh Run All would give identical results.
tags_norm, sentences_norm = parse_ner_file(
    path = os.path.join(
        abs_path,
        'Russian/Alem_Tagged_Complaints/akerke_tagged/txt/data.txt',
    )
)

In [91]:
# Number of tags after re-parsing; should equal len(sentences_norm).
len(tags_norm)

162114

In [92]:
# Number of tokens after re-parsing (one entry per token).
len(sentences_norm)

162114

In [93]:
# One row per token with its tag, built from the re-parsed lists.
data_norm = pd.DataFrame({'sentences': sentences_norm, 'tags': tags_norm})

In [94]:
# Recount tag frequencies on the re-parsed frame.
counter_tags_error_norm = Counter(data_norm['tags'])

In [95]:
# Display the tag frequencies; the recorded output contains only O and B-/I- ORG, PER, LOC.
counter_tags_error_norm

Counter({'O': 149837,
         'B-ORG': 2690,
         'I-ORG': 3010,
         'B-PER': 1664,
         'B-LOC': 1936,
         'I-LOC': 2343,
         'I-PER': 634})

In [96]:
# Rows still carrying the parser's 'ERROR' placeholder; an empty frame is expected.
data_norm.loc[data_norm['tags'] == 'ERROR']

Unnamed: 0,sentences,tags


In [97]:
# Destination file for the cleaned token/tag pairs.
new_normalized_data = os.path.join(
    abs_path,
    'Russian/Alem_Tagged_Complaints/akerke_tagged/txt/clean_data.txt',
)

In [99]:
# Write one "token<TAB>tag" line per row.
# The encoding is set explicitly to match the UTF-8 used when reading, so the
# Cyrillic text does not depend on the platform's default encoding.
# NOTE(review): parse_ner_file drops blank lines and <DOCSTART> markers, so the
# written file has no sentence/document separators - confirm that is intended.
with open(new_normalized_data, 'w', encoding = 'UTF-8') as file:
    # Iterate the two columns together instead of doing a label lookup per row.
    rows = zip(data_norm['sentences'], data_norm['tags'])
    for sentence, tag in tqdm(rows, total = len(data_norm)):
        file.write(f"{sentence}\t{tag}\n")

100%|██████████| 162114/162114 [00:02<00:00, 72856.23it/s]
