TODO:
- Adjust fuzz ratio to catch all labels

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm

# Paths of every JSON document in the train / test folders.
train_example_paths = glob.glob('data/train/*.json')
test_example_paths = glob.glob('data/test/*.json')

# Document names are the JSON file names without directory and extension.
# They are derived from the globbed paths so that names and paths always
# describe the same set of files (stray non-JSON entries in the folders are
# ignored) and so that `name + '.json'` always points back at the file.
train_example_names = [
    os.path.splitext(os.path.basename(path))[0] for path in train_example_paths
]
test_example_names = [
    os.path.splitext(os.path.basename(path))[0] for path in test_example_paths
]

# Label metadata is read once; the two views below keep the rows whose `Id`
# matches a document found in the train / test folder respectively.
metadata = pd.read_csv('data/train.csv')
metadata_train = metadata.loc[metadata.Id.isin(train_example_names)]
metadata_test = metadata.loc[metadata.Id.isin(test_example_names)]

In [2]:
def clean_text(txt):
    """Lower-case ``txt`` and replace each non-alphanumeric run with one space.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. The result is not stripped: separators at either end of the
    input become a single leading/trailing space.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

def remove_punc(txt):
    """Replace each run of non-alphanumeric characters in ``txt`` with a space.

    Case is preserved. The argument is converted with ``str()`` first, and
    the result is not stripped of leading/trailing spaces.
    """
    as_text = str(txt)
    return re.sub('[^A-Za-z0-9]+', ' ', as_text)

def get_doc_id(doc_path):
    """Return the document id encoded in ``doc_path``.

    The id is the file name without its directory and without its final
    extension, e.g. ``data/train/abc.json`` -> ``abc``.
    """
    # Derive the id from the path that was passed in (not from a fixed
    # module-level entry), so each document resolves to its own id.
    file_name = os.path.basename(doc_path)
    return os.path.splitext(file_name)[0]

def load_train_example(i: int):
    """Load the ``i``-th training document together with its metadata rows.

    Args:
        i: Position of the document in ``train_example_paths``.

    Returns:
        A dict with two keys: ``'doc'`` holds the parsed JSON content of the
        file and ``'meta'`` holds the rows of ``metadata`` whose ``Id``
        equals ``get_doc_id(doc_path)``.

    Raises:
        IndexError: If ``i`` is outside ``train_example_paths``.
    """
    doc_path = train_example_paths[i]
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale encoding, which breaks on non-ASCII characters.
    with open(doc_path, encoding='utf-8') as f:
        data = json.load(f)
    doc_meta = metadata.loc[metadata.Id == get_doc_id(doc_path)]
    return {'doc': data, 'meta': doc_meta}

def load_train_example_by_name(name):
    """Load and parse the training document ``data/train/<name>.json``.

    Args:
        name: Document name, i.e. the file name without the ``.json`` suffix.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If no such file exists.
    """
    doc_path = os.path.join('data/train', name + '.json')
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale encoding, which breaks on non-ASCII characters.
    with open(doc_path, encoding='utf-8') as f:
        return json.load(f)

def delete_file(filename):
    """Remove ``filename`` if it exists; do nothing when it does not."""
    if not os.path.exists(filename):
        return
    os.remove(filename)

In [3]:
import random

# Independent copy of the training document names; this is the list of
# documents that gets processed further down.
docIdx = list(train_example_names)

## Generate Dataset and Features

In [4]:
def preprocess_tokenize_doc(doc_json):
    """Turn a parsed document into a flat list of tokens.

    ``doc_json`` is iterated and the ``'text'`` entry of every item is
    cleaned with ``remove_punc``. The cleaned sections are joined with
    spaces, whitespace is normalised with ``make_single_whitespace`` and the
    result is split on single spaces. Case is preserved.

    Note that a document without any text yields ``['']`` (one empty token),
    because an empty string split on ``' '`` is a one-element list.
    """
    cleaned_sections = (remove_punc(section['text']) for section in doc_json)
    flat_text = make_single_whitespace(' '.join(cleaned_sections))
    return flat_text.split(' ')

def indices(lst, element):
    """Return the positions in ``lst`` whose item contains ``element``.

    Matching uses the ``in`` operator on each item, so for strings this is a
    substring test rather than an equality test.
    """
    hits = []
    for position, candidate in enumerate(lst):
        if element in candidate:
            hits.append(position)
    return hits

# Matches any run of one or more whitespace characters.
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")


def make_single_whitespace(text):
    """Collapse each whitespace run in ``text`` to one space and trim the ends."""
    collapsed = _RE_COMBINE_WHITESPACE.sub(" ", text)
    return collapsed.strip()

## Create dataframe for tokens and targets

In [9]:
def get_doc(doc_id, reduce_tokens=False, reduce_size=150):
    """Build the token-level table for one training document.

    The document is tokenised with ``preprocess_tokenize_doc`` and every
    occurrence of one of its dataset labels (taken from ``metadata_train``)
    is marked. Labels are cleaned the same way as the text and compared
    case-insensitively against sliding windows of as many tokens as the
    label has words; a window matches when it contains the label as a
    substring (see ``indices``).

    Args:
        doc_id: Document name, as used in ``metadata_train.Id`` and by
            ``load_train_example_by_name``.
        reduce_tokens: When true, keep only the tokens close to a label
            occurrence instead of the whole document.
        reduce_size: Number of tokens kept before and after each occurrence
            when ``reduce_tokens`` is true.

    Returns:
        A DataFrame with the columns ``TOKEN`` (original-case token) and
        ``TARGET`` (``'1'`` for tokens inside a label occurrence, ``'0'``
        otherwise, stored as strings). With ``reduce_tokens`` the rows keep
        their original positional index.

    Raises:
        AssertionError: If a label cannot be found in the document.
    """
    raw_labels = list(metadata_train.loc[metadata_train.Id == doc_id, 'dataset_label'].values)
    labels = [make_single_whitespace(remove_punc(raw.strip())).lower() for raw in raw_labels]

    tokens = preprocess_tokenize_doc(load_train_example_by_name(doc_id))
    lowered = [tok.lower() for tok in tokens]
    n_tokens = len(tokens)

    # 1 marks a token that belongs to a dataset label, 0 everything else.
    is_target = np.zeros(n_tokens, dtype='uint8')
    # True marks a token that survives the optional context reduction.
    keep_mask = np.zeros(n_tokens, dtype='bool')

    for label in labels:
        width = len(label.split(' '))
        n_windows = len(lowered) - width + 1
        # Every run of `width` consecutive tokens, joined back into a string.
        windows = [' '.join(lowered[start:start + width]) for start in range(n_windows)]

        hits = indices(windows, label)
        assert len(hits) != 0, f'Label {label} not found in doc {doc_id}'

        for hit in hits:
            if reduce_tokens:
                first_kept = max(0, hit - reduce_size)
                last_kept = min(hit + reduce_size + width, n_tokens)
                keep_mask[first_kept:last_kept] = True
            is_target[hit:hit + width] = 1

    doc_df = pd.DataFrame({'TOKEN': tokens, 'TARGET': is_target})
    doc_df['TARGET'] = doc_df['TARGET'].astype('str')

    return doc_df.loc[keep_mask] if reduce_tokens else doc_df

In [10]:
# Write one TOKEN/TARGET CSV per training document.
# Make sure the output folder exists; to_csv does not create missing folders.
os.makedirs('data/processed_data', exist_ok=True)
for doc_id in tqdm(docIdx):
    # Process the document of the current iteration, so each CSV holds the
    # tokens of the document it is named after.
    doc_df = get_doc(doc_id)
    doc_df.to_csv(f'data/processed_data/{doc_id}.csv', index=False)

100%|██████████| 3/3 [00:00<00:00, 69.78it/s]
