In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm
import nltk
import random
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize
import pickle

# Document ids are the JSON file names without their extension.
# - os.path.splitext keeps ids that themselves contain dots intact, and the
#   '.json' filter skips stray entries (e.g. hidden files or checkpoint
#   folders) that the loader could not open anyway.
# - os.listdir returns entries in arbitrary order, so the names are sorted to
#   make slices such as docIdx[:100] select the same documents on every run.
train_example_names = sorted(
    os.path.splitext(fn)[0] for fn in os.listdir('data/train') if fn.endswith('.json')
)
test_example_names = sorted(
    os.path.splitext(fn)[0] for fn in os.listdir('data/test') if fn.endswith('.json')
)

metadata = pd.read_csv('data/train.csv')
# Independent copy so later edits to docIdx do not alter train_example_names.
docIdx = train_example_names.copy()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ozano\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def load_train_example_by_name(name):
    """Load one training document from ``data/train/<name>.json``.

    Args:
        name: Document id, i.e. the file name without the ``.json`` suffix.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    doc_path = os.path.join('data/train', name + '.json')
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform default (e.g. cp1252 on Windows), which garbles or
    # rejects non-ASCII characters in the documents.
    with open(doc_path, encoding='utf-8') as f:
        return json.load(f)

def text_cleaning(text):
    """Reduce text to ASCII letters and digits separated by single spaces.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Every run of characters outside ``[A-Za-z0-9]`` (punctuation,
    whitespace, non-ASCII characters) is replaced by one space, and leading
    and trailing spaces are removed.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string; empty if nothing alphanumeric remains.
    """
    # Replace every run of non-alphanumeric characters with a single space.
    text = re.sub(r'[^A-Za-z0-9]+', ' ', str(text)).strip()

    # Collapse any remaining whitespace runs. The pattern is a raw string so
    # that "\s" reaches the regex engine instead of being treated as an
    # (invalid) Python string escape.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def process_doc(doc_id):
    """Return the sentences of one training document.

    The ``text`` field of every section in the document's JSON is joined
    with single spaces, and the combined text is split into sentences with
    NLTK's ``sent_tokenize``.

    Args:
        doc_id: Document id understood by ``load_train_example_by_name``.

    Returns:
        The list of sentence strings produced by ``sent_tokenize``.
    """
    sections = load_train_example_by_name(doc_id)
    full_text = ' '.join(section['text'] for section in sections)

    # Sentence-level split of the whole document.
    return sent_tokenize(full_text)

def get_doc(doc_id):
    """Return the sentences of document ``doc_id`` exactly as tokenised.

    Thin wrapper around ``process_doc``; the sentences are passed through
    unchanged (``text_cleaning`` is not applied here).
    """
    return process_doc(doc_id)

In [3]:
# Lower-cased words that may appear inside a run of capitalised tokens
# without interrupting it (compared against token.lower()).
# NOTE(review): 's' presumably covers the possessive "'s" left over after
# punctuation is stripped — confirm.
connection_tokens = {
    'and',
    'for',
    'from',
    'in',
    'of',
    'on',
    's',
    'the',
}

def group_ok(group_df):
    """Decide whether a run of tokens is substantial enough to keep.

    Args:
        group_df: DataFrame with one row per token and boolean columns
            ``is_conn_token`` and ``is_long_token``.

    Returns:
        True only if the run has at least two rows, at least two rows that
        are not connector tokens, and at least two rows flagged as long
        tokens; False otherwise.
    """
    n_tokens = len(group_df)
    if n_tokens >= 2:
        # Rows that are not connector words.
        content_count = n_tokens - group_df.is_conn_token.sum()
        if content_count >= 2 and group_df.is_long_token.sum() >= 2:
            return True

    return False

def get_connected_uppercase(tokens):
    """Extract runs of capitalised tokens (joined by connectors) from a sentence.

    A run is a maximal stretch of consecutive tokens in which every token
    either starts with an upper-case character or is a connector word
    (its lower-cased form is in ``connection_tokens``). A run is returned
    only if ``group_ok`` accepts it.

    Args:
        tokens: Sequence of string tokens for one sentence.

    Returns:
        A list of token lists, one per accepted run. Sentences with fewer
        than five tokens always yield an empty list.
    """
    if len(tokens) < 5:
        return []

    sent_df = pd.DataFrame({'token': tokens})
    sent_df['is_conn_token'] = sent_df.token.apply(lambda x: x.lower() in connection_tokens)
    sent_df['is_long_token'] = sent_df.token.str.len() > 4

    # Test the first character with .str[:1] rather than token[0]: an empty
    # token then yields '' (not upper-case) instead of raising IndexError,
    # and the column is computed in one vectorised pass instead of a
    # row-wise DataFrame.apply.
    starts_upper = sent_df.token.str[:1].str.isupper()
    sent_df['in_name'] = sent_df.is_conn_token | starts_upper

    # A new run id starts wherever in_name differs from the previous row.
    run_id = (sent_df.in_name.shift() != sent_df.in_name).cumsum()
    name_runs = [run for _, run in sent_df.groupby(run_id) if run.in_name.iloc[0]]

    return [list(run['token'].values) for run in name_runs if group_ok(run)]

def has_connected_uppercase(tokens, connectors=None, min_group_len=3):
    """Tell whether a sentence contains a run of capitalised words.

    Tokens are scanned left to right. A token that is longer than one
    character, starts with an upper-case character and is not a connector
    extends the current run. Connector words (in any casing) neither extend
    nor break the run. Any other token resets the run to zero.

    Args:
        tokens: Sequence of string tokens for one sentence.
        connectors: Optional set of lower-cased connector words; defaults to
            the module-level ``connection_tokens``.
        min_group_len: Run length at which the sentence is accepted
            (default 3, i.e. the original "more than two" rule).

    Returns:
        True if a run reaches ``min_group_len``, otherwise False. Sentences
        with fewer than five tokens always return False (the original
        returned ``[]`` here, which made the return type inconsistent).
    """
    if connectors is None:
        connectors = connection_tokens

    if len(tokens) < 5:
        return False

    group_len = 0
    for token in tokens:
        # Connectors are transparent: they never count and never reset.
        if token.lower() in connectors:
            continue

        if len(token) > 1 and token[0].isupper():
            group_len += 1
            if group_len >= min_group_len:
                return True
        else:
            group_len = 0

    return False

In [4]:
# Scan the first 100 documents and collect every raw sentence whose cleaned
# tokens pass has_connected_uppercase.
possible_labels = []
pbar = tqdm(docIdx[:100])

for doc_id in pbar:
    possible_labels.extend(
        s for s in get_doc(doc_id)
        if has_connected_uppercase(text_cleaning(s).split(' '))
    )

    # Show the running total of collected sentences on the progress bar.
    pbar.set_description(f'n_sents: {len(possible_labels)}')

n_sents: 2201: 100%|██████████| 100/100 [00:01<00:00, 80.98it/s]


In [6]:

# Drop duplicate sentences. Sorting makes the list, and therefore the written
# file, identical from run to run; iterating a bare set of strings gives a
# different order in each interpreter session because of hash randomisation.
possible_labels = sorted(set(possible_labels))

# One sentence per line, written as UTF-8.
# NOTE(review): a sentence may itself contain a newline if the tokenizer kept
# one — confirm whether the one-per-line format needs them replaced.
with open("data/possible_labels.txt", "w", encoding = 'utf-8') as f:
    for label in possible_labels:
        f.write(label + '\n')