#### Note:
    - I removed adni from labels, because it created noise in sentence labels. 

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm
import nltk
import random
from nltk.tokenize import word_tokenize,sent_tokenize
import pickle

train_example_names = [fn.split('.')[0] for fn in os.listdir('data/train')]
test_example_names = [fn.split('.')[0] for fn in os.listdir('data/test')]

metadata = pd.read_csv('data/train.csv')
docIdx = train_example_names.copy()

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'from', 'the', 'act', 'coast', 'future', 'system', 'per'}

## Dataset Name Selection

In [2]:
def text_cleaning(text):
    text = re.sub('[^A-Za-z]+', ' ', str(text)).strip() # remove unnecessary literals

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    return text.lower().strip()

def is_name_ok(text):
    if len([c for c in text if c.isalnum()]) < 4:
        return False
    
    tokens = [t for t in text.split(' ') if len(t) > 3]
    tokens = [t for t in tokens if not t in connection_tokens]
    if len(tokens) < 3:
        return False

    return True

with open('data/all_preds_selected.csv', 'r') as f:
    selected_pred_labels = f.readlines()
    selected_pred_labels = [l.strip() for l in selected_pred_labels]

existing_labels = [text_cleaning(x) for x in metadata['dataset_label']] +\
                  [text_cleaning(x) for x in metadata['dataset_title']] +\
                  [text_cleaning(x) for x in metadata['cleaned_label']] +\
                  [text_cleaning(x) for x in selected_pred_labels]

"""to_remove = [
    'frequently asked questions', 'total maximum daily load tmd', 'health care facilities',
    'traumatic brain injury', 'north pacific high', 'droplet number concentration', 'great slave lake',
    'census block groups'
]"""


"""df = pd.read_csv(r'C:\projects\personal\kaggle\kaggle_coleridge_initiative\string_search\data\gov_data.csv')
print(len(df))


df['title'] = df.title.apply(text_cleaning)
titles = list(df.title.unique())
titles = [t for t in titles if not t in to_remove]
df = pd.DataFrame({'title': titles})
df = df.loc[df.title.apply(is_name_ok)]
df = pd.concat([df, pd.DataFrame({'title': existing_labels})], ignore_index= True).reset_index(drop = True)
titles = list(df.title.unique())
df = pd.DataFrame({'title': titles})
df['title'] = df.title.apply(text_cleaning)"""

# Sort labels by length in ascending order
#existing_labels = sorted(list(df.title.values), key = len, reverse = True)

existing_labels = list(set(existing_labels))
existing_labels = sorted(existing_labels, key = len, reverse = True)
existing_labels = [l for l in existing_labels if len(l.split(' ')) < 15]
#del df
#existing_labels.remove('adni')

print(len(existing_labels))

389


In [3]:
existing_labels[:5]

['national center for science and engineering statistics survey of science and engineering research facilities',
 'national center for science and engineering statistics higher education research and development survey',
 'national science foundation survey of graduate students and postdoctorates in science and engineering',
 'national center for science and engineering statistics survey of industrial research and development',
 'national oceanic and atmospheric administration optimum interpolation sea surface temperature']

In [4]:
existing_labels[-5:]

['cfsr', 'kegg', 'fema', 'pwv', 'csf']

## Create dataframe for tokens and targets

In [5]:
def load_train_example_by_name(name):
    doc_path = os.path.join('data/train', name + '.json')
    with open(doc_path) as f:
        data = json.load(f)
    return data

def load_test_example_by_name(name):
    doc_path = os.path.join('data/test', name + '.json')
    with open(doc_path) as f:
        data = json.load(f)
    return data

## Make sentences

In [6]:
def text_cleaning_upper(text):
    text = re.sub('[^A-Za-z]+', ' ', str(text)).strip() # remove unnecessary literals

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    return text.strip()

def has_connected_uppercase(tokens):
    if len(tokens) < 5:
        return False

    group_len = 0
    n_long_tokens = 0
    for token in tokens:
        token_lower = token.lower()
        if token[0].isupper():
            if token_lower not in connection_tokens:
                if len(token) > 2:
                    n_long_tokens += 1

                group_len += 1
                if group_len > 2 and n_long_tokens > 0:
                    return True

        else:
            if token_lower not in connection_tokens:
                group_len = 0
                n_long_tokens = 0

    return False

def sent_has_acronym(tokens):
    # Acronym check
    for token in tokens:
        if len(token) > 3 and token.isupper():
            return True

    return False

def sent_is_candidate(clean_sentence):
    tokens = clean_sentence.split(' ')
    
    if sent_has_acronym(tokens):
        return True
    else:
        return has_connected_uppercase(tokens)
        

In [7]:
pos_sentences = []
neg_sentences = []
docs_no_pos = []
total_sentences = 0
label_use_counts = {l: 0 for l in existing_labels}


def process_doc(doc_id):
    """ Accept sentences with acronyms or uppercase words in succession as candidates.
    From those candidates, positives are the ones that contain a label.

    """
    global total_sentences
    doc_json = load_train_example_by_name(doc_id)
    doc_text = ' '.join([sec['text'] for sec in doc_json])
    doc_has_pos = False

    # Tokenize sentencewise
    sentences = sent_tokenize(doc_text)
    total_sentences += len(sentences)

    for sentence in sentences:
        clean_sentence = text_cleaning_upper(sentence)
        is_candidate = sent_is_candidate(clean_sentence)

        has_label = False
        if is_candidate:
            clean_sentence_lower = clean_sentence.lower()
            for clean_label in existing_labels:
                if re.search(r'\b{}\b'.format(clean_label), clean_sentence_lower):
                    has_label = True
                    label_use_counts[clean_label] = label_use_counts[clean_label] + 1
                    break
        
        # Store sentence in list if candidate
        # Non-candidate sentences are discarded
        if has_label:
            pos_sentences.append(sentence)
            doc_has_pos = True
        elif is_candidate:
            neg_sentences.append(sentence)

    if not doc_has_pos:
        docs_no_pos.append(doc_id)

#process_doc('0026563b-d5b3-417d-bd25-7656b97a044f')

## Generate and Save Sentences

In [8]:
import pickle
assert len(docIdx) > 0

pos_sentences = []
neg_sentences = []
docs_no_pos = []
total_sentences = 0

pbar = tqdm(docIdx)
for doc_id in pbar:
    process_doc(doc_id)
    pbar.set_description(\
        f'pos_size: {len(pos_sentences)}, neg_size: {len(neg_sentences)}, no pos label doc: {len(docs_no_pos)}, n_sentences: {total_sentences}')

with open(f'data/selected_sentences/pos.pkl', 'wb') as f:
    pickle.dump(pos_sentences, f)

with open(f'data/selected_sentences/neg.pkl', 'wb') as f:
    pickle.dump(neg_sentences, f)

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

pos_size: 175743, neg_size: 795925, no pos label doc: 184, n_sentences: 4178921: 100%|█| 14316/14316 [20:58<00:00, 11.3


pos size: 175743
neg size: 795925


In [9]:
pd.Series(label_use_counts).sort_values()

national water quality assesment                                                    0
complexity science hub covid control strategies list                                0
covid precision medicine analytics platform registry jh crown                       0
characterizing health associated risks and your baseline disease in sars cov        0
cas covid antiviral candidate compounds data                                        0
                                                                                ...  
ecls                                                                             7983
nces                                                                             8080
timss                                                                           13086
apoe                                                                            14561
adni                                                                            26690
Length: 389, dtype: int64