In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm

# Paths of every JSON document in the train and test folders
train_example_paths = glob.glob('data/train/*.json')
test_example_paths = glob.glob('data/test/*.json')

# Document ids are the file names without their extension. splitext only drops
# the final extension, so name + '.json' always maps back to the original file.
train_example_names = [os.path.splitext(fn)[0] for fn in os.listdir('data/train')]
test_example_names = [os.path.splitext(fn)[0] for fn in os.listdir('data/test')]

# Read data/train.csv once and split its rows by the folder each document id belongs to
metadata = pd.read_csv('data/train.csv')
metadata_train = metadata.loc[metadata.Id.isin(train_example_names)]
metadata_test = metadata.loc[metadata.Id.isin(test_example_names)]

_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

def make_single_whitespace(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = _RE_COMBINE_WHITESPACE.sub(" ", text)
    return collapsed.strip()

def remove_punc(txt):
    """Replace each run of characters other than A-Z, a-z, 0-9 in ``str(txt)`` with one space."""
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    return non_alnum_run.sub(' ', str(txt))

def load_train_example_by_name(name):
    """Load one training document from ``data/train``.

    Args:
        name: Document id, i.e. the file name without the ``.json`` extension.

    Returns:
        The object decoded by ``json.load`` from ``data/train/<name>.json``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    doc_path = os.path.join('data/train', name + '.json')
    # JSON text is UTF-8 by specification; be explicit instead of relying on
    # the platform default encoding (which is not UTF-8 on Windows).
    with open(doc_path, encoding='utf-8') as f:
        return json.load(f)

def get_doc_id(doc_path):
    """Return the document id for ``doc_path``: its file name without the extension.

    Args:
        doc_path: Path (or bare file name) of a document file.

    Returns:
        The final path component with its last extension removed.
    """
    # Derive the id from the argument itself, not from a global list of names
    return os.path.splitext(os.path.basename(doc_path))[0]

def clean_text(txt):
    """Lower-case ``str(txt)`` and replace each run of non-alphanumeric characters with one space."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

def jaccard(str1, str2):
    """Jaccard similarity between the lower-cased, whitespace-split token sets of two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``len(intersection) / len(union)`` as a float, or ``0.0`` when neither
        string contains any token (the ratio is undefined in that case).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    # Both inputs empty or whitespace-only: avoid dividing by zero
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

# Load dataset names
# Raw string literal: the backslashes of the Windows path must never be read as escape sequences
df = pd.read_csv(r'C:\projects\personal\kaggle\kaggle_coleridge_initiative\data\data_set_26897.csv')
# Normalise titles: punctuation removed, whitespace collapsed, lower-cased
us_dataset_names = [make_single_whitespace(remove_punc(n)).lower() for n in df.title.values]

# Strip first so surrounding whitespace cannot distort the longest-first ordering
labels = [l.strip() for l in metadata.cleaned_label.unique()]
labels = sorted(labels, key = len, reverse = True)

In [None]:
def get_token_bigrams(label):
    """Return the consecutive token pairs of ``label`` as space-joined strings.

    Args:
        label: Text to split on single spaces.

    Returns:
        A list with one ``'<token> <next token>'`` string per adjacent pair,
        in order; empty when ``label`` has fewer than two tokens.
    """
    tokens = label.split(' ')
    # Pair every token with its successor
    return [f'{t1} {t2}' for t1, t2 in zip(tokens, tokens[1:])]

In [578]:
from nltk.corpus import stopwords
# NLTK's English stop words, minus a few words that are allowed to follow an abbreviation
stop_words = stopwords.words('english')
for t in ('and', 'was', 'in'):
    stop_words.remove(t)

# Abbreviations that are dropped outright when they are matched
banned_kw = [
    'STEM', 'FDA', 'SSH', 'FSIZE', 'PET', 'NCATE', 'TESOL', 'AVHRR-OI', 'ICT', 'AAEA',
    'BMI', 'ADGC', 'CDRSUM', 'NASS', 'MMSE', 'CDR', 'SPSS', 'LCRP', 'DML', 'ITU',
    'DRI', 'CIPSEA', 'IEP', 'NCES', 'BCG', 'HLM', 'MLLW', 'FDG', 'MRMC',
]

# A token following the abbreviation is rejected when it is one of the stop words
banned_after_tokens = stop_words

# Phrases (stored lower-cased) that disqualify a candidate name containing them
banned_values = [
    phrase.lower()
    for phrase in [
        'laboratory', 'body mass index', 'admission test', 'neural networks', 'accuracy of',
        'chain reaction', 'adversarial network', 'state exam', 'reform act', 'least',
        'labeling', 'principal components analysis', 'independent components analysis',
        'markov chain', 'monte carlo', 'bayesian information', 'family wise error',
        'posterior anterior', 'Bidirectional Encoder', 'Morphometry', 'Integral',
        'T2*weighted', 'T2-weighted', 'T2weighted', 'T1*weighted', 'T1-weighted',
        'T1weighted', 'EMCI', 'Learning Test', 'Gradepoint average', 'doctor of',
        'masters of', 'Expected Family Contribution', 'life in', 'Long Short Term',
        'Long ShortTerm', 'LSTM', 'lipoprotein', 'Support Vector Machine', 'User Interface',
        'National Institute of', 'glucose', 'Research Division', '%', 'Heating Weeks',
        'Public Management', 'Theory', 'Middle East respiratory', 'Discriminant Analysis',
        'boltzmann', 'Disease Control and Prevention', 'polymorphism',
        'positron emission tomography', 'dorsolateral', 'Data Analysis System',
        'Analysis Kit', 'Google', 'Principal Analysis', 'Cognitive Impairment',
        'Analysis of Variance',
    ]
]

In [579]:
def get_words_from_abbr_in_parantheses(match, doc_text):
    """Find the words written just before a parenthesised abbreviation.

    Locates the first occurrence of ``(match)`` in ``doc_text`` (falling back
    to ``(match;`` and then to ``(match``) and inspects a window of up to 200
    characters before it and 40 characters after the abbreviation.

    Args:
        match: Abbreviation as it appears inside the parentheses.
        doc_text: Full document text to search.

    Returns:
        A tuple ``(before_tokens, word_tokens, after_token)``:
        up to three tokens preceding the candidate name, the tokens of the
        candidate name itself, and the token right after the abbreviation
        (``""`` if there is none). ``([], [], "")`` is returned when too few
        tokens precede the abbreviation or the heuristics run out of tokens.

    Raises:
        ValueError: If the abbreviation is not found in ``doc_text``, or its
            punctuation-free form is not a standalone token of the window.
    """
    # Prefer the strictest delimiter: "(ABC)", then "(ABC;", then any "(ABC..."
    for closing in (')', ';', ''):
        match_i = doc_text.find(f'({match}{closing}')
        if match_i != -1:
            break
    else:
        raise ValueError(f'({match} not found in document text')

    # Abbreviation without punctuation/spaces; one preceding word is expected per character
    match_nopunc = remove_punc(match).replace(' ', '')
    n_tokens = len(match_nopunc)

    slice_start_i = max(match_i - 200, 0)
    slice_end_i = min(match_i + len(match) + 40, len(doc_text))
    doc_slice = doc_text[slice_start_i: slice_end_i]

    # Remove dates (four-digit sequences starting with 19 or 20)
    doc_slice = re.sub(r'(19|20)[0-9][0-9]', ' ', doc_slice)

    # Remove parentheses, semicolons and newlines; hyphens are deleted so hyphenated words stay one token
    doc_slice = doc_slice.replace('(', ' ').replace(')', ' ').replace(';', ' ').replace('-', '').replace('\n', ' ')

    tokens = doc_slice.split(' ')

    tokens = [t for t in tokens if len(t) > 0]
    # First token equal to the cleaned abbreviation; raises ValueError if absent
    match_token_i = tokens.index(match_nopunc)

    if match_token_i - n_tokens <= 0:
        return [], [], ""

    start_i = match_token_i - n_tokens
    end_i = match_token_i

    try:
        # If a prev token started with uppercase, use it
        if start_i > 1:
            if tokens[start_i - 2][0].isupper():
                start_i -= 2
            elif tokens[start_i - 1][0].isupper():
                start_i -= 1

        word_tokens = tokens[start_i:end_i]

        # Drop number token if it is coming first
        if word_tokens[0].isdigit():
            word_tokens = word_tokens[1:]

        # Remove 2 lowercase tokens from start
        for _ in range(2):
            if word_tokens[0][0].islower():
                word_tokens = word_tokens[1:]
                start_i += 1

    except IndexError:
        # word_tokens ran empty while trimming
        print(f'IndexError for {match}')
        return [], [], ""

    after_token = ""
    if len(tokens) > match_token_i + 1:
        after_token = tokens[match_token_i + 1]

    before_tokens = tokens[max(start_i - 3, 0) : start_i]

    return before_tokens, word_tokens, after_token


def tokens_are_dataset_name(tokens):
    """Heuristic check on capitalisation of the tokens longer than three characters.

    Returns True when at least one such token starts with an upper-case letter
    and fewer than four of them start with a lower-case letter; an empty token
    list is never accepted.
    """
    if not len(tokens):
        return False

    n_lower = 0
    n_upper = 0
    for token in tokens:
        # Short tokens are ignored by the heuristic
        if len(token) <= 3:
            continue
        first_char = token[0]
        if first_char.islower():
            n_lower += 1
        if first_char.isupper():
            n_upper += 1

    return n_upper > 0 and n_lower < 4

def after_token_ok(after_token):
    """Decide whether the token following an abbreviation is compatible with a dataset mention.

    An empty token is always accepted. Otherwise the token is rejected when any
    of the checks below fires.
    """
    if after_token == "":
        return True

    rejected = (
        # Listed in the after-token ban list
        after_token.lower() in banned_after_tokens
        # Probably plural
        or (after_token not in ['was', 'has', 'is', 'this'] and after_token[-1].lower() == 's')
        # Probably a link
        or 'http' in after_token
        # A reference. Datasets don't get referenced like that
        or ('[' in after_token and ']' in after_token)
        # Probably a link
        or '/' in after_token
        or 'cell' in after_token
    )
    return not rejected

def before_tokens_ok(before_tokens):
    """Accept the tokens preceding a candidate name unless they contain 'by' or 'adjusted'."""
    if len(before_tokens) == 0:
        return True

    return not any(word in before_tokens for word in ('by', 'adjusted'))


#get_words_from_abbr_in_parantheses('BDNF', doc_text)

In [590]:
# Position (in train_example_names) of the training document to inspect
i = 5113

# Id of that document; the cells below load and analyse it
doc_id = train_example_names[i]

In [591]:
doc_json = load_train_example_by_name(doc_id)

# Labels recorded for this document: stripped, punctuation removed, whitespace collapsed, lower-cased
raw_doc_labels = metadata_train.loc[metadata_train.Id == doc_id, 'dataset_label'].values
doc_labels = [make_single_whitespace(remove_punc(raw.strip())).lower() for raw in raw_doc_labels]

# Whole document as one string: the 'text' of every section joined by spaces
doc_text = ' '.join(section['text'] for section in doc_json)
print(doc_id)
print(doc_labels)

# Upper-case abbreviations (optional hyphen, optional trailing lower-case letter)
# that directly follow "(" and are closed by ")" or ";"
re_find_par = r'\(([A-Z]{2,}-?[A-Z]{1,}?[a-z]?)[\);]'
matches = set(re.findall(re_find_par, doc_text))
print(matches)

5c038729-c9a6-4521-b692-102a2295811a
['beginning postsecondary students longitudinal study', 'beginning postsecondary student', 'education longitudinal study', 'beginning postsecondary students', 'national education longitudinal study']
{'DEFTs', 'PSEFIRTY', 'NEB', 'PSE', 'PSEFIRDA', 'BPSLNKWT', 'DEFT', 'BYFCOMP', 'ANOVA', 'GED', 'SES', 'DAS', 'BYSES', 'IPEDS'}


In [592]:
# Abbreviation -> (candidate name, token after the abbreviation) for every match passing all checks
selected_mathces = {}
for m in matches:
    try:
        before_tokens, found_tokens, after_token = get_words_from_abbr_in_parantheses(m, doc_text)
    except Exception as e:
        print(f'Exception for {m}')
        raise e

    # Evaluate all three heuristics up front so every failure reason is reported
    cond1 = tokens_are_dataset_name(found_tokens)
    cond2 = after_token_ok(after_token)
    cond3 = before_tokens_ok(before_tokens)

    failure_reports = (
        (cond1, f'{m} : Tokens {found_tokens} do not make a dataset name.'),
        (cond2, f'{m} : Aftertoken {after_token} was in banlist.'),
        (cond3, f'{m} : Beforetokens {before_tokens} were in banlist.'),
    )
    for passed, report in failure_reports:
        if not passed:
            print(report)

    if all((cond1, cond2, cond3)):
        selected_mathces[m] = (' '.join(found_tokens), after_token)

selected_mathces

DEFTs : Tokens ['the', 'design', 'effects'] do not make a dataset name.
DEFTs : Aftertoken for was in banlist.
PSEFIRTY : Tokens ['consider', 'type', 'of', 'institution', 'first', 'attended'] do not make a dataset name.
PSEFIRDA : Tokens ['education', 'enrollment', 'date', 'for', 'valid', 'institutions'] do not make a dataset name.
BPSLNKWT : Tokens ['This', 'disturbance', 'term', 'inflated', 'the', 'weight'] do not make a dataset name.
BPSLNKWT : Aftertoken so was in banlist.
DEFT : Tokens ['dependent', 'variable'] do not make a dataset name.
SES : Tokens ['status'] do not make a dataset name.
SES : Aftertoken are was in banlist.
BYSES : Tokens ['1.', 'Low', 'SES'] do not make a dataset name.


{'NEB': ('Beginning Postsecondary Longitudinal Study', 'Data'),
 'PSE': ('Postsecondary Education', 'enrollment.'),
 'BYFCOMP': ('Low SES BYSES 2. Single parent family', '3.'),
 'ANOVA': ('Analysis of Variance', 'was'),
 'GED': ('General Educational Development', ','),
 'DAS': ('Data File Data Analysis System', 'see'),
 'IPEDS': ('Integrated Postsecondary Education Data System', 'data')}

In [593]:
# Drop by keyword: keep only abbreviations that are not listed in banned_kw
matches_not_banned = {
    abbr: found
    for abbr, found in selected_mathces.items()
    if abbr not in banned_kw
}
matches_not_banned

{'NEB': ('Beginning Postsecondary Longitudinal Study', 'Data'),
 'PSE': ('Postsecondary Education', 'enrollment.'),
 'BYFCOMP': ('Low SES BYSES 2. Single parent family', '3.'),
 'ANOVA': ('Analysis of Variance', 'was'),
 'GED': ('General Educational Development', ','),
 'DAS': ('Data File Data Analysis System', 'see'),
 'IPEDS': ('Integrated Postsecondary Education Data System', 'data')}

In [594]:
# Drop by text: remove candidates whose lower-cased name contains a banned phrase
matches_not_banned = {
    abbr: found
    for abbr, found in matches_not_banned.items()
    if not any(phrase for phrase in banned_values if phrase in found[0].lower())
}
matches_not_banned

{'NEB': ('Beginning Postsecondary Longitudinal Study', 'Data'),
 'PSE': ('Postsecondary Education', 'enrollment.'),
 'BYFCOMP': ('Low SES BYSES 2. Single parent family', '3.'),
 'ANOVA': ('Analysis of Variance', 'was'),
 'GED': ('General Educational Development', ','),
 'IPEDS': ('Integrated Postsecondary Education Data System', 'data')}

In [173]:
# Show the sections whose cleaned text contains the document's first label
l_search = doc_labels[0]
list(filter(lambda section: l_search in clean_text(section['text']), doc_json))

[{'section_title': 'Discussion',
  'text': 'For the first time, to the best of our knowledge, in older patients hospitalized for acute or subacute event with inflammatory process, PAL was determined with the gold standard DLW method as 1.3 ± 0.2, with daily energy intake (1420 kcal/d) being just sufficient to cover the daily energy requirements (1497 kcal/d). Our data did not show that TEE was increased in acutely or subacutely ill old patients with inflammatory process and malnutrition. However, although we observed relatively low values of REE, a negative association was found between the evolution of CRP and REE at baseline. These results could suggest a possible influence of inflammation on REE, although this could not definitely be confirmed.\nREE declines with age in older adults, as many studies have shown [29] [30] [31] . Generally, this decline is partly the result of the decrease in FFM, and although there is no consensus, several authors have shown a decreased REE adjusted f

{'ECLS-K', 'IEP', 'IRT', 'MD', 'NCES', 'SEM', 'SES'}

clues:
- between parentheses
- starts with "such as"
- has abbreviation between parentheses
- starts with capital letters or all capital letters

How to process:

Split into sentences

Keep uppercase letters.

remove []

keep ()

Capital letter words followed by (abbreviation)