In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm
import nltk
import random
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize
#from fuzzywuzzy import fuzz

train_example_paths = glob.glob('data/train/*.json')
test_example_paths = glob.glob('data/test/*.json')

train_example_names = [fn.split('.')[0] for fn in os.listdir('data/train')]
test_example_names = [fn.split('.')[0] for fn in os.listdir('data/test')]

metadata = pd.read_csv('data/train.csv')
docIdx = train_example_names.copy()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ozano\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def load_train_example_by_name(name):
    doc_path = os.path.join('data/train', name + '.json')
    with open(doc_path) as f:
        data = json.load(f)
    return data

def load_test_example_by_name(name):
    doc_path = os.path.join('data/test', name + '.json')
    with open(doc_path) as f:
        data = json.load(f)
    return data

## Create dataframe for tokens and targets

In [3]:
import unidecode

match_puncs_re = r"([.,!?()\-;\[\]+\\\/@:<>#_{}&%'*=" + r'"' + r"|])"
match_puncs_re = re.compile(match_puncs_re)

def text_cleaning(text):
    text = re.sub('[^A-Za-z0-9]+', ' ', str(text)).strip() # remove unnecessary literals

    text = re.sub(r'\[[0-9]+]', ' specialreference ', text)

    # Remove years
    text = re.sub(r'(19|20)[0-9][0-9]', ' specialyear ', text)

    # remove other digits
    text = re.sub(r'\d+', ' ', text)

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    # Remove websites
    text = ' '.join(['specialwebsite' if 'http' in t or 'www' in t else t for t in text.split(' ') ])

    return text.lower()

def text_cleaning_for_bert(text):
    # Keeps puncs, pads them with whitespaces

    text = text.replace('^', ' ')
    text = unidecode.unidecode(text)

    # Remove websites
    text = ' '.join(['specialwebsite' if 'http' in t or 'www' in t else t for t in text.split(' ') ])

    text = match_puncs_re.sub(r' \1 ', text)

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    return text.lower()

def text_cleaning_for_label(text):
    text = text.replace('^', ' ')
    text = unidecode.unidecode(text)

    text = re.sub('[^A-Za-z0-9]+', ' ', str(text)).strip() # remove unnecessary literals

    # Remove websites
    text = ' '.join(['specialwebsite' if 'http' in t or 'www' in t else t for t in text.split(' ') ])

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    return text.lower()

In [4]:
import string

##### STEP 1: Make a list of the known labels provided to us

temp_1 = [text_cleaning(x) for x in metadata['dataset_label']]
temp_2 = [text_cleaning(x) for x in metadata['dataset_title']]
temp_3 = [text_cleaning(x) for x in metadata['cleaned_label']]

existing_labels = temp_1 + temp_2 + temp_3
existing_labels = [l.lower() for l in existing_labels]
existing_labels = list(set(existing_labels))

# Sort labels by length in descending order
existing_labels = sorted(existing_labels, key = len, reverse= True)

In [5]:
pos_sentences = []
neg_sentences = []

def process_doc(doc_id):
    doc_json = load_train_example_by_name(doc_id)
    doc_text = ' '.join([sec['text'] for sec in doc_json])

    # Tokenize sentencewise
    sentences = sent_tokenize(doc_text)

    adni_count = 0
    for sentence in sentences:
        clean_sentence = text_cleaning(sentence)

        has_label = False
        label_is_adni = False
        for clean_label in existing_labels:
            if clean_label in clean_sentence:
                has_label = True

                # Remove label from the text, or model will overfit
                clean_sentence = clean_sentence.replace(clean_label, '')
                if 'adni' in clean_label or 'alzheimer' in clean_label:
                    adni_count += 1
                    label_is_adni = True

        if has_label and (adni_count <= 2 or not label_is_adni):
            clean_sentence = re.sub("\s+"," ", clean_sentence)
            pos_sentences.append(clean_sentence)
        else:
            if random.uniform(0, 1) < 0.25:
                neg_sentences.append(clean_sentence)

#get_doc(docIdx[0])[0]

## Create Dataset for All Documents

In [6]:
for doc_id in tqdm(docIdx):
    process_doc(doc_id)

print('')
print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

100%|██████████| 14316/14316 [06:06<00:00, 39.07it/s]
pos size: 28781
neg size: 1038592



## Save Dataset

In [7]:
import pickle

with open(f'data/sentence_classification_data_sklearn/pos.pkl', 'wb') as f:
    pickle.dump(pos_sentences, f)

with open(f'data/sentence_classification_data_sklearn/neg.pkl', 'wb') as f:
    pickle.dump(neg_sentences, f)

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

pos size: 28781
neg size: 1038592


## Load Dataset

In [10]:
import pickle

with open(f'data/sentence_classification_data_sklearn/pos.pkl', 'rb') as f:
    pos_sentences = pickle.load(f)

with open(f'data/sentence_classification_data_sklearn/neg.pkl', 'rb') as f:
    neg_sentences = pickle.load(f)

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

pos size: 28781
neg size: 1038592


## Preprocess Sentences

In [6]:
pos_sentences_processed = []
neg_sentences_processed = []
n_broken_sent = 0

def is_text_broken(tokens):
    # Some texts are like 'p a dsdv a d a ds f b', remove them
    if len(tokens) == 0:
        return True

    if len(tokens) < 50:
        return False

    one_char_token_ratio = len([l for l in tokens if len(l) == 1]) / len(tokens)
    return one_char_token_ratio > 0.2

def split_to_smaller_sent(tokens, s_size, overlap_size):
    # output sentences will be s_size + overlap_size long
    small_sents = []

    if len(tokens) <= s_size:
        return [tokens]

    n_parts = len(tokens) // s_size
    if len(tokens) % s_size != 0:
        n_parts += 1

    for i_part in range(n_parts):
        start_i = i_part * s_size
        if i_part > 0:
            start_i -= overlap_size

        end_i = min(len(tokens), (i_part + 1) * s_size)

        small_sents.append(tokens[start_i: end_i])

    return small_sents

def join_tuple_tokens(tuples):
    return ' '.join([t[1] for t in tuples])

def get_index(lst, el):
    try:
        return lst.index(el)
    except ValueError as e:
        for i, lst_el in enumerate(lst):
            if el in lst_el:
                return i
        
    raise ValueError(f'Element {el} not found in {lst}')

def process_pos_sentence(sentence):
    global n_broken_sent

    bert_sentence = text_cleaning_for_bert(sentence)
    label_sentence = text_cleaning_for_label(sentence)

    if is_text_broken(label_sentence.split(' ')): # Can't use bert cleaning for this, because all punc.s are padded with spaces
        n_broken_sent += 1
        return
    
    bert_tokens = bert_sentence.split(' ')
    ### STEP 1: Split into fixed sized sentences ###
    for small_sentence_tokens in split_to_smaller_sent(bert_tokens, s_size = 125, overlap_size = 25):
        small_bert_sentence = ' '.join(small_sentence_tokens)
        pos_sentences_processed.append(small_bert_sentence)


def process_neg_sentence(sentence):
    global n_broken_sent

    bert_sentence = text_cleaning_for_bert(sentence)
    label_sentence = text_cleaning_for_label(sentence)

    if is_text_broken(label_sentence.split(' ')): # Can't use bert cleaning for this, because all punc.s are padded with spaces
        n_broken_sent += 1
        return
    
    bert_tokens = bert_sentence.split(' ')
    ### STEP 1: Split into fixed sized sentences ###
    for small_sentence_tokens in split_to_smaller_sent(bert_tokens, s_size = 125, overlap_size = 25):
        small_bert_sentence = ' '.join(small_sentence_tokens)
        neg_sentences_processed.append(small_bert_sentence)

#process_pos_sentence(pos_sentences[2472])

## Generate Processed Sentences and Save

In [12]:
pos_sentences_processed = []
neg_sentences_processed = []
n_broken_sent = 0

for s in tqdm(pos_sentences):
    process_pos_sentence(s)

for s in tqdm(neg_sentences):
    process_neg_sentence(s)

import pickle

with open(f'data/sentence_classification_data_sklearn/pos_proc.pkl', 'wb') as f:
    pickle.dump(pos_sentences_processed, f)

with open(f'data/sentence_classification_data_sklearn/neg_proc.pkl', 'wb') as f:
    pickle.dump(neg_sentences_processed, f)

print(f'pos size: {len(pos_sentences_processed)}')
print(f'neg size: {len(neg_sentences_processed)}')

100%|██████████| 28781/28781 [00:01<00:00, 25064.10it/s]
100%|██████████| 1038592/1038592 [00:34<00:00, 30325.67it/s]
pos size: 29198
neg size: 1038066


## Load Processed Sentences

In [3]:
import pickle

with open(f'data/sentence_classification_data_sklearn/pos_proc.pkl', 'rb') as f:
    pos_sentences_processed = pickle.load(f)

with open(f'data/sentence_classification_data_sklearn/neg_proc.pkl', 'rb') as f:
    neg_sentences_processed = pickle.load(f)

print(f'pos size: {len(pos_sentences_processed)}')
print(f'neg size: {len(neg_sentences_processed)}')

pos size: 29198
neg size: 1038066


## Create Training Data

In [4]:
perm_idx = np.random.permutation(len(neg_sentences_processed))
neg_sentences_processed = [neg_sentences_processed[i] for i in perm_idx[:150000]]

all_sentences = pos_sentences_processed + neg_sentences_processed
y = np.zeros(len(all_sentences))
y[:len(pos_sentences_processed)] = 1

print(f'pos size: {len(pos_sentences_processed)}')
print(f'neg size: {len(neg_sentences_processed)}')
print(f'all_sentences size: {len(all_sentences)}')

del pos_sentences_processed
del neg_sentences_processed

In [7]:
"""from sklearn.model_selection import train_test_split

print('Splitting data...')
X_train, X_val, y_train, y_val = train_test_split(all_sentences, y, test_size=0.20, random_state=42)

del all_sentences
del y"""

"from sklearn.model_selection import train_test_split\n\nprint('Splitting data...')\nX_train, X_val, y_train, y_val = train_test_split(all_sentences, y, test_size=0.20, random_state=42)\n\ndel all_sentences\ndel y"

## Train Model

In [5]:
import os
import math
import random
import csv
import sys
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
import statistics as stats
from bert_sklearn import BertClassifier 

In [6]:
model = BertClassifier(bert_model='scibert-scivocab-uncased',
                        validation_fraction= 0.15,
                        max_seq_length=150,
                        train_batch_size=1,
                        warmup_proportion=0.02,
                        gradient_accumulation_steps=1
                        )

Building sklearn text classifier...


In [8]:
model.fit(all_sentences[:100000], y[:100000])

Loading scibert-scivocab-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 85000, validation data size: 15000
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:1005.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  :   0%|          | 86/85000 [00:24<3:59:14,  5.92it/s, loss=0.817]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000019D35E25550>
Traceback (most recent call last):
  File "C:\Users\ozano\.conda\envs\torch\lib\site-packages\torch\utils\data\dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "C:\Users\ozano\.conda\envs\torch\lib\site-packages\torch\utils\data\dataloader.py", line 1297, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "C:\Users\ozano\.conda\envs\torch\lib\multiprocessing\process

In [None]:
# save model to disk
savefile='data/sklearn_bert_classification.bin'
model.save(savefile)

In [13]:
def test_process_doc(doc_id):
    doc_json = load_test_example_by_name(doc_id)
    doc_text = ' '.join([sec['text'] for sec in doc_json])

    # Tokenize sentences
    sentences = sent_tokenize(doc_text)

    sentences_cleaned = [text_cleaning(s).lower() for s in sentences]

    X_test = vectorizer.transform(sentences_cleaned)

    return X_test, sentences

In [22]:
test_doc_id = test_example_names[3]
X_test, sentences = test_process_doc(test_doc_id)
doc_preds = model.predict([{'output': X_test}])[0]['output']

pos_pred_idx = np.argwhere(doc_preds == 1)[:, 0]
pos_pred_sentences = [sentences[i] for i in pos_pred_idx]

list(metadata.loc[metadata.Id == test_doc_id, 'cleaned_label'].values)

['rural urban continuum codes']

In [23]:
pos_pred_sentences

["Results from shoppers' intercept survey data collected at 13 stores in LI areas in nine Northeastern locations were compared with those obtained using secondary household food purchasing data from the Information Resource Incorporated (IRI) Consumer Network Panel (CNP) courtesy of the USDA Economic Research Service (ERS), and food expenditures from the Consumer Expenditure Survey (CES) of the US Bureau of Labor Statistics.",
 "Participants' county of residence was linked with the USDA (2013) Rural-Urban Continuum Codes (RUCCs) (United States Department of Agriculture, 2013) to determine if the household was located in an urban or rural area."]

In [None]:
# TODO: Check model using eli5

## Create NER Sentences