In [1]:
# additional python packages
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install -q ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install -q ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install -q ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets


In [2]:
import os
import re
import json
import simplejson
import time
import datetime
import random
import glob
import importlib

# dataset manipulation
import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# pytorch
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial

import pickle
from joblib import Parallel, delayed

from collections import defaultdict, Counter
import gc

# tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# language preprocessing
import nltk

from typing import *

# spacy
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

# set seed
sns.set()
random.seed(123)
np.random.seed(456)
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print('packages loaded')

2021-09-25 04:44:42.585096: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


packages loaded


In [3]:
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

COMPUTE_CV = False

if len(sample_submission)>4: COMPUTE_CV = False
if COMPUTE_CV:
    print('this submission notebook will compute CV score but commit notebook will not')
else:
    print('this submission notebook will only be used to submit result')
    

this submission notebook will only be used to submit result


In [4]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)

if COMPUTE_CV:
    sample_submission = train
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
    test_files_path = paper_test_folder
else:
    sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
    test_files_path = paper_test_folder

adnl_govt_labels_path = '../input/bigger-govt-dataset-list/data_set_800.csv'

In [5]:
MAX_SAMPLE = 0

train = train[:MAX_SAMPLE]

paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train['Id'].unique():
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [6]:
for paper_id in tqdm(sample_submission['Id']):
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

100%|██████████| 4/4 [00:00<00:00, 99.10it/s]


# Additional goverent dataset

In [7]:
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

In [8]:
tmp3 = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

tmp3_ = [x for x in tmp3['cleaned_label'].unique() if len(str(x).split()) > 0]
tmp3_ += [x for x in tmp3['dataset_title'].unique()]
tmp3 = [clean_text(x) for x in np.unique(tmp3_)]

In [9]:
tmp8 = pd.read_csv('../input/ci-ext-datasets-found-in-train-v2/train_ext_data.csv')
tmp8['ext_cleaned_label'] = tmp8['ext_cleaned_label'].apply(lambda x: x.split('|'))
all_labels = []
for labels in tmp8['ext_cleaned_label'].values:
    for l in labels:
        all_labels.append(l)
tmp8 = list(np.unique(all_labels))
tmp8 = pd.DataFrame(tmp8, columns=['title'])
tmp8.head()

Unnamed: 0,title
0,2010 nielsen homescan survey
1,2019 ncov complete genome sequences
2,2019 ncov genome sequence
3,2019 ncov genome sequences
4,about the workshop


In [10]:
tmp8_ = []
for l in tmp8['title'].values:
    if l not in tmp3:
        tmp8_.append(l)
        
print(len(tmp8_))
tmp8_ = pd.DataFrame(tmp8_, columns=['title'])

267


In [11]:
not_datasets = ['about', 'climatologists', 'control', 'exploration', 'defense', 
                'american community', 'american landscape', 'current population survey',
                'gulf of maine', 'argonne national laboratory s greet', 
                'annual wholesale trade',
                'bird conservation areas', 'bird incidental take', 'new housing', 'business patterns',
                'create', 'federal aid to states', 'freedom of information act', 'fruit and vegetable prices',
                'guidance navigation and control', 'high school and beyond', 'human resource management', 
                'housing unit estimates', 'international data base', 'labor market analysts', 'major land uses',
                'mars exploration program', 'new residential construction', 'oxygen delivery system',
                'pilot boarding areas', 'profiles in science', 'state fact sheets', 'summary of business',
                'tsunamis general', 'virtual grower', # 0.620
                
                'advanced monthly', 
                'advanced telecommunications', 
                'agricultural productivity',
                'annual survey', 
                'breeding bird', 
                'bridged race population estimates', 
                'building permits survey',
                'census of governments', 
                'clinical laboratory', 'coastal energy facilities', 
                'commodity costs and returns',
                'comprehensive environmental', 'county typology codes', 'delta cost project', 
                'endangered species act',
                'energy policy act', 
                'fertilizer', 'geostationary', 'landfire', 'occupational projections', 
                'marine mammal protection act', 
                'meat price', 'medication therapy', 'mexican american', 
                'milk cost',
                'animal health', 'weather', 'national environmental policy', 'national outbreak', 'natural amenities scale',
                'office', 'services file', 'stores', 'right whale', 'shuttle radar', 'solar dynamics',
                'business owners', 'expedition', 'usa'
               ]
for l in not_datasets:
    tmp8_ = tmp8_[~tmp8_['title'].str.contains(l)]
    
tmp8_.loc[tmp8_['title'].str.contains('national assessment of educational progress'), 'title'] = 'national assessment of educational progress'
tmp8_.loc[tmp8_['title'].str.contains('national postsecondary student aid study'), 'title'] = 'national postsecondary student aid study'
tmp8_.loc[tmp8_['title'].str.contains('nursing home compare'), 'title'] = 'nursing home compare'
tmp8_.loc[tmp8_['title'].str.contains('private school universe survey'), 'title'] = 'private school universe survey'
tmp8_.loc[tmp8_['title'].str.contains('program for international student assessment'), 'title'] = 'program for international student assessment'
tmp8_.loc[tmp8_['title'].str.contains('progress in international reading literacy study'), 'title'] = 'progress in international reading literacy study'
tmp8_.loc[tmp8_['title'].str.contains('schools and staffing survey'), 'title'] = 'schools and staffing survey'

tmp8_ = list(tmp8_['title'].unique())
print(len(tmp8_))

162


In [12]:
all_datasets = np.unique(tmp3 + tmp8_)
all_datasets = np.unique([clean_text(x) for x in all_datasets])
print(len(all_datasets))
all_datasets[:5]

295


array(['2010 nielsen homescan survey',
       '2019 ncov complete genome sequences', '2019 ncov genome sequence',
       '2019 ncov genome sequences',
       'accredited postsecondary institutions and programs 2013'],
      dtype='<U128')

In [13]:
def clean_training_text(txt):
    """
    similar to the default clean_text function but without lowercasing.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

def totally_clean_text(txt):
    txt = clean_text(txt)
    txt = re.sub(' +', ' ', txt)
    return txt

def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special charecters, emojis and multiple spaces
    text - Sentence that needs to be cleaned
    '''
    text = re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
    text = re.sub(' +', ' ', text)
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text
    
def read_json_pub(filename, train_data_path=paper_train_folder, output='text'):
    json_path = os.path.join(train_data_path, (filename+'.json'))
    headings = []
    contents = []
    combined = []
    with open(json_path, 'r') as f:
        json_decode = json.load(f)
        for data in json_decode:
            headings.append(data.get('section_title'))
            contents.append(data.get('text'))
            combined.append(data.get('section_title'))
            combined.append(data.get('text'))
    
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)
    
    if output == 'text':
        return all_contents
    elif output == 'head':
        return all_headings
    else:
        return all_data

# Literal prediction

In [14]:
literal_preds = []
to_append = []
for index, row in tqdm(sample_submission.iterrows()):
    to_append = [row['Id'],'']
    large_string = str(read_json_pub(row['Id'], test_files_path))
    clean_string = text_cleaning(large_string)
    for row2 in all_datasets:
        query_string = str(row2)
        if query_string in clean_string:
            if to_append[1] != '' and clean_text(query_string) not in to_append[1]:
                to_append[1] = to_append[1] + '|' + clean_text(query_string)
            if to_append[1] == '':
                to_append[1] = clean_text(query_string)
    literal_preds.append(*to_append[1:])
literal_preds[:5]

4it [00:00, 34.03it/s]


['adni|alzheimer s disease neuroimaging initiative adni|cardiovascular health study chs',
 'common core of data|integrated postsecondary education data system|nces common core of data|oecd s online education database|program for international student assessment|progress in international reading literacy study|schools and staffing survey|trends in international mathematics and science study',
 'noaa storm surge inundation|north carolina emergency management spatial data download|sea lake and overland surges from hurricanes|sea lake and overland surges from hurricanes slosh basin models|slosh model',
 '2010 nielsen homescan survey|food access research atlas|rural urban continuum codes']

# Masked Dataset Modelling

In [15]:
# multiple model
PRETRAINED_PATH = [
    '../input/coleridge-mlm-model/output-mlm/checkpoint-60000', #bert base cased 1
    '../input/d/mfalfafa/ci-bert-base-cased-mlm/output-mlm/checkpoint-60000',
]
TOKENIZER_PATH = [
    '../input/coleridge-mlm-model/model_tokenizer',
    '../input/d/mfalfafa/ci-bert-base-cased-mlm/model_tokenizer',
]

MAX_LENGTH = 64
OVERLAP = 20

PREDICT_BATCH = 32 # a higher value requires higher GPU memory usage

DATASET_SYMBOL = '$' # this symbol represents a dataset name
NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

# Tranform data into MLM format

In [16]:
def get_mlm_model(TOKENIZER_PATH, PRETRAINED_PATH):
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)
    model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)

    mlm = pipeline(
        'fill-mask', 
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )
    return tokenizer, model, mlm

In [17]:
# Auxiliary functions
def jaccard_similarity(s1, s2):
    l1 = s1.split(" ")
    l2 = s2.split(" ")    
    intersection = len(list(set(l1).intersection(l2)))
    union = (len(l1) + len(l2)) - intersection
    return float(intersection) / union

def clean_paper_sentence(s):
    """
    This function is essentially clean_text without lowercasing.
    """
    s = re.sub('[^A-Za-z0-9]+', ' ', str(s)).strip()
    s = re.sub(' +', ' ', s)
    return s

def shorten_sentences(sentences):
    """
    Sentences that have more than MAX_LENGTH words will be split
    into multiple sentences with overlappings.
    """
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Extract masking candidates for Masked Dataset Modeling from a given $sentence.
    A candidate should be a continuous sequence of at least 2 words, 
    each of these words either has the first letter in uppercase or is one of
    the connection words ($connection_tokens). Furthermore, the connection 
    tokens are not allowed to appear at the beginning and the end of the
    sequence.
    """
    def candidate_qualified(words):
        while len(words) and words[0].lower() in connection_tokens:
            words = words[1:]
        while len(words) and words[-1].lower() in connection_tokens:
            words = words[:-1]
        
        return len(words) >= 2
    
    candidates = []
    
    phrase_start, phrase_end = -1, -1
    for id in range(1, len(sentence)):
        word = sentence[id]
        if word[0].isupper() or word in connection_tokens:
            if phrase_start == -1:
                phrase_start = phrase_end = id
            else:
                phrase_end = id
        else:
            if phrase_start != -1:
                if candidate_qualified(sentence[phrase_start:phrase_end+1]):
                    candidates.append((phrase_start, phrase_end))
                phrase_start = phrase_end = -1
    
    if phrase_start != -1:
        if candidate_qualified(sentence[phrase_start:phrase_end+1]):
            candidates.append((phrase_start, phrase_end))
    
    return candidates

In [18]:
def transform_test_data():
    # transform
    mask = mlm.tokenizer.mask_token
    all_test_data = []

    for paper_id in tqdm(sample_submission['Id']):
        # load paper
        paper = papers[paper_id]

        # extract sentences
        sentences = set([clean_paper_sentence(sentence) for section in paper 
                         for sentence in section['text'].split('.')
                        ])
        sentences = shorten_sentences(sentences) # make sentences short
        sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
        sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
        sentences = [sentence.split() for sentence in sentences] # sentence = list of words

        # mask
        test_data = []
        for sentence in sentences:
            for phrase_start, phrase_end in find_mask_candidates(sentence):
                dt_point = sentence[:phrase_start] + [mask] + sentence[phrase_end+1:]
                test_data.append((' '.join(dt_point), ' '.join(sentence[phrase_start:phrase_end+1]))) # (masked text, phrase)

        all_test_data.append(test_data)
    return all_test_data

# XLM Roberta prediction

In [19]:
# Paths and Hyperparameters
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

PREDICT_BATCH = 64000 

PRETRAINED_PATH = ['../input/coleridge-xlm-roberta-base-epoch-1-training/output']
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
TRAIN_PATH = ['../input/coleridge-xlm-roberta-base-epoch-1-training/train_ner.json']
VAL_PATH = ['../input/coleridge-xlm-roberta-base-epoch-1-training/train_ner.json']

PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

In [20]:
train = train.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

print(f'No. grouped training rows: {len(train)}')

No. grouped training rows: 0


In [21]:
test_rows = [] # test data in NER format
paper_length = [] # store the number of sentences each paper has

for paper_id in sample_submission['Id']:
    # load paper
    paper = papers[paper_id]
    
    # extract sentences
    sentences = [clean_training_text(sentence) for section in paper 
                 for sentence in section['text'].split('.')
                ]
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 5] # only accept sentences with length > 10 chars
    sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study', 'from'])]
        
    # collect all sentences in json
    for sentence in sentences:
        sentence_words = sentence.split()
        dummy_tags = ['O']*len(sentence_words)
        test_rows.append({'tokens' : sentence_words, 'tags' : dummy_tags})
    
    # track which sentence belongs to which data point
    paper_length.append(len(sentences))
    
print(f'total number of sentences: {len(test_rows)}')

total number of sentences: 591


In [22]:
def set_os_env(
    pretrained_path,
    train_path,
    val_path
):
    os.environ["MODEL_PATH"] = f"{pretrained_path}"
    os.environ["TRAIN_FILE"] = f"{train_path}"
    os.environ["VALIDATION_FILE"] = f"{val_path}"
    
    os.environ["TEST_FILE"] = f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}"
    os.environ["OUTPUT_DIR"] = f"{PREDICTION_SAVE_PATH}"

In [23]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make necessart directories and files
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [24]:
def bert_predict():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [25]:
final_bert_outputs = []
for i in range(1):
    print(f'Prediction Bert model {i}')
    set_os_env(PRETRAINED_PATH[i], TRAIN_PATH[i], VAL_PATH[i])
    
    bert_outputs = []

    for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
        # write data rows to input file
        with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
            for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
                json.dump(row, f)
                f.write('\n')

        # remove output dir
        !rm -r "$OUTPUT_DIR"

        # do predict
        bert_predict()

        # read predictions
        with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
            this_preds = f.read().split('\n')[:-1]
            bert_outputs += [pred.split() for pred in this_preds]
        break
    final_bert_outputs.append(bert_outputs)

Prediction Bert model 0
rm: cannot remove './pred': No such file or directory
2021-09-25 04:44:56.155485: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-1569380d0ed7b188/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50...
100%|███████████████████████████████████████████| 3/3 [00:00<00:00, 9876.70it/s]
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 256.00it/s]
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-1569380d0ed7b188/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50. Subsequent calls will reuse this data.
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 86.71it/s]
[INFO|configuration_utils.py:470] 2021-09-25 04:45:59,737 >> loading configuration file ../input/coleridge-xlm-roberta-base

In [26]:
# get test sentences
test_sentences = [row['tokens'] for row in test_rows]

del test_rows

In [27]:
final_dataset_labels = []

for i in range(1):
    
    bert_dataset_labels = [] # store all dataset labels for each publication

    for length in paper_length:
        labels = set()
        for sentence, pred in zip(test_sentences[:length], final_bert_outputs[i][:length]):
            curr_phrase = ''
            for word, tag in zip(sentence, pred):
                if tag == 'B': # start a new phrase
                    if curr_phrase:
                        labels.add(curr_phrase)
                        curr_phrase = ''
                    curr_phrase = word
                elif tag == 'I' and curr_phrase: # continue the phrase
                    curr_phrase += ' ' + word
                else: # end last phrase (if any)
                    if curr_phrase:
                        labels.add(curr_phrase)
                        curr_phrase = ''
            # check if the label is the suffix of the sentence
            if curr_phrase:
                labels.add(curr_phrase)
                curr_phrase = ''

        # record dataset labels for this publication
        bert_dataset_labels.append(labels)

        del test_sentences[:length], final_bert_outputs[i][:length]
    final_dataset_labels.append(bert_dataset_labels)
    
final_dataset_labels[0][:5]

[{'Alzheimer s Disease Neuroimaging Initiative ADNI',
  'Cardiovascular Health Study CHS'},
 {'Integrated Postsecondary Education Data System',
  'NCES Common Core of Data',
  'Progress in International Reading Literacy Study',
  'Progress in International reading Literacy Study',
  'Schools and Staffing Survey',
  'Trends in International Mathematics',
  'Trends in International Mathematics and Science Study',
  'trends in International Mathematics and Science Study'},
 {'SLOSH model', 'Sea Lake and Overland Surges from Hurricanes'},
 set()]

# Filter based on Jaccard

In [28]:
final_xlm_roberta_labels = []
for bert_dataset_labels in final_dataset_labels:
    filtered_bert_labels = []
    for labels in bert_dataset_labels:
        filtered = []

        for label in sorted(labels, key=len):
            label = clean_text(label)
            if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < 0.4 for got_label in filtered):
                filtered.append(label)

        filtered_bert_labels.append('|'.join(filtered))
    final_xlm_roberta_labels.append(filtered_bert_labels)
del filtered_bert_labels

print(final_xlm_roberta_labels[0][:5])

['cardiovascular health study chs|alzheimer s disease neuroimaging initiative adni', 'nces common core of data|schools and staffing survey|trends in international mathematics|integrated postsecondary education data system|progress in international reading literacy study', 'slosh model|sea lake and overland surges from hurricanes', '']


# Spicy prediction

In [29]:
def clean_text(text: str) -> str:               return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
def clean_texts(texts: List[str]) -> List[str]: return [ clean_text(text) for text in texts ] 

def read_json(index: str, test_train) -> Dict:
    filename = f"../input/coleridgeinitiative-show-us-the-data/{test_train}/{index}.json"
    with open(filename) as f:
        json = simplejson.load(f)
    return json
        
def json2text(index: str, test_train) -> str:
    json  = read_json(index, test_train)
    texts = [
        row["section_title"] + " " + row["text"] 
        for row in json
    ]
    text  = " ".join(texts)
    return text

def filename_to_index(filename):
    return re.sub("^.*/|\.[^.]+$", '', filename)

def glob_to_indices(globpath):
    return list(map(filename_to_index, glob.glob(globpath)))

# Inspired by: https://www.kaggle.com/hamditarek/merge-multiple-json-files-to-a-dataframe
def dataset_df(test_train="test"):
    indices = glob_to_indices(f"../input/coleridgeinitiative-show-us-the-data/{test_train}/*.json")    
    texts   = Parallel(-1)( 
        delayed(json2text)(index, test_train)
        for index in indices  
    )
    df = pd.DataFrame([
        { "id": index, "text": text}
        for index, text in zip(indices, texts)
    ])
    df.to_csv(f"{test_train}.json.csv", index=False)
    return df

In [30]:
papers = {}
for paper_id in sample_submission['Id'].values:
    with open(f'../input/coleridgeinitiative-show-us-the-data/test/{paper_id}.json', 'r') as f:
        sections = json.load(f)
        paper = ''
        for section in sections:
            paper = paper + section['text'] + ' .'
    papers[paper_id] = paper
    del paper

In [31]:
# load spacy classifier model
with open('../input/coleridge-spacy-classifier/spacy_model.pickle', 'rb') as f:
    nlp = pickle.load(f)

In [32]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import sent_tokenize

#### remove >.5 jaccard matches from predicitons
def jaccard_similarity(s1, s2):
    l1 = s1.split()
    l2 = s2.split()    
    intersection = len(list(set(l1).intersection(l2)))
    union = (len(l1) + len(l2)) - intersection
    return float(intersection) / union

start_time = time.time()
column_names = ["Id", "PredictionString"]
submission = pd.DataFrame(columns = column_names)

no_delete = ['study', 'dataset', 'model','survey','data','adni','codes', 'genome', 'program','assessment','database','census','initiative','gauge','system','stewardship','surge']

spacy_predictions = []
for index, row in sample_submission.iterrows():
    to_append=[row['Id'],'']
    passage = papers[row['Id']]
    passage=passage.replace("'s","s")
    passage=passage.replace("-"," ")
    passage=passage.replace(","," ")
    
    ######## ACRONYMS
    for match in re.finditer(r"(\(([A-Z]{2,})\))", passage):
    #for match in re.finditer(r"(\((.*?)\))", data):
        caps=[]
        start_index = match.start()
        abbr = match.group(1)
        size = len(abbr)
        words = passage[:start_index].split()[-size:]
        for word in words:
            if word[0].isupper():
                caps.append(word)
        definition = " ".join(caps)
        if sum(1 for c in definition if c.isupper()) < 15:
            words = [word for word in no_delete if word in definition.lower()]
            doc=nlp(definition)
            score=doc.cats['POSITIVE']
            if len(words)>0 and  score > .99:
                if to_append[1]!='' and definition not in to_append[1]:
                    to_append[1] = to_append[1]+'|'+definition+'|'+abbr
                    to_append[1] = to_append[1]+'|'+abbr
                if to_append[1]=='':
                    to_append[1] = definition
                    to_append[1] = to_append[1]+'|'+abbr
                            
    #### cap word sequence
    if to_append[1]=='':        
        mylist=re.findall('([A-Z][\w-]*(?:\s+[A-Z][\w-]*)+)', remove_stopwords(passage))
        mylist = list(dict.fromkeys(mylist))
        for match in mylist:
            upper_score=sum(1 for c in match if c.isupper())
            if upper_score < 15:
                words = [word for word in no_delete if word in match.lower()]
                doc=nlp(match)
                score=doc.cats['POSITIVE']
                if len(words)>0 and len(match.split())>=2 and score > .99:
                    if to_append[1]!='' and match not in to_append[1]:
                        to_append[1] = to_append[1]+'|'+match
                    if to_append[1]=='':
                        to_append[1] = match
            
    ###### remove similar jaccard
    got_label=to_append[1].split('|')
    filtered=[]
    filtered_labels = ''
    for label in sorted(got_label, key=len):
        label = clean_text(label)
        if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < .5 for got_label in filtered):
            filtered.append(label)
            if filtered_labels!='':
                filtered_labels=filtered_labels+'|'+label
            if filtered_labels=='':
                filtered_labels=label
    
    to_append[1] = filtered_labels  
    
    spacy_predictions.append(to_append[1])
    
print("--- %s seconds ---" % (time.time() - start_time))

spacy_predictions[:5]

--- 0.5597970485687256 seconds ---


['adni|alzheimers disease neuroimaging initiative',
 'ines|ipeds|pirls|oecd indicators national education systems|nces integrated postsecondary education data system|achievement progress international reading literacy study',
 'apes|usgs|slosh|us geological survey|albemarle pamlico estuarine system|sea lake overland surges hurricanes',
 'ces|consumer expenditure survey']

# Custom transformer model

In [33]:
model_path = '../input/ci-transformers-model-v2/model/sent_transformer'
tokenizer_path = '../input/ci-transformers-model-v2/tokenizer.pickle'

In [34]:
""" build transformer model"""

maxlen = 500
num_classes = 2
vocab_size = 32824

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model_t = keras.Model(inputs=inputs, outputs=outputs)
model_t.load_weights(model_path)
model_t.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                patience=0, verbose=1, mode='auto', baseline=None, restore_best_weights=True)

2021-09-25 04:46:41.554988: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-09-25 04:46:41.556930: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-09-25 04:46:41.582859: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-25 04:46:41.583662: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2021-09-25 04:46:41.583749: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-09-25 04:46:41.583858: I tensorflow/stream_executor/platform/def

In [35]:
from fuzzywuzzy import fuzz

# prepare list of dataset titles to match
train_df = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

ds_titles = all_datasets

filtered = []
labels = ds_titles
for label in sorted(labels, key=len):
    label = clean_text(label)
    if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < 0.2 for got_label in filtered):
        filtered.append(label)
        
ds_titles = np.array(filtered)
ds_titles.shape

(124,)

In [36]:
tokenizer = ''
with open(tokenizer_path, "rb") as openfile:
    tokenizer = pickle.load(openfile)
            
test_data_path = '../input/coleridgeinitiative-show-us-the-data/test'
test_sentences = {}
candidate_threshold = 0.3
acceptance_score = 80

def read_json_pub(Id):
    filename = os.path.join(test_data_path, Id+'.json')
    with open(filename) as f:
        json_pub = json.load(f)
    return json_pub

transformers_preds = []
for index, row in tqdm(sample_submission.iterrows(), total = sample_submission.shape[0]):
    # Load text
    raw_text = read_json_pub(row['Id'])
    text = '\n'.join([z for y in raw_text for z in y.values()])

    # split and clean sentences
    sentences = nltk.sent_tokenize(re.sub(r'\.?\n', '. ', text))
    sentences = [re.sub(r"[^a-z ]+","", s.lower()) for s in sentences]
    
    # tokenize
    tokens = tokenizer.texts_to_sequences(sentences)
    tokens = tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=maxlen, padding='pre',)

    # Predict candidates sentences that may contain DS references
    y_pred = model_t.predict(tokens, batch_size=32)
    sent_candidates = np.array(sentences)[y_pred[:,1] > candidate_threshold]
    test_sentences[row['Id']] = sent_candidates

    ds_candidates = set()
    for sent in sent_candidates:
        scores = [fuzz.partial_ratio(sent.lower(), title) for title in ds_titles]
        best_fit_title_index = np.argmax(scores)
        if max(scores) > acceptance_score:
            ds_candidates.add(ds_titles[np.argmax(scores)])
    prediction_string = ' | '.join(ds_candidates)
    transformers_preds.append(prediction_string)
    
transformers_preds[:5]

  0%|          | 0/4 [00:00<?, ?it/s]2021-09-25 04:46:46.190521: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-09-25 04:46:46.200782: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2000175000 Hz
2021-09-25 04:46:46.448722: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-09-25 04:46:47.189284: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


['alzheimers disease neuroimaging initiative',
 'oecd s online education database | common core of data | program for international student assessment | progress in international reading literacy study',
 'noaa storm surge inundation | slosh model | sea lake and overland surges from hurricanes',
 'rural urban continuum codes']

# Aggregate all predictions

In [37]:
final_predictions = []
for bert_pred, trans_pred, spacy_pred, literal_match in zip(
    final_xlm_roberta_labels[0], 
    transformers_preds, 
    spacy_predictions,
    literal_preds
):        
    pred1 = [x for x in bert_pred.split('|') if x not in ['']]
    pred2 = [x for x in trans_pred.split('|') if x not in ['']]
    pred3 = [x for x in spacy_pred.split('|') if x not in ['']]

    labels = np.unique(pred1+pred2+pred3)
    if len(labels)>0:
        filtered = []
        for label in tqdm(sorted(labels, key=len)):
            label = clean_text(label)
            if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < 0.4 for got_label in filtered):
                filtered.append(label)

        final_predictions.append('|'.join(filtered))
    else:
        final_predictions.append(literal_match)        

final_predictions[:5]

100%|██████████| 4/4 [00:00<00:00, 11848.32it/s]
100%|██████████| 15/15 [00:00<00:00, 10354.60it/s]
100%|██████████| 11/11 [00:00<00:00, 23467.62it/s]
100%|██████████| 3/3 [00:00<00:00, 23172.95it/s]


['adni|cardiovascular health study chs|alzheimers disease neuroimaging initiative',
 'ines|ipeds|pirls|common core of data|schools and staffing survey|oecd s online education database|trends in international mathematics|oecd indicators national education systems|program for international student assessment|integrated postsecondary education data system|progress in international reading literacy study',
 'apes|usgs|slosh|us geological survey|noaa storm surge inundation|albemarle pamlico estuarine system|sea lake overland surges hurricanes',
 'ces|consumer expenditure survey|rural urban continuum codes']

In [38]:
sample_submission['PredictionString'] = final_predictions
sample_submission[['Id', 'PredictionString']].to_csv('submission.csv', index=False)

sample_submission.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,adni|cardiovascular health study chs|alzheimer...
1,2f392438-e215-4169-bebf-21ac4ff253e1,ines|ipeds|pirls|common core of data|schools a...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,apes|usgs|slosh|us geological survey|noaa stor...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,ces|consumer expenditure survey|rural urban co...
