In [1]:
# Install the NER dependencies from files bundled in the `coleridge-packages` input dataset.
# `datasets` is resolved from a local package folder (--no-index); the others are explicit wheels.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# !pip3 uninstall fsspec -y
# NOTE(review): unlike the installs above, this one names no local wheel or --find-links,
# so it presumably resolves from the default package index -- confirm internet is enabled.
!pip install fsspec==2021.5.0

from IPython.display import clear_output

# Wipe the pip logs printed by the commands above from the cell output.
clear_output()

In [2]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the two global RNGs used further down: `random` drives random.shuffle(ner_data)
# and NumPy drives the np.random.rand draw that sub-samples negative sentences.
random.seed(123)
np.random.seed(456)

In [3]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

In [4]:
# Sentence-chunking settings read by shorten_sentences().
MAX_LENGTH = 64 # maximum number of words per sentence chunk
OVERLAP = 20 # a sentence longer than MAX_LENGTH is split into chunks; consecutive chunks share this many words

# NOTE(review): in the visible cells MAX_SAMPLE is only referenced by the commented-out
# `train[:MAX_SAMPLE]` line, so changing it currently has no effect.
MAX_SAMPLE = None # set a small number for experimentation, set None for production.

In [5]:
# Competition input locations, relative to the notebook's working directory.
# NOTE(review): train_path is only used by the commented-out pd.read_csv in the visible cells.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
# Folder holding one `<Id>.json` file per publication (read when the `papers` dict is built).
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# train = pd.read_csv(train_path)
# train = train[:MAX_SAMPLE]
# print(f'No. raw training rows: {len(train)}')

In [6]:
# train = train.groupby('Id').agg({
#     'pub_title': 'first',
#     'dataset_title': '|'.join,
#     'dataset_label': '|'.join,
#     'cleaned_label': '|'.join
# }).reset_index()

# print(f'No. grouped training rows: {len(train)}')

# Load datasets

In [7]:
def clean_text(txt):
    """
    Normalise arbitrary input to lowercase alphanumeric words.

    The value is converted with str(), lowercased, every run of characters
    outside A-Z / a-z / 0-9 is replaced by a single space, and leading and
    trailing spaces are removed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [8]:
# Load the extended label table, then turn the '|'-joined label string of each row
# into a Python list of individual labels.
tmp8 = pd.read_csv('../input/ci-ext-datasets-found-in-train-v2/train_ext_data.csv')
tmp8['ext_cleaned_label'] = tmp8['ext_cleaned_label'].map(lambda joined: joined.split('|'))

In [9]:
# Blocklist used by the filtering loop below: a candidate label is dropped when it
# CONTAINS any of these strings (substring test, not exact match).
not_datasets = ['about', 'climatologists', 'control', 'exploration', 'defense', 
                'american community', 'american landscape', 'current population survey',
                'gulf of maine', 'argonne national laboratory s greet', 
                'annual wholesale trade',
                'bird conservation areas', 'bird incidental take', 'new housing', 'business patterns',
                'create', 'federal aid to states', 'freedom of information act', 'fruit and vegetable prices',
                'guidance navigation and control', 'high school and beyond', 'human resource management', 
                'housing unit estimates', 'international data base', 'labor market analysts', 'major land uses',
                'mars exploration program', 'new residential construction', 'oxygen delivery system',
                'pilot boarding areas', 'profiles in science', 'state fact sheets', 'summary of business',
                'tsunamis general', 'virtual grower', # 0.620
               ]
# Canonical names used by the filtering loop below: a surviving label that CONTAINS one
# of these strings is replaced by that string itself.
wrong_names = [
    'national assessment of educational progress',
    'national postsecondary student aid study',
    'nursing home compare',
    'private school universe survey',
    'program for international student assessment',
    'progress in international reading literacy study',
    'schools and staffing survey'
]

# Build `ext_cleaned_label2`: per row, drop labels that contain a blocklisted string,
# collapse labels that contain a canonical name onto that name, then de-duplicate.
filtered_labels_per_row = []
for row_labels in tqdm(tmp8['ext_cleaned_label'].values):
    kept_labels = []
    for candidate in row_labels:
        # any blocklisted substring disqualifies the candidate outright
        if any(blocked in candidate for blocked in not_datasets):
            continue
        # replace the candidate by a canonical name it contains (a later match overrides an earlier one)
        for canonical in wrong_names:
            if canonical in candidate:
                candidate = canonical
        kept_labels.append(candidate)
    filtered_labels_per_row.append(kept_labels)
tmp8['ext_cleaned_label2'] = filtered_labels_per_row
# np.unique returns the sorted distinct labels of each row as a NumPy array
tmp8['ext_cleaned_label2'] = tmp8['ext_cleaned_label2'].apply(np.unique)

100%|██████████| 14316/14316 [00:00<00:00, 125530.29it/s]


In [10]:
# `train` is a second name for the same DataFrame object as `tmp8` (no copy is made),
# so a change made through either name is visible through the other.
train = tmp8
print(len(train))
# Preview the first rows, including the raw and filtered label-list columns.
tmp8.head()

14316


Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,ext_cleaned_label,ext_cleaned_label2
0,f70051bf-a763-415b-aa66-97ae57f2efc1,Analysis of groundwater response to tidal fluc...,NOAA Tide Gauge,NOAA tidal station,noaa tidal station,[noaa tidal station],[noaa tidal station]
1,0d4e13ca-47ec-4827-b814-a39e5b8fede3,Geophysical and sampling data from the inner c...,NOAA Tide Gauge,NOAA tidal station,noaa tidal station,[noaa tidal station],[noaa tidal station]
2,c5cf06e5-182f-4c33-bf15-e06a0d353efd,Geophysical and sampling data from the inner c...,NOAA Tide Gauge,NOAA tidal station,noaa tidal station,"[gulf of maine, noaa tidal station]",[noaa tidal station]
3,da25e497-208d-4ed5-9c51-37c69a5524d3,Development of the Hydrodynamic Model for Long...,NOAA Tide Gauge,NOAA tidal station,noaa tidal station,[noaa tidal station],[noaa tidal station]
4,50d6879b-1c6b-4434-965e-19a7271e8c49,MODELING MICROBIAL WATER QUALITY AT A BEACH IM...,NOAA Tide Gauge,NOAA tidal station,noaa tidal station,[noaa tidal station],[noaa tidal station]


In [11]:
def _read_paper(paper_id):
    """Parse and return the JSON document stored for one publication id."""
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as paper_file:
        return json.load(paper_file)

# Map every distinct publication id in `train` to its parsed JSON content.
papers = {paper_id: _read_paper(paper_id) for paper_id in tqdm(train['Id'].unique())}

100%|██████████| 14316/14316 [01:04<00:00, 220.25it/s]


In [12]:
def clean_training_text(txt):
    """
    Case-preserving counterpart of clean_text.

    The value is converted with str(), every run of characters outside
    A-Z / a-z / 0-9 becomes a single space, and surrounding spaces are removed.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping word chunks.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process. Sentences with at most `max_length` words are
        passed through unchanged.
    max_length : int, optional
        Maximum number of words per chunk. Defaults to the notebook-level
        MAX_LENGTH when omitted.
    overlap : int, optional
        Number of words shared by consecutive chunks. Defaults to the
        notebook-level OVERLAP when omitted.

    Returns
    -------
    list of str
        Short sentences, in the iteration order of `sentences`. A long sentence
        contributes one chunk per start offset 0, step, 2*step, ... where
        step = max_length - overlap.

    Raises
    ------
    ValueError
        If overlap >= max_length. The chunk step would then be zero or
        negative, which previously surfaced as an obscure range() error or
        silently dropped every long sentence.
    """
    # Resolve defaults at call time so later edits to the notebook constants are honoured.
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            short_sentences.extend(
                ' '.join(words[start:start + max_length])
                for start in range(0, len(words), step)
            )
        else:
            short_sentences.append(sentence)
    return short_sentences

def find_sublist(big_list, small_list):
    """
    Find every start index at which `small_list` occurs inside `big_list`.

    The elements of `big_list` are lowercased before comparison; `small_list`
    is compared as given, so callers should pass it already lowercased.

    Parameters
    ----------
    big_list : list of str
        Words to search in.
    small_list : list of str
        Consecutive words to look for.

    Returns
    -------
    list of int
        Start positions in ascending order. Empty when `small_list` is empty
        or longer than `big_list`.
    """
    window = len(small_list)
    # An empty pattern would "match" at every index, including len(big_list),
    # which callers then use as an out-of-range position.
    if window == 0:
        return []

    # Lowercase once up front instead of on every loop iteration.
    lowered = [word.lower() for word in big_list]
    return [
        start
        for start in range(len(lowered) - window + 1)
        if lowered[start:start + window] == small_list
    ]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """
    Tag each word of `sentence` with B / I / O according to the dataset labels.

    Parameters
    ----------
    sentence : str
        A cleaned sentence (words separated by whitespace).
    labels : iterable of str or None
        Cleaned, lowercase dataset labels. Empty or whitespace-only labels are
        ignored.

    Returns
    -------
    (bool, list of (str, str))
        The flag is True when at least one label occurs in the lowercased
        sentence as a whole-word match. The list pairs every word with its
        tag: 'B' on the first word of a label occurrence, 'I' on the following
        words of that occurrence, 'O' everywhere else.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    # Ignore blank labels: the pattern '\b\b' matches in any sentence and an
    # empty word list cannot be located as a span.
    usable_labels = [] if labels is None else [label for label in labels if label.split()]

    lowered_sentence = sentence.lower()
    # re.escape keeps labels literal, so characters such as '.', '(' or '+'
    # can neither break the pattern nor match unintended text.
    is_positive = any(
        re.search(f'\\b{re.escape(label)}\\b', lowered_sentence)
        for label in usable_labels
    )

    if not is_positive: # negative sample
        return False, list(zip(sentence_words, nes))

    # positive sample: mark every occurrence of every label
    for label in usable_labels:
        label_words = label.split()
        for pos in find_sublist(sentence_words, label_words):
            nes[pos] = 'B'
            for i in range(pos + 1, pos + len(label_words)):
                nes[i] = 'I'

    return True, list(zip(sentence_words, nes))

In [13]:
cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []

# A label-free sentence is a negative candidate only if it contains one of these substrings.
NEGATIVE_KEYWORDS = ['data', 'study', 'from']
# A negative candidate is kept when a uniform draw in [0, 1) exceeds this value (about 40% kept).
NEGATIVE_KEEP_THRESHOLD = 0.6

for _, paper_id, labels in train[['Id', 'ext_cleaned_label2']].itertuples():
    paper = papers[paper_id]

    # De-duplicate sentences while keeping first-seen order. A set() iterates in an
    # order that depends on per-process string hash randomisation, which would make
    # the seeded sampling and shuffle below differ from run to run.
    sentences = list(dict.fromkeys(
        clean_training_text(sentence)
        for section in paper
        for sentence in section['text'].split('.')
    ))
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            # every sentence that mentions a label is kept
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence for word in NEGATIVE_KEYWORDS):
            # sub-sample the far more numerous negative sentences
            if np.random.rand() > NEGATIVE_KEEP_THRESHOLD:
                ner_data.append(tags)
                cnt_neg += 1

# shuffling
random.shuffle(ner_data)

In [14]:
# Report how many label-bearing sentences and how many sampled label-free sentences were collected.
print(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

Training data size: 58999 positives + 332724 negatives


In [15]:
# Write the tagged sentences as JSON lines: one {"tokens": [...], "tags": [...]} object per line.
with open('train_ner.json', 'w') as out_file:
    for tagged_sentence in ner_data:
        # transpose [(word, tag), ...] into a tuple of words and a tuple of tags
        tokens, tags = zip(*tagged_sentence)
        out_file.write(json.dumps({'tokens' : tokens, 'tags' : tags}))
        out_file.write('\n')

In [16]:
# Fine-tune 'xlm-roberta-base' on train_ner.json with the kaggle_run_ner.py script:
# 1 epoch, batch size 8, a save every 15000 steps, results written to ./output.
# The validation file is the training file itself and only --do_train is passed.
# NOTE(review): presumably no held-out evaluation happens in this run -- confirm against the script.
# The log shows the model config being downloaded from huggingface.co, so the cell needs internet access.
!python ../input/kaggle-ner-utils/kaggle_run_ner.py \
--model_name_or_path 'xlm-roberta-base' \
--train_file './train_ner.json' \
--validation_file './train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 15000 \
--output_dir './output' \
--report_to 'none' \
--seed 123 \
--do_train 

2021-06-22 02:54:56.347750: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-f7b38428ba224caf/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-f7b38428ba224caf/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
[INFO|file_utils.py:1402] 2021-06-22 02:55:20,150 >> https://huggingface.co/xlm-roberta-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpvjtsu9pw
Downloading: 100%|██████████████████████████████| 512/512 [00:00<00:00, 403kB/s]
[INFO|file_utils.py:1406] 2021-