# AI Final Project: Data for Public Good Identification
## CS4100
## Mirah Gordon and Sarah Moyer

## Table of Contents
* [Data Import and Cleaning](#DIC)
* [BERT Model](#BM)
* [Text Matching](#TM)
* [Model Implementation](#MI)
* [Predictions](#P)

In [1]:
# imports
import os
import pandas as pd
import json
import re
import numpy as np
import string
from functools import partial
from tqdm.notebook import tqdm
from collections import defaultdict
from collections import Counter

In [None]:
# ElectraTokenizerFast is the class used later to build `tokenizer`;
# the previous spelling "ElectraTokenizerFastb" does not exist and made the whole cell fail.
from transformers import TFElectraForPreTraining, ElectraTokenizerFast

import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Bidirectional, SpatialDropout1D
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

from tensorflow_addons.text.crf import crf_log_likelihood
from tensorflow_addons.layers.crf import CRF

<a id='DIC'></a>
### Data Import and Cleaning

In [2]:
# load the training annotations (one row per publication/dataset-label pair)
# from train.csv in the working directory and preview the first rows
df = pd.read_csv('train.csv')

df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [3]:
# give the text columns human-readable display names
df = df.rename(
    columns={
        'pub_title': 'Pub Title',
        'dataset_title': 'Dataset Title',
        'dataset_label': 'Dataset Label',
        'cleaned_label': 'Cleaned Label',
    }
)

In [4]:
# display the renamed frame (every row except the last one)
df.iloc[:-1]

Unnamed: 0,Id,Pub Title,Dataset Title,Dataset Label,Cleaned Label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
...,...,...,...,...,...
19655,922a5f2c-2d1c-46a7-a07a-acaf2222c0c6,3D U-Net for segmentation of COVID-19 associat...,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID-19 Open Radiology Dat...,rsna international covid 19 open radiology dat...
19656,b3498176-8832-4033-aea6-b5ea85ea04c4,RSNA International Trends: A Global Perspectiv...,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID Open Radiology Database,rsna international covid open radiology database
19657,f77eb51f-c3ac-420b-9586-cb187849c321,MCCS: a novel recognition pattern-based method...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...
19658,ab59bcdd-7b7c-4107-93f5-0ccaf749236c,Quantitative Structure–Activity Relationship M...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...


In [5]:
# count how many distinct dataset titles appear in the annotations
unique_title = len(df['Dataset Title'].unique())
print(f"Number of unique dataset titles: {unique_title}")

Number of unique dataset titles: 45


In [6]:
# count how many distinct dataset labels (surface forms) appear in the annotations
unique_label = len(df['Dataset Label'].unique())
print(f"Number of unique dataset labels: {unique_label}")

Number of unique dataset labels: 130


In [7]:
# tally the rows per dataset title; most_common() already returns a list of
# (title, count) pairs sorted from most to least frequent
c = Counter(df['Dataset Title'].tolist())
title_list = c.most_common()
title_list

[("Alzheimer's Disease Neuroimaging Initiative (ADNI)", 6144),
 ('Baltimore Longitudinal Study of Aging (BLSA)', 1589),
 ('Trends in International Mathematics and Science Study', 1163),
 ('Early Childhood Longitudinal Study', 1011),
 ('SARS-CoV-2 genome sequence', 860),
 ('Census of Agriculture', 743),
 ('Education Longitudinal Study', 676),
 ('Agricultural Resource Management Survey', 660),
 ('North American Breeding Bird Survey (BBS)', 585),
 ('National Education Longitudinal Study', 550),
 ('Survey of Earned Doctorates', 509),
 ('Rural-Urban Continuum Codes', 490),
 ('Beginning Postsecondary Student', 461),
 ('NOAA Tide Gauge', 441),
 ('International Best Track Archive for Climate Stewardship', 386),
 ('Common Core of Data', 368),
 ('Coastal Change Analysis Program', 326),
 ('World Ocean Database', 326),
 ('Sea, Lake, and Overland Surges from Hurricanes', 312),
 ('Survey of Doctorate Recipients', 309),
 ('Baccalaureate and Beyond', 306),
 ('Optimum Interpolation Sea Surface Temperat

In [8]:
# keep only the titles with more than 300 occurrences in the data
# (strictly greater: a title with exactly 300 rows is excluded);
# title_list is already sorted, so the dict keeps the descending order
top_titles = {title: count for title, count in title_list if count > 300}
print(top_titles)

{"Alzheimer's Disease Neuroimaging Initiative (ADNI)": 6144, 'Baltimore Longitudinal Study of Aging (BLSA)': 1589, 'Trends in International Mathematics and Science Study': 1163, 'Early Childhood Longitudinal Study': 1011, 'SARS-CoV-2 genome sequence': 860, 'Census of Agriculture': 743, 'Education Longitudinal Study': 676, 'Agricultural Resource Management Survey': 660, 'North American Breeding Bird Survey (BBS)': 585, 'National Education Longitudinal Study': 550, 'Survey of Earned Doctorates': 509, 'Rural-Urban Continuum Codes': 490, 'Beginning Postsecondary Student': 461, 'NOAA Tide Gauge': 441, 'International Best Track Archive for Climate Stewardship': 386, 'Common Core of Data': 368, 'Coastal Change Analysis Program': 326, 'World Ocean Database': 326, 'Sea, Lake, and Overland Surges from Hurricanes': 312, 'Survey of Doctorate Recipients': 309, 'Baccalaureate and Beyond': 306}


In [9]:
# restrict the annotations to rows whose dataset title is one of the top titles
top_titles_name = list(top_titles)
top_df = df.loc[df['Dataset Title'].isin(top_titles_name)]

top_df.head()

Unnamed: 0,Id,Pub Title,Dataset Title,Dataset Label,Cleaned Label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [10]:
# collect the distinct dataset labels (np.unique returns them sorted) that
# remain after filtering to the top titles, and report how many there are
unique_labels = np.unique(top_df['Dataset Label'])
print("Number of unique dataset labels within the top titles: ", len(unique_labels))

Number of unique dataset labels within the top titles:  72


In [11]:
# sanity check: number of dataset titles kept by the frequency filter
print(len(top_titles))

21


In [12]:
# root folder of the competition data (relative to the notebook's working directory)
BASE_DIR = 'coleridgeinitiative-show-us-the-data'

# folder that get_article reads "<Id>.json" publication files from
test_dir = os.path.join(BASE_DIR, 'test')

# the sample submission supplies the Ids of the publications to predict on
sample_submission_path = os.path.join(BASE_DIR, 'sample_submission.csv')
sample_df = pd.read_csv(sample_submission_path)

In [13]:
# Patterns used to strip links from article text. Raw strings keep the regex
# escapes (\w, \?, \., ...) from being parsed as (invalid) Python string escapes.
# Both patterns require the final character to come from a narrower class, so a
# trailing '.', ',', ';', ')' or ']' after a link is left in the text.
url_regex = re.compile(r"https?://[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+[\w!\?/\+\-_~=\*&@#\$%']")
www_regex = re.compile(r"www\.[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+[\w!\?/\+\-_~=\*&@#\$%']")
def get_article(filename, dir_path=test_dir):
    """Load one publication and return its text with links removed.

    Args:
        filename: publication Id; the file read is ``<dir_path>/<filename>.json``.
        dir_path: folder containing the publication JSON files. Defaults to
            the value of ``test_dir`` at definition time.

    Returns:
        A single string: for every section the longer of its ``text`` and
        ``section_title`` (``text`` wins ties), joined by spaces, with
        ``http(s)://`` and ``www.`` links stripped.
    """
    # honour the dir_path argument (it used to be ignored in favour of test_dir)
    json_path = os.path.join(dir_path, filename + '.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    contents = []
    for section in sections:
        section_title = section['section_title']
        section_text = section['text']
        # keep whichever field carries more content for this section
        contents.append(
            section_text if len(section_text) >= len(section_title) else section_title
        )
    all_contents = ' '.join(contents)

    return www_regex.sub('', url_regex.sub('', all_contents))

def clean_text(txt):
    """Lower-case ``txt`` and collapse every run of non-alphanumerics to one space.

    The input is converted with ``str()`` first; leading and trailing spaces
    are removed from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the two strings' word sets.

    Both strings are lower-cased and split on whitespace; the result is
    ``|A & B| / |A | B|`` as a float in ``[0.0, 1.0]``.

    When neither string contains a word the union is empty; ``0.0`` is
    returned in that case instead of raising ``ZeroDivisionError``.
    """
    a = set(s1.lower().split())
    b = set(s2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # nothing to compare: treat as "not similar"
        return 0.0
    return len(a & b) / union_size

In [14]:
# enable Series.progress_apply, then load the link-free full text of every
# publication listed in the sample submission (get_article reads <Id>.json)
tqdm.pandas()
sample_df['text'] = sample_df['Id'].progress_apply(get_article)

  0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# Matches a parenthesised span. '.+' is greedy, so the match runs from the
# first '(' to the last ')' in the string. Raw string avoids invalid-escape warnings.
bracket_regex = re.compile(r"\(.+\)")


def _normalize_known_label(raw_label):
    """Return the lower-cased label with any parenthesised part removed.

    Labels that consist of a single whitespace-separated word get a trailing
    space appended (e.g. ``'adni '``); multi-word labels are returned stripped.
    """
    normalized = bracket_regex.sub('', raw_label).lower().strip()
    if len(raw_label.split()) > 1:
        return normalized
    # presumably so that short one-word labels only match when followed by a
    # space in the article text -- confirm against the matching cell
    return normalized + ' '


# known surface forms from the labels and from the titles of the top datasets
temp_1 = {_normalize_known_label(x) for x in top_df['Dataset Label'].unique()}
temp_2 = {_normalize_known_label(x) for x in top_df['Dataset Title'].unique()}
existing_labels = temp_1 | temp_2
existing_labels

{'2019-ncov complete genome sequences',
 '2019-ncov genome sequence',
 '2019-ncov genome sequences',
 'adni ',
 'agricultural resource management survey',
 'agricultural resources management survey',
 "alzheimer's disease neuroimaging initiative",
 'alzheimers disease neuroimaging initiative',
 'arms farm financial and crop production practices',
 'baccalaureate and beyond',
 'baccalaureate and beyond longitudinal study',
 'baltimore longitudinal study of aging',
 'beginning postsecondary student',
 'beginning postsecondary students',
 'beginning postsecondary students longitudinal study',
 'census of agriculture',
 'coastal change analysis program',
 'coastal change analysis program land cover',
 'common core of data',
 'covid-19 genome sequence',
 'covid-19 genome sequences',
 'early childhood longitudinal study',
 'education longitudinal study',
 'genome sequence of 2019-ncov',
 'genome sequence of covid-19',
 'genome sequence of sars-cov-2',
 'genome sequences of 2019-ncov',
 'geno

<a id='BM'></a>
### BERT Model (ELECTRA discriminator + BiLSTM-CRF)

In [None]:
# number of tokens per sentence: used for tokenizer truncation, for padding
# the encoded sentences, and as the model's input length
MAX_LENGTH = 128
# batch size passed to model.predict
BATCH_SIZE = 128
# directory the pretrained ELECTRA tokenizer and encoder are loaded from
ENCODER_DIR = '/kaggle/input/huggingfaceelectra/electra-base-discriminator'

In [None]:
# tokenizer matching the pretrained encoder in ENCODER_DIR
tokenizer = ElectraTokenizerFast.from_pretrained(ENCODER_DIR)
# tag vocabulary of the sequence labeller: the tokenizer's special tokens plus
# B-DATA / I-DATA / O tags. len(label2id) is the number of classes the model is
# built with, and decode_prediction looks up the 'B-DATA' and 'I-DATA' ids.
label2id = {
    tokenizer.pad_token: 0,
    tokenizer.cls_token: 1,
    tokenizer.sep_token: 2,
    'B-DATA': 3,
    'I-DATA': 4,
    'O': 5
}

In [None]:
def unpack_data(data):
    """Normalise a training batch to an ``(x, y, sample_weight)`` triple.

    A 3-element ``data`` is returned as is; a 2-element ``data`` is returned
    as ``(data[0], data[1], None)``.

    Raises:
        TypeError: if ``data`` has any other length.
    """
    size = len(data)
    if size == 3:
        return data
    if size == 2:
        # no sample weights supplied
        return data[0], data[1], None
    raise TypeError("Data was not an expected tuple of size 2 or 3.")

In [None]:
class ModelWithCRFLoss(tf.keras.Model):
    """Wrapper around the base model for custom training logic.

    Forward passes are delegated to ``base_model``; ``train_step`` and
    ``test_step`` are overridden so that the loss is the negative CRF
    log-likelihood plus the model's internal (``self.losses``) terms.
    """

    def __init__(self, base_model):
        """Store ``base_model``, the model whose output ends in the CRF layer."""
        super().__init__()
        self.base_model = base_model

    @tf.function
    def call(self, inputs):
        """Run ``base_model`` on ``inputs`` and return its outputs unchanged."""
        return self.base_model(inputs)

    def compute_loss(self, x, y, sample_weight, training=False):
        """Return ``(mean CRF loss, sum of internal losses)`` for one batch.

        The model output is unpacked into four items, of which the last three
        (potentials, sequence lengths, transition kernel) are fed to
        ``crf_log_likelihood`` together with the gold tags ``y``.
        """
        y_pred = self(x, training=training)
        # first output element is not needed for the loss
        _, potentials, sequence_length, chain_kernel = y_pred

        # crf_log_likelihood returns a tuple; element 0 is the log-likelihood, negated here to get a loss
        crf_loss = -crf_log_likelihood(potentials, y, sequence_length, chain_kernel)[0]

        if sample_weight is not None:
            # assumes sample_weight broadcasts against the per-example loss -- TODO confirm
            crf_loss = crf_loss * sample_weight

        return tf.reduce_mean(crf_loss), sum(self.losses)

    @tf.function
    def train_step(self, data):
        """Run one optimisation step on ``data`` and return both loss components."""
        x, y, sample_weight = unpack_data(data)

        with tf.GradientTape() as tape:
            crf_loss, internal_losses = self.compute_loss(
                x, y, sample_weight, training=True
            )
            # optimise the CRF loss together with the internal loss terms
            total_loss = crf_loss + internal_losses

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return {"crf_loss": crf_loss, "internal_losses": internal_losses}

    @tf.function
    def test_step(self, data):
        """Compute both loss components on ``data`` without updating weights."""
        x, y, sample_weight = unpack_data(data)
        # training defaults to False in compute_loss
        crf_loss, internal_losses = self.compute_loss(x, y, sample_weight)
        return {"crf_loss": crf_loss, "internal_losses": internal_losses}

def build_base_model(transformer, num_cls=1, max_len=512):
    """Build the tagging network: transformer -> dropout -> BiLSTM -> Dense -> CRF.

    Args:
        transformer: callable encoder that accepts a dict with ``input_ids``
            and ``attention_mask`` and exposes ``hidden_states`` on its output.
        num_cls: number of tag classes (units of the Dense and CRF layers).
        max_len: fixed sequence length of both inputs.

    Returns:
        A Keras ``Model`` taking ``[input_ids, attention_mask]`` and returning
        the CRF layer's output.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    input_attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
    # NOTE(review): per the Hugging Face docs hidden_states[0] is the embedding
    # output, not the last encoder layer -- confirm this matches how the saved
    # checkpoints were trained before changing it.
    sequence_output = transformer({
        'input_ids': input_ids, 
        'attention_mask': input_attention_mask
    }).hidden_states[0]
    # boolean mask so the LSTM and CRF ignore padded positions
    mask = tf.cast(input_attention_mask, tf.bool)
    sequence_output = SpatialDropout1D(0.1)(sequence_output)
    sequence_output = Bidirectional(LSTM(256, return_sequences=True), name='bidirectional_lstm')(sequence_output, mask=mask)
    # per-token class scores that are passed on to the CRF layer
    sequence_output = Dense(num_cls, activation='softmax', name='sequence_output')(sequence_output)
    out = CRF(num_cls, name='crf_output')(sequence_output, mask=mask)
    model = Model(inputs=[input_ids, input_attention_mask], outputs=out)
    return model

In [None]:
def select_sentence(text):
    """Return the set of sentence candidates in ``text`` with at least 6 words.

    ``text`` is split on newlines and each line on ``'.'``; the pieces are
    de-duplicated and kept verbatim (not stripped) when they contain six or
    more whitespace-separated words.
    """
    candidates = set()
    for line in text.split('\n'):
        candidates.update(line.split('.'))
    return {piece for piece in candidates if len(piece.split()) >= 6}

In [None]:
def decode_prediction(x, y, tokenizer, label2id):
    """Turn predicted tag sequences into the set of decoded entity strings.

    Args:
        x: iterable of unpadded token-id sequences, one per sentence.
        y: iterable of predicted tag-id sequences aligned with ``x``.
        tokenizer: object whose ``decode(ids)`` maps token ids to a string.
        label2id: mapping containing the ``'B-DATA'`` and ``'I-DATA'`` tag ids.

    Returns:
        Set of strings, one per span that starts at a ``B-DATA`` tag and
        continues over the directly following ``I-DATA`` tags. The last token
        position of every sentence is never inspected.
    """
    spans = set()
    for token_ids, tags in zip(x, y):
        current = []
        limit = len(token_ids) - 1
        for pos, tag in enumerate(tags[:limit]):
            starts_span = tag == label2id['B-DATA']
            if not starts_span:
                if not current:
                    # outside any span: an I-DATA without a preceding B-DATA is ignored
                    continue
                if tag == label2id['I-DATA']:
                    current.append(token_ids[pos])
                    continue
            # reaching here: a new span begins or the open span just ended
            if current:
                spans.add(tokenizer.decode(current))
                current = []
            if starts_span:
                current.append(token_ids[pos])
        if current:
            # span ran up to the last inspected position
            spans.add(tokenizer.decode(current))
    return spans

<a id='TM'></a>
### Text Matching

In [None]:
test_ids = []

prepared_data = {}
first_stage_predictions = {}
# clean every known label once up front instead of once per matching document
cleaned_exisiting_labels = [clean_text(existing_label) for existing_label in existing_labels]
for row in sample_df.itertuples():

    sample_text = row.text
    test_id = row.Id

    # First stage: literal matching. A known label counts as found when its
    # lower-cased form occurs in the lower-cased article text. A space is
    # appended to the text so labels ending in a space can match at the very end.
    # (The former second pass over the same labels could never add anything
    # new, because every matching label had already been added.)
    sample_text_lower = f'{sample_text} '.lower()
    cleaned_labels = {
        cleaned_exisiting_label
        for existing_label, cleaned_exisiting_label in zip(existing_labels, cleaned_exisiting_labels)
        if existing_label in sample_text_lower
    }
    first_stage_predictions[test_id] = cleaned_labels

    test_ids.append(test_id)

    # preparing data for 2nd stage prediction
    encoded_sentences = tokenizer(
        list(select_sentence(sample_text)),
        return_token_type_ids=False,
        max_length=MAX_LENGTH,
        truncation=True
    )
    prepared_data[test_id] = {
        # fixed-length arrays fed to the model
        'input_ids': pad_sequences(encoded_sentences['input_ids'], maxlen=MAX_LENGTH, padding='post'),
        'attention_mask': pad_sequences(encoded_sentences['attention_mask'], maxlen=MAX_LENGTH, padding='post'),
        # unpadded ids, used when decoding predictions back to text
        'no_padded_input_ids': encoded_sentences['input_ids']
    }

<a id='MI'></a>
### Model Implementation

In [None]:
N_FOLDS = 4
# a label is kept when at least this many fold models predicted it
CNT_THRES = 2
# test_id -> {decoded label -> number of folds that predicted it}
each_fold_predictions = {}
for fold in range(N_FOLDS):
    # A fresh encoder and model are built for every fold; drop the Keras global
    # state left by the previous fold so memory does not grow across folds.
    K.clear_session()
    # NOTE(review): this is an absolute path at the filesystem root, whereas
    # ENCODER_DIR lives under /kaggle/input -- confirm the checkpoint location.
    model_path = f'/coleridge-electra-base-ner4/fold{fold}/electra_base_crf'
    transformer_layer = TFElectraForPreTraining.from_pretrained(ENCODER_DIR, output_hidden_states=True)
    base_model = build_base_model(transformer_layer, num_cls=len(label2id), max_len=MAX_LENGTH)
    model = ModelWithCRFLoss(base_model)
    model.load_weights(model_path)
    for test_id in test_ids:
        x_test = prepared_data[test_id]
        # element 0 of the model output is the decoded tag sequence per sentence
        y_pred = model.predict(
            {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
            batch_size=BATCH_SIZE)[0]
        labels = decode_prediction(
            x_test['no_padded_input_ids'],
            y_pred,
            tokenizer,
            label2id
        )

        # labels is a set, so each fold votes at most once per label
        fold_votes = each_fold_predictions.setdefault(test_id, defaultdict(int))
        for label in labels:
            fold_votes[label] += 1

# keep the labels that enough folds agree on, in cleaned form
second_stage_predictions = {
    test_id: {
        clean_text(label)
        for label, cnt in each_fold_predictions.get(test_id, {}).items()
        if cnt >= CNT_THRES
    } for test_id in test_ids
}

<a id='P'></a>
### Predictions 

In [None]:
prediction_string_list = []
for test_id in test_ids:
    # labels found by literal matching always go into the prediction
    first = first_stage_predictions[test_id]
    second = set()
    for ner_label in second_stage_predictions[test_id]:
        cleaned_ner_label = clean_text(ner_label)
        if not cleaned_ner_label:
            # a label made only of punctuation cleans to '' and would add an
            # empty entry to the '|'-separated prediction string
            continue
        # add a model-predicted label only if it is not a near-duplicate
        # (Jaccard >= 0.5) of any literally matched label
        if all(
            jaccard_similarity(cleaned_ner_label, cleaned_matching_label) < 0.5
            for cleaned_matching_label in first
        ):
            second.add(cleaned_ner_label)
    # sort so the submission file is identical from run to run
    # (set iteration order of strings varies between interpreter sessions)
    prediction_string_list.append('|'.join(sorted(first | second)))

In [None]:
# assemble the submission table (one row per test publication) and write it out
submission = pd.DataFrame(
    {
        'Id': test_ids,
        'PredictionString': prediction_string_list,
    }
)
submission.to_csv('submission.csv', index=False)