In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm
import nltk
import random
from nltk.tokenize import word_tokenize,sent_tokenize
import pickle

# Document ids are the file names up to (not including) the first dot.
train_example_names = [file_name.partition('.')[0] for file_name in os.listdir('data/train')]
test_example_names = [file_name.partition('.')[0] for file_name in os.listdir('data/test')]

# Training metadata; its dataset_label / dataset_title / cleaned_label columns
# are read further down when the label list is built.
metadata = pd.read_csv('data/train.csv')
docIdx = list(train_example_names)  # independent shallow copy of the training ids

# Tokens that is_name_ok() does not count as meaningful words of a title.
connection_tokens = {
    's', 'of', 'and', 'in', 'on', 'for', 'from', 'the',
    'act', 'coast', 'future', 'system', 'per',
}

In [2]:
def text_cleaning(text):
    """Normalize text to lowercase ASCII letters separated by single spaces.

    Every run of characters outside ``A-Za-z`` (digits, punctuation,
    whitespace, non-ASCII letters) is replaced by one space; the result is
    trimmed and lower-cased.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``
            first, so ``None`` becomes ``'none'``.

    Returns:
        str: The cleaned text; an empty string when the input contains no
        ASCII letters.
    """
    # Whitespace is itself a non-letter, so this single pass already collapses
    # every separator run to one space; a separate whitespace-collapsing pass
    # is unnecessary. The raw string avoids invalid-escape warnings.
    text = re.sub(r'[^A-Za-z]+', ' ', str(text))

    return text.strip().lower()

def is_name_ok(text):
    """Decide whether a cleaned title is specific enough to keep.

    A title passes when it has at least 4 alphanumeric characters and at
    least 3 space-separated tokens that are longer than 3 characters and are
    not listed in ``connection_tokens``.

    Args:
        text: Title string; tokens are obtained by splitting on single spaces.

    Returns:
        bool: True if the title passes both checks, False otherwise.
    """
    alnum_count = sum(1 for ch in text if ch.isalnum())
    if alnum_count < 4:
        return False

    # Short tokens are discarded before the connection-token lookup.
    meaningful = [
        word for word in text.split(' ')
        if len(word) > 3 and word not in connection_tokens
    ]
    return len(meaningful) >= 3
    
# Previously selected predicted labels, one per line.
with open('data/all_preds_selected.txt', 'r') as f:
    selected_pred_labels = [line.strip() for line in f]

# Every known dataset name from the training metadata plus the selected
# predictions, normalized with text_cleaning().
existing_labels = [text_cleaning(x) for x in metadata['dataset_label']] +\
                  [text_cleaning(x) for x in metadata['dataset_title']] +\
                  [text_cleaning(x) for x in metadata['cleaned_label']] +\
                  [text_cleaning(x) for x in selected_pred_labels]

# Cleaned catalogue titles that are dropped before the name filter.
to_remove = [
    'frequently asked questions', 'total maximum daily load tmd', 'health care facilities',
    'traumatic brain injury', 'north pacific high', 'droplet number concentration', 'great slave lake',
    'census block groups'
]

# Location of the government dataset catalogue. The default is the original
# machine-specific path; set the GOV_DATA_CSV environment variable to point
# at the file when running anywhere else.
gov_data_path = os.environ.get(
    'GOV_DATA_CSV',
    r'C:\projects\personal\kaggle\kaggle_coleridge_initiative\string_search\data\gov_data.csv',
)
df = pd.read_csv(gov_data_path)
print(len(df))


# Clean, de-duplicate and filter the catalogue titles.
df['title'] = df.title.apply(text_cleaning)
titles = list(df.title.unique())
titles = [t for t in titles if t not in to_remove]
df = pd.DataFrame({'title': titles})
df = df.loc[df.title.apply(is_name_ok)]
# The known labels are appended after the is_name_ok() filter, so they are
# kept even when they would not pass it.
df = pd.concat([df, pd.DataFrame({'title': existing_labels})], ignore_index= True).reset_index(drop = True)
titles = list(df.title.unique())
df = pd.DataFrame({'title': titles})
df['title'] = df.title.apply(text_cleaning)

# Sort labels by length in descending order (longest first): the label-removal
# loop below walks this list in order, so a long label is replaced before any
# shorter label contained in it.
existing_labels = sorted(list(df.title.values), key = len, reverse = True)
existing_labels = [label for label in existing_labels if len(label.split(' ')) < 10]
del df

# Drop the 'adni' label. A filter is used instead of list.remove() so the cell
# does not raise ValueError when the label is absent from the input data.
existing_labels = [label for label in existing_labels if label != 'adni']
print(len(existing_labels))

291984
60187


## Load, Clean

In [3]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('data/bert_ner_sentences/pos.pkl', 'rb') as pos_file:
    pos_sentences_raw = pickle.load(pos_file)

with open('data/bert_ner_sentences/neg.pkl', 'rb') as neg_file:
    neg_sentences_raw = pickle.load(neg_file)

# Cleaned copies; element i corresponds to element i of the raw list.
pos_sentences = list(map(text_cleaning, pos_sentences_raw))
neg_sentences = list(map(text_cleaning, neg_sentences_raw))

print('pos size: ' + str(len(pos_sentences)))
print('neg size: ' + str(len(neg_sentences)))

pos size: 69267
neg size: 902401


## Remove labels from positive sentences to prevent overfitting

In [4]:
# Keep a handle on the cleaned sentences that still contain their labels;
# pos_sentences is rebuilt below without them.
pos_sentences_with_label = pos_sentences
pos_sentences = []

for sentence in tqdm(pos_sentences_with_label):
    # Labels are tried in list order and each hit is deleted before the next
    # label is checked.
    # NOTE(review): matching is by plain substring, so a label can also be cut
    # out of the middle of a longer word — confirm this is intended.
    for label in existing_labels:
        if label in sentence:
            sentence = sentence.replace(label, '')

    pos_sentences.append(sentence)

# Collapse the gaps left by removed labels and trim the ends: a label removed
# at the start or end of a sentence would otherwise leave a stray space, which
# turns into an empty token when the sentence is later split on ' '.
pos_sentences = [re.sub(r"\s+", " ", s).strip() for s in pos_sentences]

100%|████████████████████████████████████████████████████████████████████████████| 69267/69267 [12:42<00:00, 90.84it/s]


## Take first n tokens from each sentence

In [5]:
# From here on the *_raw names hold shallow copies of the cleaned, untruncated
# sentences (positives still containing their labels), not the pickled text.
pos_sentences_raw = list(pos_sentences_with_label)
neg_sentences_raw = list(neg_sentences)

SENTENCE_TOKEN_SIZE = 25

def shorten_sentence(text):
    """Truncate text to its first SENTENCE_TOKEN_SIZE tokens.

    Tokens are the pieces obtained by splitting on a single space character,
    so consecutive or leading spaces produce empty tokens that count towards
    the limit.

    Args:
        text: Sentence to truncate.

    Returns:
        str: The kept tokens joined with single spaces; the input unchanged
        when it has no more than SENTENCE_TOKEN_SIZE tokens.
    """
    kept_tokens = text.split(' ')[:SENTENCE_TOKEN_SIZE]
    shortened = ' '.join(kept_tokens)
    return shortened

# Truncate every sentence in both sets; list lengths and order are unchanged.
pos_sentences = list(map(shorten_sentence, pos_sentences))
neg_sentences = list(map(shorten_sentence, neg_sentences))

In [6]:
# Sentence counts after truncation: positives first, then negatives.
print(len(pos_sentences), len(neg_sentences), sep='\n')

69267
902401


In [6]:
# Fixed seed so the negative sample and the final shuffle are reproducible.
RANDOM_SEED = 42
# Number of negative sentences drawn for training.
NEG_TRAIN_SIZE = 75000

rng = np.random.default_rng(RANDOM_SEED)

# Draw the training negatives into a new list. neg_sentences and
# neg_sentences_raw are deliberately left untouched: the prediction cell
# further down scores the complete neg_sentences list and maps hits back to
# neg_sentences_raw by position, so shuffling or truncating them here would
# silently pair predictions with the wrong raw sentences.
neg_idx = rng.permutation(len(neg_sentences))[:NEG_TRAIN_SIZE]
neg_sentences_train = [neg_sentences[i] for i in neg_idx]

sentences = pos_sentences + neg_sentences_train
labels = np.zeros(len(sentences))
labels[:len(pos_sentences)] = 1  # positives come first; negatives keep label 0

assert len(sentences) == len(labels)

# Shuffle sentences and labels with the same permutation so they stay paired.
idx = rng.permutation(len(sentences))
sentences = [sentences[i] for i in idx]
labels = [labels[i] for i in idx]

## Training

In [13]:
import os
import math
import random
import csv
import sys
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
import statistics as stats
from bert_sklearn import BertClassifier

# Settings for the SciBERT sentence classifier, collected in one place.
# max_seq_length has the same value as SENTENCE_TOKEN_SIZE used for truncation.
# NOTE(review): presumably max_seq_length counts model tokens rather than
# space-separated words — TODO confirm against bert_sklearn.
bert_params = dict(
    bert_model='scibert-scivocab-uncased',
    validation_fraction=0.1,
    max_seq_length=25,
    train_batch_size=4,
    warmup_proportion=0.1,
    gradient_accumulation_steps=3,
    epochs=1,
)
model = BertClassifier(**bert_params)

Building sklearn text classifier...


In [14]:
# Fine-tune on the shuffled sentences; labels are 1.0 for sentences from the
# positive set and 0.0 for the sampled negatives.
model.fit(pd.Series(sentences), pd.Series(labels))

Loading scibert-scivocab-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 129841, validation data size: 14426


Training  : 100%|████████████████████████████████████████████████| 129841/129841 [3:27:59<00:00, 10.40it/s, loss=0.475]
Validating: 100%|██████████████████████████████████████████████████████████████████| 1804/1804 [00:54<00:00, 32.83it/s]

Epoch 1, Train loss: 0.4754, Val loss: 0.5159, Val accy: 85.40%





BertClassifier(bert_model='scibert-scivocab-uncased', do_lower_case=True,
               epochs=1, gradient_accumulation_steps=3,
               label_list=array([0., 1.]), max_seq_length=25,
               train_batch_size=4)

In [15]:
# save model to disk
# The "Load model" cell below reads this same path back with load_model().
savefile='data/sklearn_bert_classification2.bin'
model.save(savefile)

In [16]:
# Smoke test: class probabilities for the first three training sentences
# (one row per sentence, one column per class).
model.predict_proba(pd.Series(sentences[:3]))

Predicting: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.73s/it]


array([[9.6569099e-03, 9.9034309e-01],
       [9.7400814e-01, 2.5991896e-02],
       [2.0728298e-04, 9.9979275e-01]], dtype=float32)

## Load model

In [7]:
from bert_sklearn import load_model
# Reload the classifier from the path written by model.save() in the training
# section; the prediction cells below use bert_model, not model.
bert_model = load_model(r'data/sklearn_bert_classification2.bin')

Loading model from data/sklearn_bert_classification2.bin...


06/15/2021 13:59:22 - INFO - bert_sklearn.model.pytorch_pretrained.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 31090
}



Defaulting to linear classifier/regressor
Building sklearn text classifier...


In [8]:
import pickle

# Restore the pickled sentences: an earlier cell rebound pos_sentences_raw /
# neg_sentences_raw to cleaned text. The prediction cell indexes these lists
# by position, so they must stay in the pickled order.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('data/bert_ner_sentences/pos.pkl', 'rb') as pos_file:
    pos_sentences_raw = pickle.load(pos_file)

with open('data/bert_ner_sentences/neg.pkl', 'rb') as neg_file:
    neg_sentences_raw = pickle.load(neg_file)

In [9]:
# Number of raw negative sentences just reloaded from the pickle.
len(neg_sentences_raw)

902401

In [10]:
# Sizes of the cleaned sentence lists that are scored in the next cell:
# positives first, then negatives.
print(len(pos_sentences), len(neg_sentences), sep='\n')

69267
902401


In [11]:
# Probability above which a sentence is kept.
POSITIVE_THRESHOLD = 0.5

# Predictions are mapped back to the raw sentences by position, so each
# scored list must line up one-to-one with its raw counterpart. Fail loudly
# here instead of silently collecting the wrong sentences.
assert len(pos_sentences) == len(pos_sentences_raw), \
    'pos_sentences and pos_sentences_raw are not aligned'
assert len(neg_sentences) == len(neg_sentences_raw), \
    'neg_sentences and neg_sentences_raw are not aligned'

pos_pred = bert_model.predict_proba(pd.Series(pos_sentences))
neg_pred = bert_model.predict_proba(pd.Series(neg_sentences))
# Column 1 is read as the probability of the positive class; after this step
# pos_pred / neg_pred are boolean masks over the scored sentences.
pos_pred = pos_pred[:, 1] > POSITIVE_THRESHOLD
neg_pred = neg_pred[:, 1] > POSITIVE_THRESHOLD

# np.flatnonzero always yields a 1-D index array. argwhere(...).squeeze()
# collapses to a 0-d array when exactly one sentence passes the threshold,
# and a 0-d array cannot be iterated.
pred_sentences = []
pred_sentences += [pos_sentences_raw[i] for i in np.flatnonzero(pos_pred)]
pred_sentences += [neg_sentences_raw[i] for i in np.flatnonzero(neg_pred)]

Predicting: 100%|██████████████████████████████████████████████████████████████████| 8659/8659 [03:34<00:00, 40.34it/s]
Predicting: 100%|██████████████████████████████████████████████████████████████| 112801/112801 [57:39<00:00, 32.61it/s]


In [13]:
# Total number of raw sentences kept from both sets.
len(pred_sentences)

119897

In [14]:
# Number of sentences from the positive set that passed the threshold.
pos_pred.sum()

54449

In [15]:
# Number of sentences from the negative set that passed the threshold.
neg_pred.sum()

65448

In [12]:
# Fraction of each set that passed the threshold: positive set first, then
# negative set.
print(pos_pred.mean())
print(neg_pred.mean())

0.7860741767363968
0.07252651537398562


In [17]:
import os
import pickle

# Sentences from both the positive and the negative set that the classifier
# kept are written to one file.
output_path = 'data/classifier_output/pos_classified.pkl'

# Create the output folder first so the dump cannot fail with
# FileNotFoundError after the long prediction run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(pred_sentences, f)

print(f'prediction size: {len(pred_sentences)}')

prediction size: 119897
