In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm
import nltk
import random
from nltk.tokenize import word_tokenize,sent_tokenize
import pickle

# Example ids are the file names found in each split directory, cut at the first '.'.
train_example_names = [entry.partition('.')[0] for entry in os.listdir('data/train')]
test_example_names = [entry.partition('.')[0] for entry in os.listdir('data/test')]

# Training metadata; later cells read its dataset_label, dataset_title and
# cleaned_label columns.
metadata = pd.read_csv('data/train.csv')
# Independent list copy of the training ids.
docIdx = list(train_example_names)

# Tokens that is_name_ok() ignores when counting the distinctive words of a title.
connection_tokens = {
    's', 'of', 'and', 'in', 'on', 'for', 'from', 'the',
    'act', 'coast', 'future', 'system', 'per',
}

In [2]:
def text_cleaning(text):
    """Normalize text to lowercase ASCII letters separated by single spaces.

    Every run of characters other than A-Z / a-z (digits, punctuation,
    whitespace, non-ASCII letters) is replaced by one space; the result is
    stripped and lower-cased.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so a
            non-string input is cleaned via its string form (NaN -> 'nan').

    Returns:
        The cleaned string, which may be empty.
    """
    text = re.sub(r'[^A-Za-z]+', ' ', str(text)).strip()  # remove unnecessary literals

    # Collapse whitespace runs. The pattern is a raw string because "\s" in a
    # plain literal is an invalid escape sequence and warns on recent Pythons.
    text = re.sub(r"\s+", " ", text)

    return text.lower().strip()

def is_name_ok(text):
    """Decide whether a cleaned title is distinctive enough to keep.

    Returns False when the text has fewer than 4 alphanumeric characters, or
    fewer than 3 space-separated tokens that are longer than 3 characters and
    are not listed in ``connection_tokens``. Returns True otherwise.
    """
    alnum_count = sum(1 for ch in text if ch.isalnum())
    if alnum_count < 4:
        return False

    distinctive = 0
    for token in text.split(' '):
        # Short tokens are discarded before the connection-token lookup.
        if len(token) > 3 and token not in connection_tokens:
            distinctive += 1

    return distinctive >= 3

# Additional label strings, one per line (presumably hand-selected model
# predictions, judging by the file name -- confirm).
with open('data/all_preds_selected.csv', 'r') as f:
    selected_pred_labels = [line.strip() for line in f]

# Pool every known spelling of a dataset name, normalized with text_cleaning().
label_sources = [
    metadata['dataset_label'],
    metadata['dataset_title'],
    metadata['cleaned_label'],
    selected_pred_labels,
]
existing_labels = [text_cleaning(x) for source in label_sources for x in source]

# Labels with this many space-separated tokens or more are dropped.
MAX_LABEL_TOKENS = 15

# De-duplicate and sort by length in DESCENDING order (longest first). Ties are
# broken alphabetically so the order no longer depends on set iteration order,
# which changes between interpreter runs.
existing_labels = sorted(set(existing_labels), key=lambda label: (-len(label), label))
existing_labels = [l for l in existing_labels if len(l.split(' ')) < MAX_LABEL_TOKENS]

print(len(existing_labels))

387


## Load, Clean

In [9]:
import pickle

# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by this project.
with open('data/selected_sentences/pos.pkl', 'rb') as pos_file:
    pos_sentences_raw = pickle.load(pos_file)

with open('data/selected_sentences/neg.pkl', 'rb') as neg_file:
    neg_sentences_raw = pickle.load(neg_file)

# Cleaned copies, index-aligned with the corresponding *_raw lists.
pos_sentences = list(map(text_cleaning, pos_sentences_raw))
neg_sentences = list(map(text_cleaning, neg_sentences_raw))

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

pos size: 175743
neg size: 795925


## Remove labels from positive sentences to prevent overfitting (currently disabled: labels are kept)

In [10]:
# Set to True to strip known dataset labels out of the positive sentences.
# It is off here, so sentences keep their labels (the model file saved further
# down is named "..._labels_included", which suggests this is intentional).
REMOVE_LABELS = False

pos_sentences_with_label = pos_sentences
pos_sentences = []

for s in tqdm(pos_sentences_with_label):
    if REMOVE_LABELS:
        # existing_labels is sorted longest first, so a long label is removed
        # before any shorter label contained in it.
        for label in existing_labels:
            if label and label in s:
                s = s.replace(label, '')

    pos_sentences.append(s)

# Collapse whitespace runs (removing a label can leave a double space). The
# pattern is a raw string because "\s" in a plain literal is an invalid escape.
pos_sentences = [re.sub(r"\s+", " ", s) for s in pos_sentences]

100%|█████████████████████████████████████████████████████████████████████| 175743/175743 [00:00<00:00, 3357793.27it/s]


## Take first n tokens from each sentence

In [11]:
# NOTE: despite the "_raw" suffix, both lists hold *cleaned*, untruncated
# sentences from here on; they replace the unprocessed lists read from the
# pickle files.
pos_sentences_raw = pos_sentences_with_label[:]
neg_sentences_raw = neg_sentences[:]

# Maximum number of space-separated tokens kept per sentence.
SENTENCE_TOKEN_SIZE = 25

def shorten_sentence(text):
    """Return `text` cut down to its first SENTENCE_TOKEN_SIZE space-separated tokens."""
    kept = text.split(' ')[:SENTENCE_TOKEN_SIZE]
    return ' '.join(kept)

# Truncate every sentence; list lengths and ordering are unchanged.
pos_sentences = list(map(shorten_sentence, pos_sentences))
neg_sentences = list(map(shorten_sentence, neg_sentences))

In [12]:
# Sanity check: truncation shortens sentences but must not change how many there are.
print(len(pos_sentences))
print(len(neg_sentences))

175743
795925


In [13]:
# Fixed seed so the negative subsample and the final shuffle are reproducible.
SEED = 42
# Number of negative sentences kept for training.
MAX_NEG_SENTENCES = 200000

rng = np.random.default_rng(SEED)

# Shuffle the negatives (applying the same permutation to the parallel "_raw"
# list) and keep only a subsample of them.
# NOTE: this cell is not idempotent -- re-running it reshuffles the already
# truncated neg_sentences.
idx = rng.permutation(len(neg_sentences))
neg_sentences = [neg_sentences[i] for i in idx]
neg_sentences_raw = [neg_sentences_raw[i] for i in idx]
neg_sentences = neg_sentences[:MAX_NEG_SENTENCES]
#pos_sentences = pos_sentences[:20000]

# Positives first, then negatives; labels are 1.0 for positives, 0.0 for negatives.
sentences = pos_sentences + neg_sentences
labels = np.zeros(len(sentences))
labels[:len(pos_sentences)] = 1

assert len(sentences) == len(labels)

# Shuffle sentences and labels together with one shared permutation.
idx = rng.permutation(len(sentences))
sentences = [sentences[i] for i in idx]
labels = [labels[i] for i in idx]

In [14]:
# Peek at one truncated positive sentence.
pos_sentences[0]

'in fact organizations are now identifying digital skills or computer literacy as one of their core values for employability such as the us department of'

In [15]:
# Peek at one truncated negative sentence (the list was shuffled in the previous cell).
neg_sentences[0]

'a couple s marital status has also been associated with ipv such that unmarried cohabitating couples report more ipv than married couples brownridge and hill'

## Training

In [18]:
import os
import math
import random
import csv
import sys
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
import statistics as stats
from bert_sklearn import BertClassifier

# Hyper-parameters for the SciBERT sentence classifier.
# NOTE(review): max_seq_length equals the 25-word truncation applied to the
# sentences; if the limit counts word pieces and special tokens, the model sees
# fewer than 25 words per sentence -- confirm against bert_sklearn.
# NOTE(review): the training log reports as many steps as training sentences
# (375743), i.e. one sentence per step; presumably train_batch_size is divided
# by gradient_accumulation_steps (4 // 3 = 1) -- confirm.
bert_params = dict(
    bert_model='scibert-scivocab-uncased',
    validation_fraction=0.0,  # no held-out split; all data is used for training
    max_seq_length=25,
    train_batch_size=4,
    warmup_proportion=0.1,
    gradient_accumulation_steps=3,
    epochs=1,
)

model = BertClassifier(**bert_params)

Building sklearn text classifier...


In [19]:
# Fine-tune on the shuffled sentences; labels are 1.0 (positive) / 0.0 (negative).
model.fit(pd.Series(sentences), pd.Series(labels))

Loading scibert-scivocab-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 375743, validation data size: 0


Training  :   3%|█▍                                              | 11397/375743 [17:48<10:08:41,  9.98it/s, loss=0.487]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x000002AA3C5A84C0>
Traceback (most recent call last):
  File "C:\Users\ozano\.conda\envs\torch\lib\site-packages\torch\utils\data\dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "C:\Users\ozano\.conda\envs\torch\lib\site-packages\torch\utils\data\dataloader.py", line 1297, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "C:\Users\ozano\.conda\envs\torch\lib\multiprocessing\process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "C:\Users\ozano\.conda\envs\torch\lib\multiprocessing\popen_spawn_win32.py", line 108, in wait
    res = _winapi.WaitForSingleObject(int(self._handle), msecs)
KeyboardInterrupt: 
Training  :   3%|█▍                                               | 11398/375743 [17:48<9:29:29, 10.66it/s, loss=0.487]

KeyboardInterrupt: 

In [None]:
# save model to disk
savefile='data/sklearn_bert_classification_labels_included.bin'
model.save(savefile)

In [None]:
# Smoke test: class probabilities for the first three training sentences.
model.predict_proba(pd.Series(sentences[:3]))

## Load model

In [None]:
from bert_sklearn import load_model
# Restore the classifier from the path used by the save cell in the Training section.
bert_model = load_model(r'data/sklearn_bert_classification_labels_included.bin')

In [33]:
import pickle

# Reload the unprocessed sentences: the Training section overwrote the *_raw
# names with cleaned (and, for negatives, shuffled) lists.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by this project.
with open('data/selected_sentences/pos.pkl', 'rb') as pos_file:
    pos_sentences_raw = pickle.load(pos_file)

with open('data/selected_sentences/neg.pkl', 'rb') as neg_file:
    neg_sentences_raw = pickle.load(neg_file)

# Cleaned copies, index-aligned with the corresponding *_raw lists.
pos_sentences = list(map(text_cleaning, pos_sentences_raw))
neg_sentences = list(map(text_cleaning, neg_sentences_raw))

# Maximum number of space-separated tokens kept per sentence (same value as in
# the Training section).
SENTENCE_TOKEN_SIZE = 25

def shorten_sentence(text):
    """Return `text` cut down to its first SENTENCE_TOKEN_SIZE space-separated tokens."""
    kept = text.split(' ')[:SENTENCE_TOKEN_SIZE]
    return ' '.join(kept)

# Truncate every sentence; list lengths and ordering are unchanged, so the
# lists stay index-aligned with the *_raw lists.
pos_sentences = list(map(shorten_sentence, pos_sentences))
neg_sentences = list(map(shorten_sentence, neg_sentences))

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

pos size: 175743
neg size: 795925


In [34]:
# The reloaded raw negatives should match the cleaned list in length.
len(neg_sentences_raw)

795925

In [35]:
# Number of sentences in each pool that will be scored below.
print(len(pos_sentences))
print(len(neg_sentences))

175743
795925


In [36]:
# A sentence is kept when the probability in column 1 exceeds this value
# (column 1 is presumably the positive class, label 1.0 -- confirm).
POSITIVE_THRESHOLD = 0.5

# Score both pools, then turn the probabilities into boolean keep-masks.
pos_pred = bert_model.predict_proba(pd.Series(pos_sentences))
neg_pred = bert_model.predict_proba(pd.Series(neg_sentences))
pos_pred = pos_pred[:, 1] > POSITIVE_THRESHOLD
neg_pred = neg_pred[:, 1] > POSITIVE_THRESHOLD

# The masks index into the raw lists, so both must be the same length.
assert len(pos_pred) == len(pos_sentences_raw)
assert len(neg_pred) == len(neg_sentences_raw)

# Collect the raw (uncleaned) text of every sentence predicted positive.
# np.flatnonzero always returns a 1-D index array; argwhere(...).squeeze()
# collapses to a 0-d array when exactly one element is True, and iterating
# that raises TypeError.
pred_sentences = []
pred_sentences += [pos_sentences_raw[i] for i in np.flatnonzero(pos_pred)]
pred_sentences += [neg_sentences_raw[i] for i in np.flatnonzero(neg_pred)]

Predicting: 100%|████████████████████████████████████████████████████████████████| 21968/21968 [10:58<00:00, 33.34it/s]
Predicting: 100%|██████████████████████████████████████████████████████████████| 99491/99491 [1:00:15<00:00, 27.52it/s]


In [37]:
# Total sentences predicted positive, from both pools combined.
len(pred_sentences)

228983

In [38]:
# Sentences from the positive pool that were predicted positive.
pos_pred.sum()

96880

In [39]:
# Sentences from the negative pool that were predicted positive.
neg_pred.sum()

132103

In [40]:
# Fraction of each pool predicted positive.
# NOTE(review): if the loaded model was trained by the Training section above,
# every positive sentence was part of its training data, so these rates are not
# a held-out evaluation.
print(pos_pred.mean())
print(neg_pred.mean())

0.5512595096248499
0.16597418098438924


In [42]:
import os
import pickle

# Despite the file name, this holds the sentences predicted positive from BOTH
# the positive and the negative pool.
output_path = 'data/classifier_output/pos_classified.pkl'

# Create the output directory if needed so the write does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(pred_sentences, f)

print(f'prediction size: {len(pred_sentences)}')

prediction size: 228983
