In [1]:
import os

# Set before the heavy native libraries are imported further down.
# NOTE(review): presumably works around a duplicate OpenMP runtime error — confirm it is still needed.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

In [2]:
# =========================================================================================
# Libraries
# =========================================================================================
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from torch.utils.data import DataLoader

# Custom libraries
from utils.unsupervised_utils import read_data, get_neighbors, build_training_set
from utils.utils import read_config
from utils.metrics import get_pos_score, get_f2_score

# Environment variables set through IPython's %env magic (keep comments on their own
# lines: anything after the '=' on a %env line becomes part of the value).
# NOTE(review): presumably these enable parallel tokenization and silence advisory
# warnings from the transformers library — confirm.
%env TOKENIZERS_PARALLELISM=true
%env TRANSFORMERS_NO_ADVISORY_WARNINGS=true

env: TOKENIZERS_PARALLELISM=true


In [3]:
# Input directories: raw data and files generated by earlier steps (going by the names).
DATA_PATH, GENERATED_DATA_PATH = "../raw_data/", "./generated_files/"

# Project configuration; the model base name and save location are read from it below.
config = read_config()

In [4]:
# Pair-level train / test tables from the generated-files directory.
train_csv_path = f"{GENERATED_DATA_PATH}unsupervised_train.csv"
test_csv_path = f"{GENERATED_DATA_PATH}unsupervised_test.csv"
train_df, test_df = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

# Correlations table from the raw-data directory (used later when scoring thresholds).
correlation_df = pd.read_csv(f"{DATA_PATH}correlations.csv")

### Training

In [5]:
def _to_input_examples(pairs_df, desc):
    """Convert a frame of text pairs into a list of ``InputExample`` objects.

    Args:
        pairs_df: DataFrame providing the ``model_input1``, ``model_input2`` and
            ``target`` columns.
        desc: Label shown next to the progress bar.

    Returns:
        One ``InputExample`` per row, in row order, with the two inputs as
        ``texts`` and ``target`` cast to ``int`` as ``label``.
    """
    # itertuples() is a generator without a length, so tqdm needs an explicit total;
    # without it the bar only shows a bare "0it" counter.
    rows = tqdm(pairs_df.itertuples(index=False), total=len(pairs_df), desc=desc)
    return [InputExample(texts=[row.model_input1, row.model_input2],
                         label=int(row.target))
            for row in rows]


train_samples = _to_input_examples(train_df, desc="train samples")
test_samples = _to_input_examples(test_df, desc="test samples")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
# Cross-encoder with a single output, started from the configured base checkpoint.
base_checkpoint = config["supervised_model"]["base_name"]
model = CrossEncoder(base_checkpoint, num_labels=1)

# Settings of the fine-tuning run.
num_epochs = 3
train_batch_size = 64
warmup_fraction = 0.1  # share of all training steps passed to fit() as warm-up steps

# Shuffled mini-batches over the training samples.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=0,
)

# Binary-classification evaluator built from the local test samples.
evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    test_samples,
    name='K12-local-test',
    show_progress_bar=True,
)

# len(train_dataloader) is the number of batches in one epoch.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * warmup_fraction)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at trained_models/unsupervised/all-MiniLM-L6-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Fine-tune the cross-encoder. evaluation_steps and save_best_model are deliberately
# not passed, so the library defaults apply.
fit_options = dict(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=config["supervised_model"]["save_name"],
    use_amp=True,  # NOTE(review): presumably automatic mixed precision — confirm in the library docs
)
model.fit(**fit_options)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43177 [00:00<?, ?it/s]

Batches:   0%|          | 0/9768 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43177 [00:00<?, ?it/s]

Batches:   0%|          | 0/9768 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43177 [00:00<?, ?it/s]

Batches:   0%|          | 0/9768 [00:00<?, ?it/s]

In [8]:
# Echo the model object so the notebook displays its repr.
model

<sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder at 0x7fdb4ecdcbe0>

In [9]:
# Persist the in-memory model to the configured save location.
# NOTE(review): this is the same path that was handed to fit() as output_path. If fit()
# already wrote a best-scoring checkpoint there, this call replaces it with the weights
# currently held in memory — confirm that is intended.
model.save(config["supervised_model"]["save_name"])

### Load Model & Tune Threshold

In [13]:
# Re-create the cross-encoder from the configured save location.
saved_model_dir = config["supervised_model"]["save_name"]
model = CrossEncoder(saved_model_dir)

In [15]:
# Score every (model_input1, model_input2) pair of the test frame.
test_pairs = test_df[["model_input1", "model_input2"]].values
preds = model.predict(test_pairs, batch_size=96, show_progress_bar=True)

Batches:   0%|          | 0/1221 [00:00<?, ?it/s]

In [16]:
# Attach the predicted scores as a column; this mutates test_df in place.
# preds was computed from test_df's own rows, so the values line up row by row.
test_df["pred_score"] = preds

In [18]:
# Sweep decision thresholds on pred_score and report the F2 score for each one.

# Loop-invariant inputs, computed once instead of on every iteration:
#  - ground-truth rows restricted to the topics that occur in the test frame
#  - candidate pairs ordered by descending score; the boolean filter inside the loop
#    keeps this order, so only the relative order of exactly tied scores can differ
#    from sorting after the filter.
test_correlation_df = correlation_df[correlation_df.topic_id.isin(test_df.topics_ids)]
ranked_pairs_df = test_df.sort_values(by="pred_score",
                                      ascending=False)[["topics_ids",
                                                        "content_ids",
                                                        "pred_score"]]

threshold_scores = []  # (threshold, F2 score) for every threshold tried
for thr in np.arange(0.0, 0.1, 0.005):
    # Keep pairs at or above the threshold and join the surviving content ids per topic.
    kept_pairs_df = ranked_pairs_df[ranked_pairs_df.pred_score >= thr]
    preds_thr_df = (kept_pairs_df
                    .groupby("topics_ids")["content_ids"]
                    .apply(" ".join)
                    .rename("pred_content_ids")
                    .reset_index())

    # Outer join so a topic with no surviving prediction still gets a row.
    preds_thr_df = preds_thr_df.merge(test_correlation_df,
                                      how="outer", right_on="topic_id", left_on="topics_ids")
    # Missing cells (e.g. topics without predictions) become the literal string "None".
    preds_thr_df = preds_thr_df.fillna("None")
    f2score_for_threshold = get_f2_score(preds_thr_df['content_ids'],
                                         preds_thr_df['pred_content_ids'])
    threshold_scores.append((thr, f2score_for_threshold))

    print(f"Threshold: {thr} | Score: {f2score_for_threshold}")

# Summarise the sweep so the chosen threshold does not have to be read off the log.
best_thr, best_score = max(threshold_scores, key=lambda pair: pair[1])
print(f"Best threshold: {best_thr} | Score: {best_score}")

Threshold: 0.0 | Score: 0.1797
Threshold: 0.005 | Score: 0.3557
Threshold: 0.01 | Score: 0.3953
Threshold: 0.015 | Score: 0.4135
Threshold: 0.02 | Score: 0.4238
Threshold: 0.025 | Score: 0.4305
Threshold: 0.03 | Score: 0.4343
Threshold: 0.035 | Score: 0.4369
Threshold: 0.04 | Score: 0.4398
Threshold: 0.045 | Score: 0.4426
Threshold: 0.05 | Score: 0.4433
Threshold: 0.055 | Score: 0.4443
Threshold: 0.06 | Score: 0.4437
Threshold: 0.065 | Score: 0.4435
Threshold: 0.07 | Score: 0.4442
Threshold: 0.075 | Score: 0.4445
Threshold: 0.08 | Score: 0.4449
Threshold: 0.085 | Score: 0.4442
Threshold: 0.09 | Score: 0.4443
Threshold: 0.095 | Score: 0.4432
