In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import re

# Training BERT

# Training model_1

In [4]:
df1 = pd.read_csv("data/product_df.csv")
df1.columns = [0,1,2]
df1.head(5)

Unnamed: 0,0,1,2
0,amari ireland largest independent metal stockh...,"product, motor vehicles, buses and trucks, mot...",0
1,davies chemists established john glynne davies...,"product, pharmaceutical products medicines dru...",1
2,browser doesn support javascript.,"product, ship spare parts, boats, yachts",0
3,agilent acquisition asset young scientific ins...,"product, water heaters, water purifiers, water...",1
4,load click https www. com featured_page antenn...,"product, electrical and electronic appliances,...",0


In [5]:
fold = int(len(df1)*0.95)
train_1 = df1
val_1 = df1[fold:]

### Making STS-like dataset

In [6]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
import logging
from datetime import datetime


import csv
import gzip
import os


##### Custom reader
class DataReader:
    """
    Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
    """
    def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t",
                 quoting=csv.QUOTE_NONE, normalize_scores=False, min_score=0, max_score=1):
        self.dataset_folder = dataset_folder
        self.score_col_idx = score_col_idx
        self.s1_col_idx = s1_col_idx
        self.s2_col_idx = s2_col_idx
        self.delimiter = delimiter
        self.quoting = quoting
        self.normalize_scores = normalize_scores
        self.min_score = min_score
        self.max_score = max_score

    def get_examples(self, df, modelname = 'model_1', max_examples=0):
        """
        filename specified which data split to use (train.csv, dev.csv, test.csv).
        """
        examples = []
        for r in df.iterrows():
            id = r[0]
            row = r[1]
            score = float(row[self.score_col_idx])

            s1 = row[self.s1_col_idx]
            s2 = row[self.s2_col_idx]
            examples.append(InputExample(guid=modelname+str(id), texts=[s1, s2], label=score))

        return examples

In [7]:
# Read the dataset
model_name = 'roberta-base'
# 'bert-base-nli-stsb-mean-tokens'

batch_size = 16
reader = DataReader('')
model_save_path = ' '+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


model1 = SentenceTransformer('bert-base-nli-mean-tokens')

In [8]:
# training data
train_data = SentencesDataset(reader.get_examples(train_1, modelname = 'model_1'), model=model1)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model1)

print('done')

# val data
dev_data = SentencesDataset(reader.get_examples(val_1, modelname = 'model_1'), model=model1)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

done


## Fitting the model

In [9]:
# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1) #10% of train data for warm-up

# Train the model
model1.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/461 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/461 [00:00<03:48,  2.01it/s][A
Iteration:   0%|          | 2/461 [00:00<03:40,  2.08it/s][A
Iteration:   1%|          | 3/461 [00:01<03:24,  2.24it/s][A
Iteration:   1%|          | 4/461 [00:01<03:14,  2.36it/s][A
Iteration:   1%|          | 5/461 [00:02<03:02,  2.50it/s][A
Iteration:   1%|▏         | 6/461 [00:02<02:53,  2.63it/s][A
Iteration:   2%|▏         | 7/461 [00:02<03:08,  2.41it/s][A
Iteration:   2%|▏         | 8/461 [00:03<02:59,  2.53it/s][A
Iteration:   2%|▏         | 9/461 [00:03<03:04,  2.46it/s][A
Iteration:   2%|▏         | 10/461 [00:04<03:12,  2.35it/s][A
Iteration:   2%|▏         | 11/461 [00:04<03:00,  2.49it/s][A
Iteration:   3%|▎         | 12/461 [00:04<03:09,  2.37it/s][A
Iteration:   3%|▎         | 13/461 [00:05<03:15,  2.29it/s][A
Iteration:   3%|▎         | 14/461 [00:05<03:13,  2.31it/s][A
Iteration:   3%|▎         | 

Iteration:  56%|█████▌    | 257/461 [01:54<01:27,  2.33it/s][A
Iteration:  56%|█████▌    | 258/461 [01:54<01:23,  2.44it/s][A
Iteration:  56%|█████▌    | 259/461 [01:55<01:22,  2.44it/s][A
Iteration:  56%|█████▋    | 260/461 [01:55<01:29,  2.25it/s][A
Iteration:  57%|█████▋    | 261/461 [01:56<01:26,  2.32it/s][A
Iteration:  57%|█████▋    | 262/461 [01:56<01:26,  2.30it/s][A
Iteration:  57%|█████▋    | 263/461 [01:57<01:26,  2.30it/s][A
Iteration:  57%|█████▋    | 264/461 [01:57<01:21,  2.41it/s][A
Iteration:  57%|█████▋    | 265/461 [01:57<01:18,  2.50it/s][A
Iteration:  58%|█████▊    | 266/461 [01:58<01:16,  2.56it/s][A
Iteration:  58%|█████▊    | 267/461 [01:58<01:20,  2.40it/s][A
Iteration:  58%|█████▊    | 268/461 [01:59<01:17,  2.48it/s][A
Iteration:  58%|█████▊    | 269/461 [01:59<01:14,  2.58it/s][A
Iteration:  59%|█████▊    | 270/461 [01:59<01:12,  2.63it/s][A
Iteration:  59%|█████▉    | 271/461 [02:00<01:18,  2.41it/s][A
Iteration:  59%|█████▉    | 272/461 [02:

# Model_2 fitting

In [None]:
df2 = pd.read_csv("data/function_df.csv")

fold = int(len(df2)*0.95)
train_2 = df2
val_2 = df2[fold:]

# Read the dataset
model_name = '2222bert-base-nli-mean-tokens'
batch_size = 16
reader = DataReader('')
model_save_path = ' '+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


model2 = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# training data
train_data = SentencesDataset(reader.get_examples(train_2, modelname = 'model_2'), model=model2)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model2)

# val data
dev_data = SentencesDataset(reader.get_examples(val_2, modelname = 'model_2'), model=model2)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

In [None]:
# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1) #10% of train data for warm-up

# Train the model
model2.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1905 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/1905 [00:00<14:48,  2.14it/s][A
Iteration:   0%|          | 2/1905 [00:00<14:13,  2.23it/s][A
Iteration:   0%|          | 3/1905 [00:01<13:48,  2.30it/s][A
Iteration:   0%|          | 4/1905 [00:01<13:34,  2.33it/s][A
Iteration:   0%|          | 5/1905 [00:02<13:26,  2.36it/s][A
Iteration:   0%|          | 6/1905 [00:02<13:16,  2.38it/s][A
Iteration:   0%|          | 7/1905 [00:02<13:03,  2.42it/s][A
Iteration:   0%|          | 8/1905 [00:03<13:26,  2.35it/s][A
Iteration:   0%|          | 9/1905 [00:03<13:17,  2.38it/s][A
Iteration:   1%|          | 10/1905 [00:04<13:11,  2.39it/s][A
Iteration:   1%|          | 11/1905 [00:04<13:15,  2.38it/s][A
Iteration:   1%|          | 12/1905 [00:05<13:26,  2.35it/s][A
Iteration:   1%|          | 13/1905 [00:05<13:23,  2.36it/s][A
Iteration:   1%|          | 14/1905 [00:05<13:29,  2.34it/s][A
Iteration:   

Iteration:  28%|██▊       | 527/1905 [05:04<10:57,  2.10it/s][A
Iteration:  28%|██▊       | 528/1905 [05:04<10:32,  2.18it/s][A
Iteration:  28%|██▊       | 529/1905 [05:04<10:28,  2.19it/s][A
Iteration:  28%|██▊       | 530/1905 [05:05<10:30,  2.18it/s][A
Iteration:  28%|██▊       | 531/1905 [05:05<10:04,  2.27it/s][A
Iteration:  28%|██▊       | 532/1905 [05:06<10:10,  2.25it/s][A
Iteration:  28%|██▊       | 533/1905 [05:06<09:57,  2.30it/s][A
Iteration:  28%|██▊       | 534/1905 [05:07<10:31,  2.17it/s][A
Iteration:  28%|██▊       | 535/1905 [05:07<10:33,  2.16it/s][A
Iteration:  28%|██▊       | 536/1905 [05:08<10:17,  2.22it/s][A
Iteration:  28%|██▊       | 537/1905 [05:08<10:02,  2.27it/s][A
Iteration:  28%|██▊       | 538/1905 [05:08<09:51,  2.31it/s][A
Iteration:  28%|██▊       | 539/1905 [05:09<09:32,  2.39it/s][A
Iteration:  28%|██▊       | 540/1905 [05:09<09:35,  2.37it/s][A
Iteration:  28%|██▊       | 541/1905 [05:10<09:42,  2.34it/s][A
Iteration:  28%|██▊      

Iteration:  41%|████      | 777/1905 [07:25<08:52,  2.12it/s][A
Iteration:  41%|████      | 778/1905 [07:25<08:54,  2.11it/s][A
Iteration:  41%|████      | 779/1905 [07:25<08:23,  2.24it/s][A
Iteration:  41%|████      | 780/1905 [07:26<08:42,  2.15it/s][A
Iteration:  41%|████      | 781/1905 [07:26<08:28,  2.21it/s][A
Iteration:  41%|████      | 782/1905 [07:27<08:18,  2.25it/s][A
Iteration:  41%|████      | 783/1905 [07:27<08:07,  2.30it/s][A
Iteration:  41%|████      | 784/1905 [07:28<07:54,  2.36it/s][A
Iteration:  41%|████      | 785/1905 [07:28<08:14,  2.27it/s][A
Iteration:  41%|████▏     | 786/1905 [07:29<08:25,  2.22it/s][A
Iteration:  41%|████▏     | 787/1905 [07:29<08:33,  2.18it/s][A
Iteration:  41%|████▏     | 788/1905 [07:29<08:21,  2.23it/s][A
Iteration:  41%|████▏     | 789/1905 [07:30<08:12,  2.26it/s][A
Iteration:  41%|████▏     | 790/1905 [07:30<07:58,  2.33it/s][A
Iteration:  42%|████▏     | 791/1905 [07:31<07:53,  2.35it/s][A
Iteration:  42%|████▏    

Iteration:  54%|█████▍    | 1027/1905 [10:04<06:12,  2.36it/s][A
Iteration:  54%|█████▍    | 1028/1905 [10:04<06:00,  2.43it/s][A
Iteration:  54%|█████▍    | 1029/1905 [10:05<06:04,  2.40it/s][A
Iteration:  54%|█████▍    | 1030/1905 [10:05<06:10,  2.36it/s][A
Iteration:  54%|█████▍    | 1031/1905 [10:06<06:08,  2.37it/s][A
Iteration:  54%|█████▍    | 1032/1905 [10:06<06:20,  2.30it/s][A
Iteration:  54%|█████▍    | 1033/1905 [10:07<06:31,  2.23it/s][A
Iteration:  54%|█████▍    | 1034/1905 [10:07<06:32,  2.22it/s][A
Iteration:  54%|█████▍    | 1035/1905 [10:08<06:23,  2.27it/s][A
Iteration:  54%|█████▍    | 1036/1905 [10:08<06:20,  2.28it/s][A
Iteration:  54%|█████▍    | 1037/1905 [10:08<06:17,  2.30it/s][A
Iteration:  54%|█████▍    | 1038/1905 [10:09<06:08,  2.35it/s][A
Iteration:  55%|█████▍    | 1039/1905 [10:09<05:54,  2.44it/s][A
Iteration:  55%|█████▍    | 1040/1905 [10:10<05:47,  2.49it/s][A
Iteration:  55%|█████▍    | 1041/1905 [10:10<05:59,  2.41it/s][A
Iteration:

In [None]:
%%time

with open('data/train_only_text.pickle', 'rb') as f:
    train = pickle.load(f)
    
with open('data/test_only_text.pickle', 'rb') as f:
    test = pickle.load(f)
    

train["af_embed"] = model2.encode(list(train["accepted_function"].values))
train["rf_embed"] = model2.encode(list(train["rejected_function"].values))
train["ap_embed"] = model1.encode(list(train["accepted_product"].values))
train["rp_embed"] = model1.encode(list(train["rejected_product"].values))

# train["text_embed"] = model.encode(list(train["text"].values))
train["text_embed"] = model1.encode(list(train["text"].values))
# train["text_embed"] = model.encode(list(train["text"].values))
train["text_embed_2"] = model2.encode(list(train["text"].values))


test["af_embed"] = model2.encode(list(test["accepted_function"].values))
test["rf_embed"] = model2.encode(list(test["rejected_function"].values))
test["ap_embed"] = model1.encode(list(test["accepted_product"].values))
test["rp_embed"] = model1.encode(list(test["rejected_product"].values))

# train["text_embed"] = model.encode(list(train["text"].values))
test["text_embed"] = model1.encode(list(test["text"].values))
# train["text_embed"] = model.encode(list(train["text"].values))
test["text_embed_2"] = model2.encode(list(test["text"].values))

In [None]:
import pickle

ss = train["ap_embed"][0].shape[0]

tmp1 = pd.DataFrame().from_records(train["af_embed"])
tmp1.columns = [f'af_{str(i)}' for i in range(ss)]

tmp2 = pd.DataFrame().from_records(train["rf_embed"])
tmp2.columns = [f'rf_{str(i)}' for i in range(ss)]

tmp3 = pd.DataFrame().from_records(train["ap_embed"])
tmp3.columns = [f'ap_{str(i)}' for i in range(ss)]

tmp4 = pd.DataFrame().from_records(train["rp_embed"])
tmp4.columns = [f'rp_{str(i)}' for i in range(ss)]

tmp5 = pd.DataFrame().from_records(train["text_embed"])
tmp5.columns = [f'tt_{str(i)}' for i in range(ss)]

tmp6 = pd.DataFrame().from_records(train["text_embed_2"])
tmp6.columns = [f'tt2_{str(i)}' for i in range(ss)]


tmp = pd.concat([tmp1, tmp2, tmp3, tmp4, tmp5, tmp6], axis = 1)

tmp['id'] = train['id']
columns1 = [f'ap_{str(i)}' for i in range(ss)]+[f'rp_{str(i)}' for i in range(ss)]+[f'tt_{str(i)}' for i in range(ss)]
columns2 = [f'af_{str(i)}' for i in range(ss)]+[f'rf_{str(i)}' for i in range(ss)]+[f'tt2_{str(i)}' for i in range(ss)]

dict_to_write = {'train_x':tmp,
                 'train_y':train.target,
                 'columns_1':columns1,
                 'columns_2':columns2
}

with open('train_bert_7.pickle', 'wb') as handle:
    pickle.dump(dict_to_write, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    
    
ss = test["ap_embed"][0].shape[0]

tmp1 = pd.DataFrame().from_records(test["af_embed"])
tmp1.columns = [f'af_{str(i)}' for i in range(ss)]

tmp2 = pd.DataFrame().from_records(test["rf_embed"])
tmp2.columns = [f'rf_{str(i)}' for i in range(ss)]

tmp3 = pd.DataFrame().from_records(test["ap_embed"])
tmp3.columns = [f'ap_{str(i)}' for i in range(ss)]

tmp4 = pd.DataFrame().from_records(test["rp_embed"])
tmp4.columns = [f'rp_{str(i)}' for i in range(ss)]

tmp5 = pd.DataFrame().from_records(test["text_embed"])
tmp5.columns = [f'tt_{str(i)}' for i in range(ss)]

tmp6 = pd.DataFrame().from_records(test["text_embed_2"])
tmp6.columns = [f'tt2_{str(i)}' for i in range(ss)]


tmp = pd.concat([tmp1, tmp2, tmp3, tmp4, tmp5, tmp6], axis = 1)

tmp['id'] = test['id']

columns1 = [f'ap_{str(i)}' for i in range(ss)]+[f'rp_{str(i)}' for i in range(ss)]+[f'tt_{str(i)}' for i in range(ss)]
columns2 = [f'af_{str(i)}' for i in range(ss)]+[f'rf_{str(i)}' for i in range(ss)]+[f'tt2_{str(i)}' for i in range(ss)]    
    

dict_to_write_test = {'test_x':tmp,
                 'columns_1':columns1,
                 'columns_2':columns2
}

    
with open('test_bert_7.pickle', 'wb') as handle:
    pickle.dump(dict_to_write_test, handle, protocol=pickle.HIGHEST_PROTOCOL)