In [118]:
from torch.utils.data import DataLoader
import math
import os
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader, LabelSentenceReader, InputExample
import logging
from datetime import datetime
import csv
import gzip


In [2]:
# Load the dataset
path = '/Users/patrickrs/Documents/GitLab/revealapp/10_cleaning/src'
current_path = os.getcwd()
os.chdir(path)
%run ./Load+Clean_News.ipynb
%run ./cont_to_cat_News.ipynb
os.chdir(current_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Keep only the similarity label and the two sentence columns, then persist them.
# NOTE(review): DataFrame.to_csv writes the index and a header row by default,
# whereas the DataReader class in this notebook reads the score from column 0 and
# does not skip a header — confirm this file is not fed to it unchanged.
data = data.loc[:, ['sim', 'sentence1', 'sentence2']]
data.to_csv('train_data.csv')


In [4]:
# Mock corpus: all first sentences followed by all second sentences.
corpus = np.concatenate([data[column].values for column in ('sentence1', 'sentence2')], axis=0)
# Mock document text: every corpus sentence followed by ". ".
text1 = "".join(sentence + ". " for sentence in corpus)

In [5]:
# Let's first create dictionaries with mock documents, tags and feedback.

# Mock documents keyed by record name; document 0 carries the mock text `text1`.
doc_dict = {
    'Dict0': dict(DocID=0, FullText=text1),
    'Dict1': dict(DocID=1, FullText='bbbbbb bb b 2'),
    'Dict2': dict(DocID=2, FullText='ccccc cccc  2'),
}

# Mock tag records keyed by record name. Each record points at a span of a
# document through its start offset ('startIDx') and its length ('Lenght').
tags_dict = {
    'Dict0': dict(ID=0, DocID=0, startIDx=25, Lenght=88, TagFamID=0),
    'Dict1': dict(ID=0, DocID=0, startIDx=0, Lenght=24, TagFamID=0),
    'Dict2': dict(ID=0, DocID=2, startIDx=150, Lenght=10, TagFamID=2),
}

# Mock user feedback: a copy of each tag record extended with the feedback fields.
# dict(mapping, **extra) builds a new dict, so the tag records are not shared.
user_feedback_dict = {
    'Dict0': dict(tags_dict['Dict0'], user_generated=False, similariry_score=0.6, user_feedback=0),
    'Dict1': dict(tags_dict['Dict1'], user_generated=False, similariry_score=0.8, user_feedback=1),
    'Dict2': dict(tags_dict['Dict2'], user_generated=False, similariry_score=0.7, user_feedback=1),
}

# Extract User Feedback from JSON:

In [6]:
# Let's extract the data first: one [sentence, feedback] pair per feedback record
# that belongs to document 0. The sentence is the slice of text1 that starts at
# 'startIDx' and spans 'Lenght' characters.
fb_sent = [
    [text1[record['startIDx']:record['startIDx'] + record['Lenght']], record['user_feedback']]
    for record in user_feedback_dict.values()
    if record['DocID'] == 0
]

In [7]:
# Tagged sentences of document 0: the slice of text1 that starts at 'startIDx'
# and spans 'Lenght' characters, one per matching tag record.
tags_list = [
    text1[tag['startIDx']:tag['startIDx'] + tag['Lenght']]
    for tag in tags_dict.values()
    if tag['DocID'] == 0
]

In [8]:
# Turn the [sentence, feedback] pairs into a frame and pair every feedback
# sentence with the first tagged sentence.
fb_sent = pd.DataFrame(fb_sent, columns = ['sentence1', 'feedback'])
# Assigning a scalar broadcasts it to every row. The previous list comprehension
# iterated over the DataFrame itself, which yields column labels rather than rows,
# so its length only matched when the frame happened to have exactly two rows.
fb_sent['sentence2'] = tags_list[0]

In [9]:
# Put the feedback label first, followed by the two sentences, then display the frame.
fb_sent = fb_sent.loc[:, ['feedback', 'sentence1', 'sentence2']]
fb_sent

Unnamed: 0,feedback,sentence1,sentence2
0,0,promarket economists dont object corporations blatantly use snob appeal promote products,promarket economists dont object corporations blatantly use snob appeal promote products
1,1,last year wanted murder.,promarket economists dont object corporations blatantly use snob appeal promote products


# Load Model

In [10]:
# Print timestamped log records (INFO and above) via the sentence-transformers
# LoggingHandler, so progress and debug messages appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [11]:
# Work from the data folder so that the relative dataset and output paths used
# below resolve inside it; the previous working directory is kept in current_path.
current_path = os.getcwd()
path = '/Users/patrickrs/Documents/GitLab/revealapp/00_exploration/data/'
os.chdir(path)

# Training configuration
model_name = 'bert-base-nli-mean-tokens'  # change this to our trained model later
num_epochs = 4
train_batch_size = 128
# The output folder name embeds the model name and the time this cell was run.
model_save_path = ''.join([
    'output/training_model_continue_training-',
    model_name,
    '-',
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
])
# Reader for the STS benchmark files stored under stsbenchmark/.
sts_reader = STSDataReader('stsbenchmark/', normalize_scores=True)

In [15]:
# Load a pre-trained sentence transformer model
# The working directory is switched back to the folder the notebook started in
# (current_path) for the load, and returned to the data folder (path) afterwards.
# NOTE(review): presumably this keeps model-name resolution independent of the
# data folder's contents — confirm the reason for the directory switch.
os.chdir(current_path)
model = SentenceTransformer(model_name)
os.chdir(path)

2020-04-16 07:41:12 - Load pretrained SentenceTransformer: bert-base-nli-mean-tokens
2020-04-16 07:41:12 - Did not find a '/' or '\' in the name. Assume to download model from server.
2020-04-16 07:41:12 - Load SentenceTransformer from folder: /Users/patrickrs/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-mean-tokens.zip
2020-04-16 07:41:12 - loading configuration file /Users/patrickrs/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-mean-tokens.zip/0_BERT/config.json
2020-04-16 07:41:12 - Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  

In [149]:
class DataReader:
    """
    Reads our data. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)

    :param dataset_folder: folder that contains the data files
    :param s1_col_idx: column index of the first sentence
    :param s2_col_idx: column index of the second sentence
    :param score_col_idx: column index of the score (label)
    :param delimiter: field delimiter handed to csv.reader
    :param quoting: quoting mode handed to csv.reader
    :param normalize_scores: when true, scores are rescaled with
        (score - min_score) / (max_score - min_score)
    :param min_score: lower bound used for normalization
    :param max_score: upper bound used for normalization
    """
    def __init__(self, dataset_folder, s1_col_idx=1, s2_col_idx=2, score_col_idx=0, delimiter=",",
                 quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5):
        self.dataset_folder = dataset_folder
        self.score_col_idx = score_col_idx
        self.s1_col_idx = s1_col_idx
        self.s2_col_idx = s2_col_idx
        self.delimiter = delimiter
        self.quoting = quoting
        self.normalize_scores = normalize_scores
        self.min_score = min_score
        self.max_score = max_score

    def get_examples(self, filename, max_examples=0):
        """
        filename specified which data split to use (train.csv, dev.csv, test.csv).

        Files whose name ends in '.gz' are read through gzip; everything else is
        opened as plain UTF-8 text.

        :param filename: file name relative to dataset_folder
        :param max_examples: stop after this many examples; 0 (the default) reads all rows
        :return: list of InputExample objects with guid = filename + row index,
            texts = [sentence1, sentence2] and label = (optionally normalized) score
        """
        filepath = os.path.join(self.dataset_folder, filename)
        if filename.endswith('.gz'):
            handle = gzip.open(filepath, 'rt', encoding='utf8')
        else:
            handle = open(filepath, encoding="utf-8")

        examples = []
        # The context manager closes the file when reading finishes or raises;
        # previously the handle was left open.
        with handle as fIn:
            rows = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting)
            for row_idx, row in enumerate(rows):
                score = float(row[self.score_col_idx])
                if self.normalize_scores:  # Normalize to a 0...1 value
                    score = (score - self.min_score) / (self.max_score - self.min_score)

                s1 = row[self.s1_col_idx]
                s2 = row[self.s2_col_idx]
                examples.append(InputExample(guid=filename+str(row_idx), texts=[s1, s2], label=score))

                if max_examples > 0 and len(examples) >= max_examples:
                    break

        return examples

In [135]:
# NOTE(review): csv.reader expects an iterable of lines (such as an open file
# object). Given a plain string it iterates over the string's characters, so this
# call does not read train_data.csv. Looks like a leftover scratch cell — confirm,
# then remove it or open the intended file first.
csv.reader('train_data.csv')

<_csv.reader at 0x10f74b350>

In [151]:
# Convert the dataset to a DataLoader ready for training.
# The training examples come from train_data.csv in the test/ folder, read with the
# DataReader class defined in this notebook; the log message used to claim that the
# STSbenchmark train split was being read.
mydatareader = DataReader('test/', normalize_scores=True)
logging.info("Read custom train dataset")
train_data = SentencesDataset(examples = mydatareader.get_examples('train_data.csv'), model = model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# The development data is the STS benchmark dev split; it is wrapped in the same
# dataset/loader types (unshuffled) and handed to the similarity evaluator.
logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

Convert dataset:  95%|█████████▍| 620/654 [00:00<00:00, 3213.92it/s]

2020-04-16 10:31:58 - Read STSbenchmark train dataset


Convert dataset: 100%|██████████| 654/654 [00:00<00:00, 3046.88it/s]
Convert dataset:  21%|██        | 317/1500 [00:00<00:00, 3165.10it/s]

2020-04-16 10:31:58 - Num sentences: 654
2020-04-16 10:31:58 - Sentences 0 longer than max_seqence_length: 0
2020-04-16 10:31:58 - Sentences 1 longer than max_seqence_length: 0
2020-04-16 10:31:58 - Read STSbenchmark dev dataset


Convert dataset: 100%|██████████| 1500/1500 [00:00<00:00, 2077.56it/s]

2020-04-16 10:31:59 - Num sentences: 1500
2020-04-16 10:31:59 - Sentences 0 longer than max_seqence_length: 0
2020-04-16 10:31:59 - Sentences 1 longer than max_seqence_length: 0





[<sentence_transformers.readers.InputExample.InputExample at 0x10f755b90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755510>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755e90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755d50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755ed0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996c10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996c50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996790>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996690>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996a10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49993e50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49993d10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49995ed0>,
 <sentence_transf

In [107]:
# DELETE THIS CELL
#SentencesDataset
def get_examples(data, max_examples=0):
    """
    Build one InputExample per row of `data`.

    :param data: object indexable by 'sentence1', 'sentence2' and 'sim', each giving
        an iterable of equal length (the three are zipped together)
    :param max_examples: stop after this many examples; 0 (the default) uses all rows
    :return: list of InputExample objects with guid 'id_name-<row index>',
        texts = [sentence1, sentence2] and label = that row's 'sim' value
    """
    s1 = data['sentence1']
    s2 = data['sentence2']
    labels = data['sim']

    examples = []
    for row_idx, (sentence_a, sentence_b, label) in enumerate(zip(s1, s2, labels)):
        guid = "%s-%d" % ('id_name', row_idx)
        # Use this row's label; passing `labels` attached the whole label column
        # to every single example.
        examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=label))

        if 0 < max_examples <= len(examples):
            break
    return examples

[<sentence_transformers.readers.InputExample.InputExample at 0x1a7e1b81d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996810>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996c90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996510>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996dd0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996f10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996e90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996d10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4992f050>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40b8d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40bd90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40b810>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40b710>,
 <sentence_t

In [153]:
# Configure the training: warm up for 10% of the (fractional) number of training
# steps, i.e. examples * epochs / batch size, rounded up.
total_train_steps = len(train_data) * num_epochs / train_batch_size
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

2020-04-16 10:32:21 - Warmup-steps: 3


In [154]:
# Train the model on the custom training data and score it with `evaluator`.
# NOTE(review): output_path is model_name, so fit() writes into a folder called
# 'bert-base-nli-mean-tokens' (see the "Save model to" log line) rather than
# model_save_path — confirm this is intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=100,
    output_path=model_name,
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]
Iteration:   0%|          | 0/6 [00:00<?, ?it/s][A
Iteration:  17%|█▋        | 1/6 [00:15<01:16, 15.25s/it][A
Iteration:  33%|███▎      | 2/6 [00:29<01:00, 15.21s/it][A
Iteration:  50%|█████     | 3/6 [00:47<00:46, 15.36s/it][A
Iteration:  67%|██████▋   | 4/6 [01:02<00:30, 15.31s/it][A
Iteration:  83%|████████▎ | 5/6 [01:19<00:15, 15.42s/it][A
Iteration: 100%|██████████| 6/6 [01:22<00:00, 13.76s/it][A
Epoch:   0%|          | 0/4 [01:22<?, ?it/s]
Convert Evaluating:   0%|          | 0/12 [00:00<?, ?it/s][A

2020-04-16 10:33:46 - Evaluation the model on  dataset after epoch 0:



Convert Evaluating:   8%|▊         | 1/12 [00:04<00:50,  4.63s/it][A
Convert Evaluating:  17%|█▋        | 2/12 [00:09<00:47,  4.76s/it][A
Convert Evaluating:  25%|██▌       | 3/12 [00:15<00:44,  4.99s/it][A
Convert Evaluating:  33%|███▎      | 4/12 [00:21<00:43,  5.42s/it][A
Convert Evaluating:  42%|████▏     | 5/12 [00:31<00:47,  6.73s/it][A
Convert Evaluating:  50%|█████     | 6/12 [00:44<00:51,  8.51s/it][A
Convert Evaluating:  58%|█████▊    | 7/12 [00:53<00:44,  8.82s/it][A
Convert Evaluating:  67%|██████▋   | 8/12 [01:03<00:36,  9.03s/it][A
Convert Evaluating:  75%|███████▌  | 9/12 [01:14<00:29,  9.69s/it][A
Convert Evaluating:  83%|████████▎ | 10/12 [01:25<00:20, 10.17s/it][A
Convert Evaluating:  92%|█████████▏| 11/12 [01:31<00:08,  8.78s/it][A
Convert Evaluating: 100%|██████████| 12/12 [01:35<00:00,  7.93s/it][A
Epoch:   0%|          | 0/4 [02:57<?, ?it/s]

2020-04-16 10:35:21 - Cosine-Similarity :	Pearson: 0.5240	Spearman: 0.5845
2020-04-16 10:35:21 - Manhattan-Distance:	Pearson: 0.5820	Spearman: 0.5856
2020-04-16 10:35:21 - Euclidean-Distance:	Pearson: 0.5804	Spearman: 0.5855
2020-04-16 10:35:21 - Dot-Product-Similarity:	Pearson: 0.4837	Spearman: 0.4749
2020-04-16 10:35:21 - Save model to bert-base-nli-mean-tokens
2020-04-16 10:35:21 - Configuration saved in bert-base-nli-mean-tokens/0_BERT/config.json


Epoch:  25%|██▌       | 1/4 [02:58<08:55, 178.48s/it]
Iteration:   0%|          | 0/6 [00:00<?, ?it/s][A

2020-04-16 10:35:22 - Model weights saved in bert-base-nli-mean-tokens/0_BERT/pytorch_model.bin



Iteration:  17%|█▋        | 1/6 [00:15<01:16, 15.36s/it][A
Iteration:  33%|███▎      | 2/6 [00:29<01:01, 15.29s/it][A
Iteration:  50%|█████     | 3/6 [00:47<00:46, 15.41s/it][A
Iteration:  67%|██████▋   | 4/6 [01:01<00:30, 15.36s/it][A
Iteration:  83%|████████▎ | 5/6 [01:20<00:15, 15.52s/it][A
Iteration: 100%|██████████| 6/6 [01:22<00:00, 13.79s/it][A
Epoch:  25%|██▌       | 1/4 [04:21<08:55, 178.48s/it]
Convert Evaluating:   0%|          | 0/12 [00:00<?, ?it/s][A

2020-04-16 10:36:44 - Evaluation the model on  dataset after epoch 1:



Convert Evaluating:   8%|▊         | 1/12 [00:04<00:52,  4.75s/it][A
Convert Evaluating:  17%|█▋        | 2/12 [00:09<00:48,  4.84s/it][A
Convert Evaluating:  25%|██▌       | 3/12 [00:15<00:45,  5.07s/it][A
Convert Evaluating:  33%|███▎      | 4/12 [00:21<00:43,  5.49s/it][A
Convert Evaluating:  42%|████▏     | 5/12 [00:31<00:47,  6.82s/it][A
Convert Evaluating:  50%|█████     | 6/12 [00:44<00:51,  8.58s/it][A
Convert Evaluating:  58%|█████▊    | 7/12 [00:54<00:44,  8.89s/it][A
Convert Evaluating:  67%|██████▋   | 8/12 [01:03<00:36,  9.11s/it][A
Convert Evaluating:  75%|███████▌  | 9/12 [01:15<00:29,  9.93s/it][A
Convert Evaluating:  83%|████████▎ | 10/12 [01:27<00:20, 10.39s/it][A
Convert Evaluating:  92%|█████████▏| 11/12 [01:32<00:08,  8.97s/it][A
Convert Evaluating: 100%|██████████| 12/12 [01:36<00:00,  8.06s/it][A
Epoch:  50%|█████     | 2/4 [05:58<05:57, 178.80s/it]
Iteration:   0%|          | 0/6 [00:00<?, ?it/s][A

2020-04-16 10:38:21 - Cosine-Similarity :	Pearson: 0.3684	Spearman: 0.5002
2020-04-16 10:38:21 - Manhattan-Distance:	Pearson: 0.4618	Spearman: 0.4998
2020-04-16 10:38:21 - Euclidean-Distance:	Pearson: 0.4608	Spearman: 0.5005
2020-04-16 10:38:21 - Dot-Product-Similarity:	Pearson: -0.0039	Spearman: -0.0059



Iteration:  17%|█▋        | 1/6 [00:18<01:33, 18.76s/it][A
Iteration:  33%|███▎      | 2/6 [00:36<01:14, 18.72s/it][A
Iteration:  50%|█████     | 3/6 [00:53<00:55, 18.61s/it][A
Iteration:  67%|██████▋   | 4/6 [01:08<00:36, 18.44s/it][A
Iteration:  83%|████████▎ | 5/6 [01:22<00:18, 18.25s/it][A
Iteration: 100%|██████████| 6/6 [01:25<00:00, 14.29s/it][A
Epoch:  50%|█████     | 2/4 [07:23<05:57, 178.80s/it]
Convert Evaluating:   0%|          | 0/12 [00:00<?, ?it/s][A

2020-04-16 10:39:47 - Evaluation the model on  dataset after epoch 2:



Convert Evaluating:   8%|▊         | 1/12 [00:04<00:50,  4.62s/it][A
Convert Evaluating:  17%|█▋        | 2/12 [00:09<00:47,  4.78s/it][A
Convert Evaluating:  25%|██▌       | 3/12 [00:15<00:45,  5.06s/it][A
Convert Evaluating:  33%|███▎      | 4/12 [00:21<00:43,  5.49s/it][A
Convert Evaluating:  42%|████▏     | 5/12 [00:31<00:47,  6.81s/it][A
Convert Evaluating:  50%|█████     | 6/12 [00:44<00:51,  8.59s/it][A
Convert Evaluating:  58%|█████▊    | 7/12 [00:54<00:44,  8.96s/it][A
Convert Evaluating:  67%|██████▋   | 8/12 [01:04<00:36,  9.18s/it][A
Convert Evaluating:  75%|███████▌  | 9/12 [01:15<00:29,  9.82s/it][A
Convert Evaluating:  83%|████████▎ | 10/12 [01:26<00:20, 10.32s/it][A
Convert Evaluating:  92%|█████████▏| 11/12 [01:32<00:08,  8.92s/it][A
Convert Evaluating: 100%|██████████| 12/12 [01:36<00:00,  8.07s/it][A
Epoch:  75%|███████▌  | 3/4 [09:00<02:59, 179.94s/it]
Iteration:   0%|          | 0/6 [00:00<?, ?it/s][A

2020-04-16 10:41:24 - Cosine-Similarity :	Pearson: 0.3803	Spearman: 0.5022
2020-04-16 10:41:24 - Manhattan-Distance:	Pearson: 0.4703	Spearman: 0.5010
2020-04-16 10:41:24 - Euclidean-Distance:	Pearson: 0.4693	Spearman: 0.5021
2020-04-16 10:41:24 - Dot-Product-Similarity:	Pearson: -0.0131	Spearman: -0.0142



Iteration:  17%|█▋        | 1/6 [00:14<01:10, 14.07s/it][A
Iteration:  33%|███▎      | 2/6 [00:33<00:57, 14.32s/it][A
Iteration:  50%|█████     | 3/6 [00:51<00:43, 14.54s/it][A
Iteration:  67%|██████▋   | 4/6 [01:07<00:29, 14.60s/it][A
Iteration:  83%|████████▎ | 5/6 [01:24<00:14, 14.71s/it][A
Iteration: 100%|██████████| 6/6 [01:27<00:00, 14.52s/it][A
Epoch:  75%|███████▌  | 3/4 [10:27<02:59, 179.94s/it]
Convert Evaluating:   0%|          | 0/12 [00:00<?, ?it/s][A

2020-04-16 10:42:51 - Evaluation the model on  dataset after epoch 3:



Convert Evaluating:   8%|▊         | 1/12 [00:04<00:51,  4.65s/it][A
Convert Evaluating:  17%|█▋        | 2/12 [00:09<00:48,  4.82s/it][A
Convert Evaluating:  25%|██▌       | 3/12 [00:15<00:45,  5.09s/it][A
Convert Evaluating:  33%|███▎      | 4/12 [00:22<00:44,  5.54s/it][A
Convert Evaluating:  42%|████▏     | 5/12 [00:32<00:48,  6.93s/it][A
Convert Evaluating:  50%|█████     | 6/12 [00:45<00:52,  8.68s/it][A
Convert Evaluating:  58%|█████▊    | 7/12 [00:54<00:45,  9.01s/it][A
Convert Evaluating:  67%|██████▋   | 8/12 [01:04<00:36,  9.24s/it][A
Convert Evaluating:  75%|███████▌  | 9/12 [01:16<00:29,  9.87s/it][A
Convert Evaluating:  83%|████████▎ | 10/12 [01:27<00:20, 10.40s/it][A
Convert Evaluating:  92%|█████████▏| 11/12 [01:33<00:09,  9.02s/it][A
Convert Evaluating: 100%|██████████| 12/12 [01:37<00:00,  8.12s/it][A
Epoch: 100%|██████████| 4/4 [12:05<00:00, 181.32s/it]

2020-04-16 10:44:28 - Cosine-Similarity :	Pearson: 0.3877	Spearman: 0.5024
2020-04-16 10:44:28 - Manhattan-Distance:	Pearson: 0.4711	Spearman: 0.5010
2020-04-16 10:44:28 - Euclidean-Distance:	Pearson: 0.4699	Spearman: 0.5017
2020-04-16 10:44:28 - Dot-Product-Similarity:	Pearson: -0.0358	Spearman: -0.0511





In [155]:
# Save the model in its current state under model_save_path, the timestamped
# output folder name built in the configuration cell.
model.save(model_save_path)

2020-04-16 10:45:58 - Save model to output/training_model_continue_training-bert-base-nli-mean-tokens-2020-04-16_07-39-09
2020-04-16 10:45:58 - Configuration saved in output/training_model_continue_training-bert-base-nli-mean-tokens-2020-04-16_07-39-09/0_BERT/config.json
2020-04-16 10:45:58 - Model weights saved in output/training_model_continue_training-bert-base-nli-mean-tokens-2020-04-16_07-39-09/0_BERT/pytorch_model.bin


In [None]:
# Scratch cell: evaluating the bare name displays the DataLoader object's repr.
train_dataloader