In [156]:
from torch.utils.data import DataLoader
import math
import os
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader, LabelSentenceReader, InputExample
import logging
from datetime import datetime
import csv
import gzip


In [2]:
# Load the dataset by executing the two cleaning notebooks from inside their
# own source folder, then switch back to the directory we started in.
# NOTE(review): `path` is an absolute, machine-specific location — consider making it configurable.
path = '/Users/patrickrs/Documents/GitLab/revealapp/10_cleaning/src'
current_path = os.getcwd()  # remembered so the working directory can be restored below
os.chdir(path)  # the relative ./ notebook paths below are given relative to this folder
# NOTE(review): later cells use `data`, `np` and `pd` without defining or importing them in
# this notebook; presumably these two notebooks put them into the namespace — confirm.
%run ./Load+Clean_News.ipynb
%run ./cont_to_cat_News.ipynb
os.chdir(current_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Keep only the similarity label and the two sentence columns, in that order.
data = data[['sim', 'sentence1', 'sentence2']]
# NOTE(review): with default arguments to_csv also writes the index column and a header row,
# so the file's columns would not line up with DataReader's default indices
# (score=0, s1=1, s2=2) — confirm how train_data.csv is consumed.
data.to_csv('train_data.csv')


In [4]:
# Create mock corpus: all sentence1 values followed by all sentence2 values.
corpus = np.concatenate((data['sentence1'].values, data['sentence2'].values), axis = 0)
# And mock text string: every sentence followed by ". ", assembled in a single pass.
# (Growing a str with `text1 = text1 + ...` inside a loop re-copies the whole text on
# each iteration; str.join builds the same string once.)
text1 = "".join(sent + ". " for sent in corpus)

In [250]:
# Let's first create dictionaries with mock documents, tags and user feedback.
# Each dictionary is column-oriented: {column name: {row key: value}}.

# Mock documents: document 0 is the concatenated corpus text (text1); the other two are toy texts.
doc_dict = {
            "DocID":{"0":0,"1":2,"2":3},
            "FullText":{
                "0":text1,
                "1":"This is a random document with ID 2. In this document we are talking about dogs. And there is also something about mice.",
                "2":"This is a random document with ID 3. this doc is exclusivly about Elephants. So no mice, cats nor anything else."
                        }  
            }

# Mock tags: a tag is a character span (startIdx, Length) inside the document given by DocID.
# NOTE(review): tag 2 points at DocID 1, which does not appear in doc_dict's DocID values (0, 2, 3).
tags_dict  = {
              "TagID":{"0":1,"1":2},
              "DocID":{"0":0,"1":1},
              "startIdx":{"0":37,"1":81},
              "Length":{"0":43,"1":39}
            } 

# Mock user feedback: each entry is a character span in a document, the TagID it is
# associated with, a similarity score and an Accepted value (None, 0.0 or 1.0).
# NOTE(review): feedback 1 points at DocID 1, which does not appear in doc_dict's DocID values.
user_feedback_dict = {
                      "fbID":{"0":1,"1":2,"2":3,"3":4},
                      "DocID":{"0":1,"1":0,"2":0,"3":3},
                      "startIdx":{"0":81,"1":81,"2":37,"3":77},
                      "Length":{"0":39,"1":39,"2":39,"3":34},
                      "TagID":{"0":1,"1":1,"2":1,"3":1},
                      "SimilarityScore":{"0":0.74,"1":0.5,"2":0.8,"3":0.77},
                      "Accepted":{"0":None,"1":0.0,"2":1.0,"3":1.0}
                    }

In [220]:
pd.DataFrame(user_feedback_dict).dropna()

Unnamed: 0,fbID,DocID,startIdx,Length,TagID,SimilarityScore,Accepted
1,2,0,81,39,1,0.5,0.0
2,3,0,37,39,1,0.8,1.0
3,4,3,77,34,1,0.77,1.0


# Extract User Feedback from JSON:

In [216]:
# Build the feedback table and drop every row that has a missing value
# (in the mock data that is the entry whose "Accepted" is None).
user_feedback_df = pd.DataFrame(user_feedback_dict).dropna()

In [265]:
# Let's extract the data first:
# one [text span, Accepted, TagID] entry per feedback row that refers to document 0,
# where the span is text1[startIdx : startIdx + Length].
fb_sent = [  # feedback sentences
    [
        text1[int(fb['startIdx']):int(fb['startIdx'] + fb['Length'])],
        fb['Accepted'],
        fb['TagID'],
    ]
    for _, fb in user_feedback_df.iterrows()
    if fb['DocID'] == 0
]

In [266]:
# Turn the list of [sentence, Accepted, TagID] entries into a DataFrame.
# Note that this rebinds `fb_sent` from a list to a DataFrame.
fb_sent = pd.DataFrame(fb_sent, columns = ['sentence1', 'Accepted', 'TagID'])

In [267]:
# Build the tag table (one row per tag), dropping rows with missing values, and display it.
tags_df = pd.DataFrame(tags_dict).dropna()
tags_df

Unnamed: 0,TagID,DocID,startIdx,Length
0,1,0,37,43
1,2,1,81,39


In [268]:
# Now let's match the user feedback to the original tags:
# collect the text span text1[startIdx : startIdx + Length] of every tag located in document 0.
tags_list = [
    text1[int(tag['startIdx']):int(tag['startIdx'] + tag['Length'])]
    for _, tag in tags_df.iterrows()
    if tag['DocID'] == 0
]

In [269]:
# Pair each feedback sentence with the text of the tag it refers to, matched on TagID,
# instead of giving every row the first tag's text regardless of its TagID.
# Only tags located in document 0 can be resolved, because text1 is the only text sliced here.
tag_text_by_id = {
    int(tag['TagID']): text1[int(tag['startIdx']):int(tag['startIdx'] + tag['Length'])]
    for _, tag in tags_df.iterrows()
    if tag['DocID'] == 0
}
# TagID is stored as a float in fb_sent (it displays as 1.0), so cast it before the lookup.
# Feedback whose tag cannot be resolved gets NaN rather than an unrelated sentence.
fb_sent['sentence2'] = fb_sent['TagID'].astype(int).map(tag_text_by_id)

In [270]:
fb_sent

Unnamed: 0,sentence1,Accepted,TagID,sentence2
0,use snob appeal promote products. perha,0.0,1.0,onomists dont object corporations blatantly
1,onomists dont object corporations blata,1.0,1.0,onomists dont object corporations blatantly


In [272]:
# Keep label first, then the two sentences (TagID is dropped). This column order matches
# DFReader's default positions: score_col_idx=0, s1_col_idx=1, s2_col_idx=2.
fb_sent = fb_sent[['Accepted','sentence1', 'sentence2']] #reordering cols
fb_sent

Unnamed: 0,Accepted,sentence1,sentence2
0,0.0,use snob appeal promote products. perha,onomists dont object corporations blatantly
1,1.0,onomists dont object corporations blata,onomists dont object corporations blatantly


# Load Model

In [278]:
list(enumerate(fb_sent.values))

[(0,
  array([0.0, 'use snob appeal promote products. perha',
         'onomists dont object corporations blatantly'], dtype=object)),
 (1,
  array([1.0, 'onomists dont object corporations blata',
         'onomists dont object corporations blatantly'], dtype=object))]

In [10]:
#### Just some code to print debug information to stdout
# Messages at INFO level and above are emitted as "<timestamp> - <message>",
# routed through sentence_transformers' LoggingHandler.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

In [288]:
# Switch into the data folder so the relative paths below ('stsbenchmark/', 'output/...')
# are relative to it. Nothing is loaded here yet; this cell only sets up configuration.
# NOTE(review): `path` is an absolute, machine-specific location, and this cell leaves the
# working directory changed (current_path is only restored in a later cell).
path = '/Users/patrickrs/Documents/GitLab/revealapp/00_exploration/data/'
current_path = os.getcwd()
os.chdir(path)

# Training configuration and the STS benchmark reader
model_name = 'bert-base-nli-mean-tokens' # change this to our trained model later
train_batch_size = 128
num_epochs = 4
# The timestamp is taken when this cell runs, not when the model is saved.
model_save_path = 'output/training_model_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSDataReader('stsbenchmark/', normalize_scores=True) 

In [15]:
# Load a pre-trained sentence transformer model.
# The working directory is switched back to the starting directory for the load and then
# returned to the data folder, so later relative paths again refer to `path`.
os.chdir(current_path)
model = SentenceTransformer(model_name)
os.chdir(path)

2020-04-16 07:41:12 - Load pretrained SentenceTransformer: bert-base-nli-mean-tokens
2020-04-16 07:41:12 - Did not find a '/' or '\' in the name. Assume to download model from server.
2020-04-16 07:41:12 - Load SentenceTransformer from folder: /Users/patrickrs/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-mean-tokens.zip
2020-04-16 07:41:12 - loading configuration file /Users/patrickrs/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-mean-tokens.zip/0_BERT/config.json
2020-04-16 07:41:12 - Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  

In [149]:
# this reader works for csv and gzip files. There is one for dataframes below.
class DataReader:
    """
    Reads sentence-pair data from a delimited text file, optionally gzip-compressed.

    Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx).

    :param dataset_folder: folder that contains the data files
    :param s1_col_idx: column position of the first sentence
    :param s2_col_idx: column position of the second sentence
    :param score_col_idx: column position of the label / score
    :param delimiter: field delimiter passed to csv.reader
    :param quoting: quoting mode passed to csv.reader
    :param normalize_scores: if True, scores are rescaled from [min_score, max_score] to [0, 1]
    :param min_score: lowest possible raw score (used for normalization)
    :param max_score: highest possible raw score (used for normalization)
    """
    def __init__(self, dataset_folder, s1_col_idx=1, s2_col_idx=2, score_col_idx=0, delimiter=",",
                 quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5):
        self.dataset_folder = dataset_folder
        self.score_col_idx = score_col_idx
        self.s1_col_idx = s1_col_idx
        self.s2_col_idx = s2_col_idx
        self.delimiter = delimiter
        self.quoting = quoting
        self.normalize_scores = normalize_scores
        self.min_score = min_score
        self.max_score = max_score

    def get_examples(self, filename, max_examples=0):
        """
        Read one data split and return it as a list of InputExample objects.

        :param filename: which data split to use (train.csv, dev.csv, test.csv), relative to
            dataset_folder; names ending in '.gz' are read as gzip-compressed text
        :param max_examples: stop after this many examples; 0 (or less) means read everything
        :return: list of InputExample with guid = filename + row number, texts = [s1, s2]
            and label = the (optionally normalized) score
        :raises ValueError: if a score field cannot be converted to float
        """
        filepath = os.path.join(self.dataset_folder, filename)
        if filename.endswith('.gz'):
            fIn = gzip.open(filepath, 'rt', encoding='utf8')
        else:
            fIn = open(filepath, encoding="utf-8")

        examples = []
        # The context manager closes the file on every exit path (normal return,
        # early break, or a malformed row raising).
        with fIn:
            rows = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting)
            for row_idx, row in enumerate(rows):
                score = float(row[self.score_col_idx])
                if self.normalize_scores:  # Normalize to a 0...1 value
                    score = (score - self.min_score) / (self.max_score - self.min_score)

                s1 = row[self.s1_col_idx]
                s2 = row[self.s2_col_idx]
                examples.append(InputExample(guid=filename+str(row_idx), texts=[s1, s2], label=score))

                if max_examples > 0 and len(examples) >= max_examples:
                    break

        return examples

In [283]:
# this reader works for dataframes.
class DFReader:
    """
    Reads sentence-pair data from a DataFrame.

    Each row contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx).
    All three are column *positions*, not column names.

    :param s1_col_idx: column position of the first sentence
    :param s2_col_idx: column position of the second sentence
    :param score_col_idx: column position of the label / score
    """
    def __init__(self, s1_col_idx=1, s2_col_idx=2, score_col_idx=0):

        self.score_col_idx = score_col_idx
        self.s1_col_idx = s1_col_idx
        self.s2_col_idx = s2_col_idx

    def get_examples(self, df, max_examples=0):
        """
        Convert the rows of a DataFrame into a list of InputExample objects.

        :param df: DataFrame holding the score and the two sentences at the configured positions
        :param max_examples: stop after this many examples; 0 (or less) means use every row
        :return: list of InputExample with guid = 'temp_df' + row index label, texts = [s1, s2]
            and label = the score converted to float
        :raises ValueError: if a score cannot be converted to float
        """
        examples = []
        for row_label, row in df.iterrows():
            # .iloc makes the positional lookup explicit; plain row[int] on a Series with
            # string labels depends on a positional fallback that pandas has deprecated.
            score = float(row.iloc[self.score_col_idx])
            s1 = row.iloc[self.s1_col_idx]
            s2 = row.iloc[self.s2_col_idx]
            examples.append(InputExample(guid='temp_df'+str(row_label), texts=[s1, s2], label=score))

            # Honor max_examples the same way DataReader.get_examples does.
            if max_examples > 0 and len(examples) >= max_examples:
                break

        return examples

In [290]:
# Convert the dataset to a DataLoader ready for training
logging.info("Read feedback dataset")
# Training data: the user-feedback pairs in fb_sent (label = Accepted, texts = sentence1/sentence2).
train_data = SentencesDataset(examples = DFReader().get_examples(fb_sent), model = model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
# Evaluation data: only the first 10 examples of the STS benchmark dev split are used.
# The file name is relative to the reader's 'stsbenchmark/' folder.
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv', max_examples = 10), model=model) # converts to embedding
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 

Convert dataset: 100%|██████████| 2/2 [00:00<00:00, 2070.24it/s]
Convert dataset: 100%|██████████| 10/10 [00:00<00:00, 3455.51it/s]

2020-04-17 08:50:22 - Read feedback dataset
2020-04-17 08:50:22 - Num sentences: 2
2020-04-17 08:50:22 - Sentences 0 longer than max_seqence_length: 0
2020-04-17 08:50:22 - Sentences 1 longer than max_seqence_length: 0
2020-04-17 08:50:22 - Read STSbenchmark dev dataset
2020-04-17 08:50:22 - Num sentences: 10
2020-04-17 08:50:22 - Sentences 0 longer than max_seqence_length: 0
2020-04-17 08:50:22 - Sentences 1 longer than max_seqence_length: 0





[<sentence_transformers.readers.InputExample.InputExample at 0x10f755b90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755510>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755e90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755d50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x10f755ed0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996c10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996c50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996790>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996690>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996a10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49993e50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49993d10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49995ed0>,
 <sentence_transf

In [107]:
# DELETE THIS CELL
#SentencesDataset
def get_examples(data, max_examples=0):
    """
    Build InputExample objects from a table with 'sentence1', 'sentence2' and 'sim' columns.

    :param data: mapping / DataFrame whose 'sentence1', 'sentence2' and 'sim' entries are
        equally long iterables
    :param max_examples: stop after this many examples; 0 (or less) means use everything
    :return: list of InputExample with guid 'id_name-<n>', texts = [sentence1, sentence2]
        and label = that row's 'sim' value
    """
    s1 = data['sentence1']
    s2 = data['sentence2']
    labels = data['sim']

    examples = []
    for idx, (sentence_a, sentence_b, label) in enumerate(zip(s1, s2, labels)):
        guid = "%s-%d" % ('id_name', idx)
        # Use this row's own label; passing `labels` would attach the whole column to every example.
        examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=label))

        if 0 < max_examples <= len(examples):
            break
    return examples

[<sentence_transformers.readers.InputExample.InputExample at 0x1a7e1b81d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996810>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996c90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996510>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996dd0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996f10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996e90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a49996d10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4992f050>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40b8d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40bd90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40b810>,
 <sentence_transformers.readers.InputExample.InputExample at 0x1a4f40b710>,
 <sentence_t

In [285]:
# Configure the training. (An evaluator is passed to model.fit below, so evaluation is not skipped.)
# Warm-up covers 10% of the total number of training steps
# (examples * epochs / batch size), rounded up so it is at least 1 when there is any data.
warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) # 10% of training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

2020-04-17 08:38:32 - Warmup-steps: 1


In [292]:
# Train the model on the feedback pairs, evaluating with the STS dev evaluator.
# No output_path is given, so nothing is written to disk during fit;
# the model is saved explicitly in a later cell.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          #output_path=model_save_path # will use save below in order to overwrite
         )

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
Iteration: 100%|██████████| 1/1 [00:02<00:00,  2.42s/it][A
Epoch:   0%|          | 0/4 [00:02<?, ?it/s]
Convert Evaluating:   0%|          | 0/1 [00:00<?, ?it/s][A

2020-04-17 08:50:38 - Evaluation the model on  dataset after epoch 0:



Convert Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.84it/s][A
Epoch:  25%|██▌       | 1/4 [00:02<00:08,  2.75s/it]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A

2020-04-17 08:50:38 - Cosine-Similarity :	Pearson: -0.1779	Spearman: 0.0938
2020-04-17 08:50:38 - Manhattan-Distance:	Pearson: -0.1063	Spearman: 0.0938
2020-04-17 08:50:38 - Euclidean-Distance:	Pearson: -0.0961	Spearman: 0.0938
2020-04-17 08:50:38 - Dot-Product-Similarity:	Pearson: -0.2727	Spearman: -0.2001



Iteration: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it][A
Epoch:  25%|██▌       | 1/4 [00:05<00:08,  2.75s/it]
Convert Evaluating:   0%|          | 0/1 [00:00<?, ?it/s][A

2020-04-17 08:50:40 - Evaluation the model on  dataset after epoch 1:



Convert Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.69it/s][A
Epoch:  50%|█████     | 2/4 [00:05<00:05,  2.69s/it]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A

2020-04-17 08:50:41 - Cosine-Similarity :	Pearson: -0.1779	Spearman: 0.0938
2020-04-17 08:50:41 - Manhattan-Distance:	Pearson: -0.1053	Spearman: 0.0938
2020-04-17 08:50:41 - Euclidean-Distance:	Pearson: -0.0952	Spearman: 0.0938
2020-04-17 08:50:41 - Dot-Product-Similarity:	Pearson: -0.2644	Spearman: -0.1688



Iteration: 100%|██████████| 1/1 [00:02<00:00,  2.23s/it][A
Epoch:  50%|█████     | 2/4 [00:07<00:05,  2.69s/it]
Convert Evaluating:   0%|          | 0/1 [00:00<?, ?it/s][A

2020-04-17 08:50:43 - Evaluation the model on  dataset after epoch 2:



Convert Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s][A
Epoch:  75%|███████▌  | 3/4 [00:07<00:02,  2.65s/it]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A

2020-04-17 08:50:43 - Cosine-Similarity :	Pearson: -0.1768	Spearman: 0.0938
2020-04-17 08:50:43 - Manhattan-Distance:	Pearson: -0.1034	Spearman: 0.0938
2020-04-17 08:50:43 - Euclidean-Distance:	Pearson: -0.0936	Spearman: 0.0938
2020-04-17 08:50:43 - Dot-Product-Similarity:	Pearson: -0.2566	Spearman: -0.1688



Iteration: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it][A
Epoch:  75%|███████▌  | 3/4 [00:10<00:02,  2.65s/it]
Convert Evaluating:   0%|          | 0/1 [00:00<?, ?it/s][A

2020-04-17 08:50:46 - Evaluation the model on  dataset after epoch 3:



Convert Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.84it/s][A
Epoch: 100%|██████████| 4/4 [00:10<00:00,  2.60s/it]

2020-04-17 08:50:46 - Cosine-Similarity :	Pearson: -0.1753	Spearman: 0.0938
2020-04-17 08:50:46 - Manhattan-Distance:	Pearson: -0.1021	Spearman: 0.0938
2020-04-17 08:50:46 - Euclidean-Distance:	Pearson: -0.0924	Spearman: 0.0938
2020-04-17 08:50:46 - Dot-Product-Similarity:	Pearson: -0.2497	Spearman: -0.1688





In [294]:
# Persist the fine-tuned model. model_save_path is a relative 'output/...' path,
# so where it lands depends on the current working directory.
model.save(model_save_path)

2020-04-17 08:58:45 - Save model to output/training_model_continue_training-bert-base-nli-mean-tokens-2020-04-17_08-41-35
2020-04-17 08:58:45 - Configuration saved in output/training_model_continue_training-bert-base-nli-mean-tokens-2020-04-17_08-41-35/0_BERT/config.json
2020-04-17 08:58:46 - Model weights saved in output/training_model_continue_training-bert-base-nli-mean-tokens-2020-04-17_08-41-35/0_BERT/pytorch_model.bin


In [None]:
train_dataloader

In [None]:
models.