In [121]:
#https://www.aclweb.org/anthology/D19-1410.pdf

from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging
import os
import scipy

import torch
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from datetime import datetime

#plotting
import matplotlib.pyplot as plt
from matplotlib.image import NonUniformImage
from matplotlib import cm
import matplotlib.colors as mcolors

from sklearn.linear_model import SGDClassifier, LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, mean_squared_error

In [122]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Load data and prepare corpus:

In [123]:
# Load the dataset
current_path = os.getcwd()
os.chdir(path)
%run ./Load+Clean_News.ipynb
%run ./cont_to_cat_News.ipynb
os.chdir(current_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [124]:
# Create mock corpus
corpus = np.concatenate((data['sentence1'].values, data['sentence2'].values), axis = 0)

In [127]:
# Establishing some basic variables, names, and paths
model_name = 'bert-base-nli-mean-tokens' # change this to trained model later
train_batch_size = 128
num_epochs = 4
model_save_path = '../output/training_news_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [127]:
# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

# Tagged sentences

In [192]:
#Let's create a mock dictionary for practice:
# The user has tagged 3 sentences:

text1 = ""
for sent in corpus:
    text1 = text1 + sent + ". "


docs = {
        'Dict0': {'DocID': 0, 'FullText': text1},
        'Dict1': {'DocID': 1, 'FullText': 'bbbbbb bb b 2'},
        'Dict2': {'DocID': 2, 'FullText': 'ccccc cccc  2'},
       }

tags_dict = {
             'Dict0': {'ID': 0, 'DocID': 0, 'startIDx': 25, 'Lenght': 88, 'TagFamID': 0},
             'Dict1': {'ID': 0, 'DocID': 0, 'startIDx': 0, 'Lenght': 24, 'TagFamID': 0},
             'Dict2': {'ID': 0, 'DocID': 2, 'startIDx': 150, 'Lenght': 10, 'TagFamID': 2},
            }   

In [196]:
tags_dict.items()

dict_items([('Dict0', {'ID': 0, 'DocID': 0, 'startIDx': 25, 'Lenght': 88, 'TagFamID': 0}), ('Dict1', {'ID': 0, 'DocID': 0, 'startIDx': 0, 'Lenght': 24, 'TagFamID': 0}), ('Dict2', {'ID': 0, 'DocID': 2, 'startIDx': 150, 'Lenght': 10, 'TagFamID': 2})])

In [201]:
tags_list = []
for dic in tags_dict:
    if tags_dict[dic]['DocID'] == 0:
        start = tags_dict[dic]['startIDx'] # start if sentence
        end = tags_dict[dic]['startIDx'] + tags_dict[dic]['Lenght']
        tags_list.append(text1[start:end])   

In [202]:
tags_list # it works!

['promarket economists dont object corporations blatantly use snob appeal promote products',
 'last year wanted murder.']

# Encoding embeddings and tagged sentences

In [131]:
corpus_embeddings = model.encode(corpus)


Batches:   0%|          | 0/164 [00:00<?, ?it/s][A
Batches:   1%|          | 2/164 [00:00<00:09, 17.80it/s][A
Batches:   2%|▏         | 4/164 [00:00<00:09, 17.18it/s][A
Batches:   4%|▎         | 6/164 [00:00<00:09, 16.40it/s][A
Batches:   5%|▍         | 8/164 [00:00<00:09, 16.35it/s][A
Batches:   6%|▌         | 10/164 [00:00<00:10, 15.37it/s][A
Batches:   7%|▋         | 12/164 [00:00<00:09, 15.30it/s][A
Batches:   9%|▊         | 14/164 [00:00<00:10, 14.62it/s][A
Batches:  10%|▉         | 16/164 [00:01<00:10, 13.73it/s][A
Batches:  11%|█         | 18/164 [00:01<00:11, 13.00it/s][A
Batches:  12%|█▏        | 20/164 [00:01<00:11, 12.42it/s][A
Batches:  13%|█▎        | 22/164 [00:01<00:11, 12.24it/s][A
Batches:  15%|█▍        | 24/164 [00:01<00:11, 11.69it/s][A
Batches:  16%|█▌        | 26/164 [00:01<00:11, 11.77it/s][A
Batches:  17%|█▋        | 28/164 [00:02<00:11, 11.44it/s][A
Batches:  18%|█▊        | 30/164 [00:02<00:12, 10.76it/s][A
Batches:  20%|█▉        | 32/164 [00

In [132]:
tagged_embeddings = model.encode(tags_list)


Batches: 100%|██████████| 1/1 [00:00<00:00, 15.62it/s]


# List of most similar sentences

In [211]:
closest_n = 5
most_sim_id = []
for tag, tagged_embedding in zip(tags_list, tagged_embeddings):
    distances = scipy.spatial.distance.cdist([tagged_embedding], corpus_embeddings, "cosine")[0]
    
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx, distance in results[1:closest_n+1]: # 1 and +1 because I didn't exclude the tagged sentences from the file
        most_sim_id.append(idx) # will use this to create a json file later
        print(corpus[idx].strip(), "(Score: %.4f)" % (1-distance))






Query: month ago mubarak dismissed demands constitutional reform futile

Top 5 most similar sentences in corpus:
month ago mubarak dismissed demands constitutional reform futile (Score: 1.0000)
month ago mubarak dismissed demands constitutional reform futile (Score: 1.0000)
month ago mubarak dismissed demands constitutional reform futile (Score: 1.0000)
month ago mubarak dismissed demands constitutional reform futile (Score: 1.0000)
month ago mubarak dismissed demands constitutional reform futile (Score: 1.0000)




Query: month ago mubarak dismissed demands constitutional reform futile

Top 5 most similar sentences in corpus:
divisions reproduced local level (Score: 1.0000)
divisions reproduced local level (Score: 1.0000)
divisions reproduced local level (Score: 1.0000)
divisions reproduced local level (Score: 1.0000)
divisions reproduced local level (Score: 1.0000)


In [205]:
most_sim_id = [most_sim_id[x:x+closest_n] for x in range(0, len(most_sim_id), closest_n)]

In [206]:
most_sim_id

[[6, 51, 127, 182, 223], [206, 277, 288, 303, 335]]

In [207]:
results

[(10, 1.4377388168895777e-13),
 (206, 1.4377388168895777e-13),
 (277, 1.4377388168895777e-13),
 (288, 1.4377388168895777e-13),
 (303, 1.4377388168895777e-13),
 (335, 1.4377388168895777e-13),
 (942, 0.14073870040777825),
 (931, 0.19385541285059815),
 (957, 0.19934158623070075),
 (860, 0.24813301397290666),
 (664, 0.2704115282978903),
 (989, 0.2799268658186498),
 (914, 0.3327170344981166),
 (935, 0.4089176066124509),
 (998, 0.4089176066124509),
 (750, 0.41292370822201596),
 (873, 0.41829515556375396),
 (901, 0.41829515556375396),
 (713, 0.42542930890279873),
 (790, 0.4552147057889179),
 (133, 0.455885686073187),
 (139, 0.455885686073187),
 (145, 0.455885686073187),
 (212, 0.455885686073187),
 (227, 0.455885686073187),
 (237, 0.455885686073187),
 (265, 0.455885686073187),
 (792, 0.45777736227963284),
 (815, 0.46214566603212426),
 (35, 0.4671078136407737),
 (218, 0.4671078136407737),
 (232, 0.4671078136407737),
 (243, 0.4671078136407737),
 (262, 0.4671078136407737),
 (354, 0.46710781364077