### IMPORTS

In [23]:
import numpy as np
import pandas as pd
import os
from sentence_transformers import SentenceTransformer
import sys
import pickle

### INPUTS AND OUTPUT PATHS

In [21]:
# Read the repository root from a one-line text file, addressed relative to
# the notebook's working directory.
with open('../config/repository_path.txt', 'r') as reader:
    repository_path = reader.read().strip()
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if repository_path not in sys.path:
    sys.path.append(repository_path)

# Project-local import. It has to stay below the sys.path update above
# (presumably `lib` lives under repository_path -- confirm).
from lib import tdmstudio

# ------ #
# OUTPUT #
# ------ #
embedding_path = os.path.join(repository_path, 'embeddings')
# Fail early, with a readable message, if the output directory is missing.
assert os.path.exists(embedding_path), f'Output directory not found: {embedding_path}'
embedding_file = os.path.join(embedding_path, 'item_representation_sentence_bert.pickle')
# ----- #
# INPUT #
# ----- #
# NOTE(review): both paths below are absolute and machine-specific; consider
# deriving them from repository_path or a config file so the notebook runs
# on other machines.
model_path='/home/ec2-user/SageMaker/mariano/sentence_bert/pretrained/all-MiniLM-L6-v2/'


data_path = '/home/ec2-user/SageMaker/mariano/datasets/displaced_persons/files/labeled_data_latest_08072022.csv'



### Data

In [18]:
# Load the labelled items; the CSV is semicolon-separated.
df = pd.read_csv(data_path, sep=';')
# For every item id, resolve its file through tdmstudio and attach the
# combined title + text as a new column.
df['text and title'] = [
    tdmstudio.get_title_and_text(tdmstudio.get_filename(str(item_id)))
    for item_id in df['id']
]
df.head()

Unnamed: 0,id,label,text and title
0,1293118416,R,Farm Jobs Wanted.\n \n \n\...
1,1291398730,R,DP's Are Eager to Learn CWL Executive Reports....
2,1325747973,R,Teams Find DP's Look to Canada For Opportunity...
3,1287771491,R,DP Killed on Highway Checking Stalled Auto.\n ...
4,1291249197,R,MISS C. LENORE CHARLES.\n \n ...


### Model

In [3]:
# Instantiate the sentence-embedding model from the location in model_path.
model = SentenceTransformer(model_path)

In [26]:
# Embed every document. The column is passed as a plain list because encode()
# is documented for str / list-of-str input; a list also keeps the call
# independent of the DataFrame's index labels.
vecs = model.encode(df['text and title'].tolist())
# Map each item id to its embedding row. Iterating vecs yields one row per
# document, in the same order as df['id'].
item_representation = dict(zip(df['id'], vecs))

In [27]:
# Serialise the id -> embedding dictionary to embedding_file as a binary pickle.
# NOTE(review): pickle files should only be loaded back from trusted sources.
with open(embedding_file, 'wb') as writer:
    pickle.dump(item_representation, writer)

In [28]:
# Sanity check: number of ids in the mapping.
len(item_representation)

7282

In [31]:
# Peek at the first two keys (item ids) in insertion order.
list(item_representation)[:2]

[1293118416, 1291398730]

In [32]:
# Display the embedding stored for one specific item id.
item_representation[1291398730]

array([-7.16225356e-02, -5.21733798e-02,  6.85236603e-02,  4.31508981e-02,
       -1.41899623e-02,  5.62452935e-02,  1.75629102e-03,  2.84031127e-02,
       -4.70792409e-03,  9.36669856e-03, -9.11799818e-03, -2.64405590e-02,
       -5.20544350e-02, -5.87883964e-02, -2.63360385e-02,  1.00081263e-03,
        5.42179868e-02,  6.38037920e-02, -7.89129660e-02,  5.82289174e-02,
       -4.68280874e-02, -1.75768696e-02, -5.81918517e-03,  2.36339048e-02,
        2.01583188e-02,  1.38717433e-02, -9.83785167e-02, -2.17814054e-02,
       -2.66266502e-02,  4.09667082e-02,  1.23735042e-02,  3.10856793e-02,
       -3.98325250e-02,  3.68262939e-02,  4.85713594e-02,  1.06317773e-01,
        1.30630448e-01,  3.40445130e-03,  2.24135239e-02, -1.61397911e-03,
       -2.65774485e-02, -2.95852814e-02, -4.61929291e-02,  1.39465621e-02,
       -4.57473733e-02, -1.70626771e-03, -3.47435288e-02,  2.32431702e-02,
       -6.68971911e-02, -6.08621538e-02,  6.48430921e-03, -2.67984122e-02,
        3.40518653e-02,  