### IMPORTS

In [1]:
import numpy as np
import pandas as pd
import os
from sentence_transformers import SentenceTransformer
import sys
import pickle

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

### INPUTS

In [8]:
# Read the repository root from a one-line config file (path is relative to the
# notebook's working directory) and put it on sys.path.
with open('../config/repository_path.txt', 'r') as reader:
    repository_path = reader.read().strip()
sys.path.append(repository_path)

# Project-local modules, presumably resolved through the sys.path entry added above.
from lib import tdmstudio
from lib import nlp

# ------ #
# OUTPUT #
# ------ #
embedding_path = os.path.join(repository_path, 'embeddings')
# Must be an existing *directory*: a regular file with this name would pass a plain
# existence check and only fail when the pickle is written, after the encoding step.
assert os.path.isdir(embedding_path), 'Embedding directory not found: ' + embedding_path
embedding_file = os.path.join(embedding_path, 'item_representation_sentence_bert.pickle')

# ----- #
# INPUT #
# ----- #
# Alternative input kept for reference (not used):
# data_path = '/home/ec2-user/SageMaker/mariano/datasets/displaced_persons/files/labeled_data_latest_08072022.csv'
DATAPATH = '/home/ec2-user/SageMaker/mariano/datasets/20news-18828/files/'
model_path='/home/ec2-user/SageMaker/mariano/sentence_bert/pretrained/all-MiniLM-L6-v2/'

# Fail fast on a wrong machine/mount instead of part-way through the notebook.
assert os.path.isdir(DATAPATH), 'Dataset directory not found: ' + DATAPATH
assert os.path.isdir(model_path), 'Model directory not found: ' + model_path


### Data

In [3]:
# Enumerate the corpus, laid out as <DATAPATH>/<folder>/<file>.
# Top-level entries that are not directories (stray files) are skipped so that
# os.listdir is never called on a regular file.
entries = [
    (folder, name)
    for folder in os.listdir(DATAPATH)
    if os.path.isdir(os.path.join(DATAPATH, folder))
    for name in os.listdir(os.path.join(DATAPATH, folder))
]

files = [os.path.join(DATAPATH, folder, name) for folder, name in entries]
not_files = [path for path in files if not os.path.isfile(path)]
assert not not_files, 'Unexpected non-file entries: {}'.format(not_files[:5])

# Document id is "<folder>/<file>", built from the path components directly
# rather than by splitting the joined path on a hard-coded separator.
ids = ['/'.join((folder, name)) for folder, name in entries]

# Read each document inside a context manager so its handle is closed right away
# instead of leaving thousands of open files to the garbage collector.
texts = []
for path in files:
    with open(path, 'r', encoding='latin-1') as handle:
        texts.append(handle.read())

df = pd.DataFrame({'id': ids, 'text': texts})
df

Unnamed: 0,id,text
0,rec.motorcycles/104702,From: egreen@east.sun.com (Ed Green - Pixel Cr...
1,rec.motorcycles/104863,From: vech@Ra.MsState.Edu (Craig A. Vechorik)\...
2,rec.motorcycles/105203,From: asphaug@lpl.arizona.edu (Erik Asphaug x2...
3,rec.motorcycles/104694,From: nelson@seahunt.imat.com (Michael Nelson)...
4,rec.motorcycles/105132,From: jjb@dtc.hp.com (Jim Brewer)\nSubject: Re...
...,...,...
18823,comp.graphics/38943,From: mharring@cch.coventry.ac.uk (MARTIN)\nSu...
18824,comp.graphics/38516,From: capelli@vnet.IBM.COM (Ron Capelli)\nSubj...
18825,comp.graphics/38292,From: spl@ivem.ucsd.edu (Steve Lamont)\nSubjec...
18826,comp.graphics/38219,From: lewism@aix.rpi.edu (Michael C. Lewis)\nS...


### Model

In [6]:
# Load the SentenceTransformer model from the filesystem path held in model_path.
model = SentenceTransformer(model_path)

In [9]:
# Ids become dictionary keys below, so duplicates would silently overwrite each other.
assert df['id'].is_unique, 'Duplicate document ids in df'

# Pass a plain list of strings: handing encode() a pandas Series only works while
# the Series has the default 0..n-1 index, because the input is indexed positionally.
vecs = model.encode(df['text'].tolist())
assert len(vecs) == len(df), 'encode() returned a different number of rows than df'

# Map each document id to its embedding (row ix of vecs belongs to row ix of df).
item_representation = {id_: vecs[ix, :] for ix, id_ in enumerate(df['id'])}

In [10]:
# Show the destination path of the pickle file before writing it.
embedding_file

'/home/ec2-user/SageMaker/mariano/repositories/train-test-split/all_ng_simulations/embeddings/item_representation_sentence_bert.pickle'

In [11]:
# Persist the id -> embedding mapping to disk as a pickle file.
with open(embedding_file, 'wb') as sink:
    pickle.dump(item_representation, sink)

In [12]:
# Number of ids that received an embedding.
len(item_representation)

18828

In [13]:
# First two keys, as a quick check of the id format.
list(item_representation)[:2]

['rec.motorcycles/104702', 'rec.motorcycles/104863']

In [17]:
# Inspect the embedding stored for one example id.
item_representation['rec.motorcycles/104863']

array([-1.26719177e-01, -1.23524433e-02,  6.10969104e-02, -2.43054610e-02,
        1.86887030e-02,  6.51907583e-04,  2.67664138e-02, -2.45015584e-02,
        3.13243568e-02, -3.73944151e-03, -3.84143591e-02,  6.61230925e-03,
        8.36563483e-02, -5.59335351e-02, -1.07326441e-01,  8.21845308e-02,
        2.84472927e-02, -4.21651602e-02, -7.10067898e-02,  5.24298809e-02,
        4.96067591e-02,  6.12243861e-02,  5.08911572e-02,  5.34281209e-02,
       -6.78168982e-02, -1.79462098e-02, -7.76397884e-02, -8.82810261e-03,
       -3.31252627e-02, -9.65857040e-03,  1.98075711e-03,  6.23686761e-02,
        2.58533489e-02,  8.84562917e-03,  1.01139911e-01,  3.97649594e-02,
        1.32406717e-02, -2.42636278e-02, -3.56027012e-04, -1.57325566e-02,
       -1.82134635e-03,  1.86521299e-02,  3.98964109e-03,  5.19698188e-02,
        4.24822904e-02,  1.00480614e-03, -1.78762176e-03, -3.00311446e-02,
       -4.75977063e-02,  8.74330639e-04, -4.42991443e-02,  3.17157134e-02,
        2.97802351e-02, -