### IMPORTS

In [1]:
import spacy
import numpy as np
import sys
import pandas as pd
import os
import pickle

### INPUTS AND OUTPUTS

In [2]:
# The repository root is read from a small config file so that the project's
# own `lib` package can be imported from this notebook.
with open('../config/repository_path.txt', 'r') as reader:
    repository_path = reader.read().strip()
# Guarded so that re-running this cell does not keep appending the same entry.
if repository_path not in sys.path:
    sys.path.append(repository_path)

# Project-local modules: these imports only resolve once `repository_path`
# is on sys.path, which is why they live here rather than with the other imports.
from lib import tdmstudio
from lib import nlp

# ------ #
# OUTPUT #
# ------ #
# The embeddings directory must already exist; it is not created here.
embedding_path = os.path.join(repository_path, 'embeddings')
assert os.path.exists(embedding_path), 'output directory not found: %s' % embedding_path
embedding_file = os.path.join(embedding_path, 'item_representation_glove.pickle')

# ----- #
# INPUT #
# ----- #
# Alternative input kept for reference (currently unused):
# data_path = '/home/ec2-user/SageMaker/mariano/datasets/displaced_persons/files/labeled_data_latest_08072022.csv'
# Root of the corpus: one sub-folder per group, one file per document.
DATAPATH = '/home/ec2-user/SageMaker/mariano/datasets/20news-18828/files/'


### DATA

In [4]:
def read_text(path):
    """Return the whole content of ``path`` decoded as latin-1.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the content has been read.
    """
    with open(path, 'r', encoding='latin-1') as reader:
        return reader.read()


# One (folder, file name) pair per document found exactly one level below DATAPATH.
entries = [
    (folder, name)
    for folder in os.listdir(DATAPATH)
    for name in os.listdir(os.path.join(DATAPATH, folder))
]
files = [os.path.join(DATAPATH, folder, name) for folder, name in entries]
assert all(os.path.isfile(file_) for file_ in files), 'expected only regular files under DATAPATH/<folder>/'

# Ids have the form "<folder>/<file name>". They are built from the path
# components directly, so they do not depend on the OS path separator.
ids = ['/'.join(entry) for entry in entries]
texts = [read_text(file_) for file_ in files]

# One row per document: its id and its raw text.
df = pd.DataFrame({'id': ids, 'text': texts})
df

Unnamed: 0,id,text
0,rec.motorcycles/104702,From: egreen@east.sun.com (Ed Green - Pixel Cr...
1,rec.motorcycles/104863,From: vech@Ra.MsState.Edu (Craig A. Vechorik)\...
2,rec.motorcycles/105203,From: asphaug@lpl.arizona.edu (Erik Asphaug x2...
3,rec.motorcycles/104694,From: nelson@seahunt.imat.com (Michael Nelson)...
4,rec.motorcycles/105132,From: jjb@dtc.hp.com (Jim Brewer)\nSubject: Re...
...,...,...
18823,comp.graphics/38943,From: mharring@cch.coventry.ac.uk (MARTIN)\nSu...
18824,comp.graphics/38516,From: capelli@vnet.IBM.COM (Ron Capelli)\nSubj...
18825,comp.graphics/38292,From: spl@ivem.ucsd.edu (Steve Lamont)\nSubjec...
18826,comp.graphics/38219,From: lewism@aix.rpi.edu (Michael C. Lewis)\nS...


### MODEL

In [5]:
# Load the spaCy pipeline, skipping the components listed in `disable`.
# NOTE(review): this rebinds `nlp`, which the configuration cell also imports
# via `from lib import nlp` — confirm that module is not needed afterwards.
nlp = spacy.load(
    'en_core_web_lg',
    disable=['textcat', 'ner', 'parser'],
)

In [6]:
# Embed every document with the loaded spaCy pipeline. `nlp.pipe` streams the
# texts through the pipeline in batches instead of issuing one `nlp(text)` call
# per document; each element is the resulting Doc's `.vector`, in the same
# order as df['text'].
vecs = [doc.vector for doc in nlp.pipe(df['text'])]


In [7]:
# Map each document id to the vector computed at the same position in `vecs`.
item_representation = {}
for position, item_id in enumerate(df['id']):
    item_representation[item_id] = vecs[position]

In [8]:
# Display the path the embeddings are written to by the next cell.
embedding_file

'/home/ec2-user/SageMaker/mariano/repositories/train-test-split/all_ng_simulations/embeddings/item_representation_glove.pickle'

In [9]:
# Persist the id -> vector mapping as a pickle file at `embedding_file`.
with open(embedding_file, 'wb') as sink:
    pickle.Pickler(sink).dump(item_representation)

In [11]:
# Peek at the first two ids stored in the mapping.
list(item_representation.keys())[:2]

['rec.motorcycles/104702', 'rec.motorcycles/104863']

In [15]:
# Spot-check: display the vector stored for one specific document id.
item_representation['rec.motorcycles/104863']

array([-3.02651711e-02,  1.57828659e-01, -1.07036486e-01, -3.81977670e-02,
        4.59820591e-02,  2.15224102e-02, -6.24900870e-03, -1.22590661e-01,
       -3.13916206e-02,  1.80259597e+00, -1.76963836e-01,  4.34588939e-02,
        6.99802488e-02, -6.11593463e-02, -1.14718504e-01, -5.36652282e-02,
       -5.34182824e-02,  8.42997372e-01, -1.29043907e-01, -2.06496771e-02,
        1.88927799e-02, -2.70951539e-02, -2.39264760e-02, -3.79296690e-02,
        2.84314957e-02,  3.96626815e-02, -1.02900356e-01, -2.74022687e-02,
        1.97455026e-02, -2.50539239e-02,  2.41338726e-04,  8.07018578e-02,
       -4.54680547e-02,  7.76696205e-02,  2.26218421e-02, -5.13904653e-02,
       -3.70967537e-02,  1.51937138e-02, -1.00449257e-01, -5.96676618e-02,
        2.54643545e-03,  4.87659611e-02,  1.62375830e-02, -6.87728152e-02,
        3.61690670e-02,  2.57867724e-02, -1.00077562e-01,  7.82838278e-03,
        3.71154621e-02,  3.45281628e-03, -6.61983564e-02,  6.43182844e-02,
       -1.44191701e-02, -