In [52]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

### Setting the environment variable `$LASER`

The LASER code uses the environment variable `LASER` to locate its installation directory, i.e. where its executables and models are stored.

In [26]:
# Show the current working directory of the kernel.
!pwd

/Users/alexander/PycharmProjects/MDP/embedding


In [27]:
# Point $LASER at the local LASER checkout (the embed log below loads its models from $LASER/models).
# NOTE: this is an absolute, machine-specific path -- adjust it when running elsewhere.
%env LASER=/Users/alexander/PycharmProjects/MDP/embedding/LASER

env: LASER=/Users/alexander/PycharmProjects/MDP/embedding/LASER


### Directly creating embeddings using bash

In [16]:
# Embed the sentences in data/in_test.csv and write the raw vectors to data/out_test.bin.
# NOTE(review): this relative script path differs from the "LASER/tasks/embed/embed.sh" used in
# df2embed below, so the two cells presumably ran from different working directories -- confirm.
!tasks/embed/embed.sh data/in_test.csv data/out_test.bin

2023-04-12 16:22:44,756 | INFO | embed | spm_model: /Users/alexander/PycharmProjects/MDP/embedding/LASER/models/laser2.spm
2023-04-12 16:22:44,756 | INFO | embed | spm_cvocab: /Users/alexander/PycharmProjects/MDP/embedding/LASER/models/laser2.cvocab
2023-04-12 16:22:44,756 | INFO | embed | loading encoder: /Users/alexander/PycharmProjects/MDP/embedding/LASER/models/laser2.pt
2023-04-12 16:22:45,212 | INFO | preprocess | SPM processing in_test.csv  
2023-04-12 16:22:45,271 | INFO | embed | encoding /var/folders/sk/440wdj7s2csb8dhnqrl5p1n80000gn/T/tmp_vhymief/spm to data/out_test.bin
2023-04-12 16:22:45,403 | INFO | embed | encoded 7 sentences in 0s


In [19]:
# Each embedding is stored as `dim` consecutive float32 values in one flat
# binary file; read everything and fold it into one row per sentence.
dim = 1024
X = np.fromfile("data/out_test.bin", dtype=np.float32)
X.resize((X.shape[0] // dim, dim))

In [20]:
# Euclidean (L2) distance between the embedding in row 0 and selected other
# rows; the labels describe how each compared sentence relates to row 0.
anchor = X[0]
print(f"Dog and no dog: {np.linalg.norm(anchor - X[1])}")
print(f"Dog, but different language: {np.linalg.norm(anchor - X[2])}")
print(f"Different: {np.linalg.norm(anchor - X[4])}")
print(f"Different, but same language: {np.linalg.norm(anchor - X[6])}")

Dog and no dog: 0.17333778738975525
Dog, but different language: 0.07015577703714371
Different: 0.5134618282318115
Different, but same language: 0.39835819602012634


### Wrapper to create embeddings in python

In [30]:
def df2embed(df, dim=1024, temp_dir="temp"):
    """Embed the single column of ``df`` by shelling out to LASER's ``embed.sh``.

    The column is written to a header-less CSV inside ``temp_dir``, the script
    ``LASER/tasks/embed/embed.sh`` (resolved relative to the current working
    directory) is run on it, and the raw float32 output file is loaded back as
    a 2-D array. The intermediate files are left in ``temp_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly one column holding the texts to embed.
    dim : int, default 1024
        Number of float32 values per embedding row in the script's output.
    temp_dir : str, default "temp"
        Directory for the intermediate CSV and ``.bin`` files; created if it
        does not exist.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_embeddings, dim)``.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly one column, or if the size of the
        script's output is not a multiple of ``dim``.
    subprocess.CalledProcessError
        If ``embed.sh`` exits with a non-zero status.
    OSError
        If ``embed.sh`` cannot be launched (e.g. it is missing or not executable).
    """
    # Local imports keep the function self-contained; the notebook's import
    # cell does not load these modules.
    import subprocess
    import uuid

    # Validate before touching the filesystem.
    n_cols = len(df.columns)
    if n_cols != 1:
        raise ValueError(
            f"df2embed expects a DataFrame with exactly one column, got {n_cols}"
        )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(temp_dir, exist_ok=True)

    # A second-resolution timestamp alone collides when two calls land in the
    # same second, so a short random suffix makes the file names unique.
    stamp = int(datetime.timestamp(datetime.now()))
    run_id = f"{stamp}_{uuid.uuid4().hex[:8]}"
    csv_path = os.path.join(temp_dir, f"{run_id}.csv")
    bin_path = os.path.join(temp_dir, f"{run_id}_emb.bin")

    # NOTE(review): to_csv quotes fields that contain commas, quotes or line
    # breaks; presumably embed.sh treats every physical line as one sentence,
    # so such rows may be embedded with quote characters or split across
    # several embeddings -- confirm against the row count of the result.
    df.to_csv(csv_path, header=False, index=False)

    # Argument list (no shell) so paths are passed verbatim; check=True turns a
    # failed run into an error instead of a confusing file-read failure below.
    script = os.path.join("LASER", "tasks", "embed", "embed.sh")
    subprocess.run([script, csv_path, bin_path], check=True)

    emb = np.fromfile(bin_path, dtype=np.float32)
    if emb.size % dim != 0:
        # The original resize() would silently drop the trailing values.
        raise ValueError(
            f"{bin_path} holds {emb.size} float32 values, "
            f"which is not a multiple of dim={dim}"
        )
    return emb.reshape(emb.size // dim, dim)


Testing it now on the Kaggle dataset without any special data preprocessing. Preprocessing could simply be added after creating the dataframe.

In [40]:
# Keep only the "Text" column, as a one-column DataFrame.
a = pd.read_csv("/Users/alexander/PycharmProjects/MDP/Kaggle_dataset/Kaggle_dataset.csv")[["Text"]]

In [43]:
# Embed the whole "Text" column; per the log below this run encoded 30157 sentences in 189 s.
tweets_embeddings = df2embed(a)

2023-04-19 12:21:42,661 | INFO | embed | spm_model: /Users/alexander/PycharmProjects/MDP/embedding/LASER/models/laser2.spm
2023-04-19 12:21:42,661 | INFO | embed | spm_cvocab: /Users/alexander/PycharmProjects/MDP/embedding/LASER/models/laser2.cvocab
2023-04-19 12:21:42,661 | INFO | embed | loading encoder: /Users/alexander/PycharmProjects/MDP/embedding/LASER/models/laser2.pt
2023-04-19 12:21:43,149 | INFO | preprocess | SPM processing 1681899700.csv  
2023-04-19 12:21:43,688 | INFO | embed | encoding /var/folders/sk/440wdj7s2csb8dhnqrl5p1n80000gn/T/tmp4bw68c_q/spm to temp/1681899700_emb.bin
2023-04-19 12:22:45,953 | INFO | embed | encoded 10000 sentences
2023-04-19 12:23:48,798 | INFO | embed | encoded 20000 sentences
2023-04-19 12:24:51,885 | INFO | embed | encoded 30000 sentences
2023-04-19 12:24:53,251 | INFO | embed | encoded 30157 sentences in 189s


Created ~30k embeddings locally in about 3 minutes (189 s according to the log above).

In [51]:
# Sanity check: number of embedding rows and values per row.
print(tweets_embeddings.shape)

(30157, 1024)
