## CREATE TABLE AND POPULATE WITH EMBEDDINGS

In [4]:
import json
import os
import psycopg2
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
default_config

{'tfidf': {'max_ratio': '0.9,', 'min_ratio': '0.1,', 'max_features': 800},
 'word2vec': {'size': '300,',
  'window': '5,',
  'min_count': '1,',
  'workers': '4,',
  'sg': '1,',
  'hs': '0,',
  'negative': '5,',
  'epochs': 10}}

In [2]:
# Read the train/test CSV and pull the `clean_fundamentos` column into a
# plain Python list; this list is the corpus fed to the vectorizer.
csv_path = "/Users/mariavivo/repos/meri/jurisprudence-semantic-search/data/train_test_set.csv"
df = pd.read_csv(csv_path)

data = df["clean_fundamentos"].tolist()

In [5]:
# generate tf-idf embeddings (up to 800 dimensions)
max_ratio = 0.9  # max_df: drop terms present in more than 90% of documents
min_ratio = 0.1  # min_df: drop terms present in fewer than 10% of documents
max_dim = 800    # max_features: upper bound on the vocabulary size

vectorizer = TfidfVectorizer(max_df=max_ratio,
                             min_df=min_ratio,
                             max_features=max_dim)

# fit the vocabulary on the corpus and vectorize it in one pass;
# the result is a scipy sparse matrix with one row per document
tfidf_vectors = vectorizer.fit_transform(data)

# save resulting vectors and model
base_path = "/Users/mariavivo/repos/meri/jurisprudence-semantic-search/data"
vectors_path = os.path.join(base_path, "embeddings", "tfidf_embeddings.npy")
vectorizer_path = os.path.join(base_path, "models", "vectorizer.pickle")

# np.save / open() do not create missing directories
os.makedirs(os.path.dirname(vectors_path), exist_ok=True)
os.makedirs(os.path.dirname(vectorizer_path), exist_ok=True)

# NOTE: np.array() on a sparse matrix does not densify it; it wraps the
# matrix in a 0-d object array, which np.save then pickles. The on-disk
# format is kept as-is because the loading code reads it back with
# allow_pickle=True.
embeddings = np.array(tfidf_vectors)
np.save(vectors_path, embeddings)

# pickle.dump expects an open binary file object, not a path string
with open(vectorizer_path, "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [109]:
# load embeddings
# The .npy file holds a 0-d object array wrapping the sparse tf-idf matrix
# (it was written with np.save(np.array(sparse_matrix))). .item() is the
# documented way to extract the single wrapped object; .all() only happens
# to return it as a side effect of an object-dtype reduction.
# NOTE: allow_pickle=True unpickles arbitrary objects -- only load files
# produced by this notebook.
sparse_embeddings = csr_matrix(np.load(vectors_path, allow_pickle=True).item())
dense_embeddings = sparse_embeddings.toarray()

# format adequately to insert into db: executemany expects one parameter
# sequence per row, so every vector is wrapped in a single-element list
# that binds to the lone %s placeholder of the INSERT statement
dense_embedding_list = [[embedding.tolist()] for embedding in dense_embeddings]

In [17]:
# connect to db
# first execute in terminal: pg_ctl -D /opt/homebrew/var/postgresql@14 -o "-p 5433" start
#
# Connection settings can be overridden with the standard libpq environment
# variables (PGHOST, PGDATABASE, PGUSER, PGPASSWORD, PGPORT); the literals
# below are only the local-development fallbacks.
# NOTE(review): a password committed to source control should be rotated and
# supplied exclusively through PGPASSWORD.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "juris"),
    user=os.environ.get("PGUSER", "mariavivop"),
    password=os.environ.get("PGPASSWORD", "424242"),
    port=os.environ.get("PGPORT", "5433")
)

# cursor shared by the cells below to run statements on this connection
cursor = conn.cursor()

In [19]:
# create table
# Read the DDL script through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE: unless autocommit is enabled on the connection, the table only
# becomes permanent once conn.commit() is called.
schema_path = "/Users/mariavivo/repos/meri/jurisprudence-semantic-search/create_vectorial_db.sql"
with open(schema_path, "r") as schema_file:
    cursor.execute(schema_file.read())

In [102]:
# SQL statement to insert vectors into the table
sql = "INSERT INTO tfidf_vectors (vector) VALUES (%s)"

try:
    # Execute the SQL statement once per parameter set (one row per vector)
    cursor.executemany(sql, dense_embedding_list)

    # Commit the changes to the database
    conn.commit()
except psycopg2.Error:
    # A failed statement leaves the transaction aborted and every later
    # command on this connection would be rejected. Roll back (this also
    # discards any earlier uncommitted work) so the connection stays usable,
    # then surface the original error.
    conn.rollback()
    raise

In [None]:
# Sanity check: read back every row currently stored in tfidf_vectors.
verification_query = "SELECT * FROM tfidf_vectors"
cursor.execute(verification_query)
see_embedings = cursor.fetchall()

In [None]:
# Release the database resources: the cursor first, then its connection.
for db_resource in (cursor, conn):
    db_resource.close()