In [241]:
import os
import requests
import json

from dotenv import dotenv_values
import numpy as np
import weaviate
import weaviate.classes as wvc
import nltk
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras import Sequential

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [229]:
config = dotenv_values("../.env")

In [230]:
# function that reads json file
def read_json(file):
    with open(file, "r") as f:
        data = json.load(f)
    return data

In [231]:
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),  # Set this environment variable
)

In [232]:
is_regenerate = True

if is_regenerate:
    client.collections.delete("Subject")

In [233]:
is_subject_collection = False
subjects = None

for collection in list(client.collections.list_all().keys()):
    if collection == "Subject":
        is_subject_collection = True

if is_subject_collection:
    subjects = client.collections.get("Subject")
else:
    subjects = client.collections.create(
        "Subject",
        vectorizer_config=wvc.config.Configure.Vectorizer.none(),
        vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
            distance_metric=wvc.config.VectorDistances.COSINE  # select prefered distance metric
        ),
    )

In [234]:
is_data_loaded = False

for item in subjects.iterator():
    is_data_loaded = True
    break

In [235]:
if not is_data_loaded:
    subject_objs = []

    for filename in os.listdir("./document_embeddings/word2vec_mean"):
        if filename.endswith(".json"):
            subject_embedding = read_json(
                f"./document_embeddings/word2vec_mean/{filename}"
            )
            subject_code = filename.split(".")[0]

            subject_objs.append(
                wvc.data.DataObject(
                    properties={
                        "subjectCode": subject_code,
                    },
                    vector=subject_embedding,
                )
            )

    subjects.data.insert_many(subject_objs)

In [261]:
def get_one_hot_vector(data_point_index, vocab_size):
    one_hot_vector = np.zeros(vocab_size)
    one_hot_vector[data_point_index] = 1
    return one_hot_vector

In [264]:
def get_query_embedding(query: str):
  tokens = nltk.word_tokenize(query)
  tokens = [word.lower() for word in tokens if word.isalpha()] #  and len(word) > 1
  vocab = {"<pad>": 0} | {word: i+1 for i, word in enumerate(set(tokens))}
  vocab_size = len(vocab)
  
  print(tokens)
  
  train_samples = []

  window_size = 2

  for i in range(window_size, len(tokens) - window_size):
      for j in range(1, window_size + 1):
          train_samples.append((tokens[i], tokens[i-j]))
          train_samples.append((tokens[i], tokens[i+j]))
          
  print("train_samples", train_samples)
          
  x_train = []
  y_train = []

  for word, target_word in train_samples:
    x_train.append(vocab[word])
    y_train.append(get_one_hot_vector(vocab[target_word], vocab_size))
    
  print(x_train)
  print(y_train)
    
  x_train = np.asarray(x_train)
  y_train = np.asarray(y_train)
  
  # Build the Word2Vec model using TensorFlow
  embedding_dim = 100  # Adjust the dimensionality as needed

  model = Sequential()
  model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
  model.add(Flatten())
  model.add(Dense(vocab_size, activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam')
  
  # Train the Word2Vec model
  num_epochs = 10  # Adjust the number of epochs as needed

  model.fit(x_train, y_train, epochs=num_epochs)
  
  word_embeddings = model.layers[0].get_weights()[0]
      
  return np.mean(word_embeddings, axis=0)

In [265]:
query_embedding = get_query_embedding("Machine Learning algorithm implementation with presentation at the end of semester")

['machine', 'learning', 'algorithm', 'implementation', 'with', 'presentation', 'at', 'the', 'end', 'of', 'semester']
train_samples [('algorithm', 'learning'), ('algorithm', 'implementation'), ('algorithm', 'machine'), ('algorithm', 'with'), ('implementation', 'algorithm'), ('implementation', 'with'), ('implementation', 'learning'), ('implementation', 'presentation'), ('with', 'implementation'), ('with', 'presentation'), ('with', 'algorithm'), ('with', 'at'), ('presentation', 'with'), ('presentation', 'at'), ('presentation', 'implementation'), ('presentation', 'the'), ('at', 'presentation'), ('at', 'the'), ('at', 'with'), ('at', 'end'), ('the', 'at'), ('the', 'end'), ('the', 'presentation'), ('the', 'of'), ('end', 'the'), ('end', 'of'), ('end', 'at'), ('end', 'semester')]
[10, 10, 10, 10, 11, 11, 11, 11, 1, 1, 1, 1, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 4, 4, 4, 4]
[array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0.,

In [266]:
query_embedding

array([-0.00966912,  0.01076559,  0.00152563, -0.02483802, -0.0232534 ,
        0.00537849, -0.0041363 ,  0.00763978,  0.0116906 ,  0.00137249,
        0.00061048, -0.00494826, -0.01506594,  0.00802219, -0.0135569 ,
       -0.01018921, -0.00572379, -0.00051977,  0.0109072 ,  0.0001546 ,
        0.01229581,  0.00207729,  0.0148156 , -0.00757831,  0.00922092,
       -0.00026447, -0.00368129, -0.00752631,  0.00791247, -0.00446834,
        0.00412131,  0.01322216,  0.01811293, -0.00446214,  0.00697161,
       -0.00640676,  0.00641331,  0.00771248,  0.0053491 ,  0.00446344,
       -0.00267432, -0.00848032, -0.00351101,  0.00166317,  0.00916756,
        0.00542568, -0.00524739, -0.01105762, -0.00564059,  0.00335803,
        0.00657216,  0.00056832, -0.01362528, -0.00043372, -0.00739061,
        0.00240722, -0.00163874, -0.01134397,  0.00399193, -0.01872429,
        0.00622769, -0.00318243, -0.00153846,  0.02161304, -0.01646972,
        0.00713221,  0.00656816,  0.00596208,  0.00036124, -0.00

In [268]:
response = subjects.query.near_vector(
    near_vector=query_embedding.tolist(),
    limit=5,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('48ae18a4-9d4f-419c-a7f6-f4a039f432e6'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.6560970544815063, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '43022'}, references=None, vector={}, collection='Subject'), Object(uuid=_WeaviateUUIDInt('6012e083-27eb-40ec-b503-bd9a18add807'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.6346902251243591, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '42095'}, references=None, vector={}, collection='Subject'), Object(uuid=_WeaviateUUIDInt('22492e6d-90d1-4b4e-ab63-348b29fafa56'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.6285211443901062, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '48572'}, refe

In [272]:
[x.properties["subjectCode"] for x in response.objects]

['43022', '42095', '48572', '31266', '41380']

In [237]:
# client.close()  # Close client gracefully