In [13]:
import os
from database_connector.qdrant_connector import connect_to_qdrant
from dotenv import load_dotenv
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Read the Qdrant endpoint and API key from the environment, then connect.
qdrant_uri = os.getenv("QDRANT_URI")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
qdrant_client = connect_to_qdrant(qdrant_uri, qdrant_api_key)

Successfully connected to Qdrant!


In [14]:
from database_connector.mongodb_connector import connect_to_mongodb, get_all_documents, get_collection, get_database

# Connect to MongoDB using the URI taken from the environment.
mongo_uri = os.getenv("MONGO_URI")
mongo_client = connect_to_mongodb(mongo_uri)

# Resolve the database, then the collection, both named via the environment.
database_name = os.getenv("DATABASE_NAME")
database = get_database(mongo_client, database_name)
collection_name = os.getenv("LSA_COLLECTION_NAME")
collection = get_collection(database, collection_name)

# Fetch every document from the collection and report how many came back.
all_documents = get_all_documents(collection)
document_count = len(all_documents)
print(f"Total documents: {document_count}")

Pinged your deployment. You successfully connected to MongoDB!
Database 'Film' connected successfully!
Collection 'lsa_svd_preprocessed' connected successfully!
Total documents: 9504


In [15]:
def normalize_text_field(doc):
    """Build the single text string that represents one document.

    The result is the lower-cased director names, writer names and cleaned
    description, separated by single spaces. Fields that are missing, None
    or empty are skipped, so the result never carries stray leading,
    trailing or doubled spaces.

    Args:
        doc: Mapping with a required 'metadata' mapping (optional
            'directors' / 'writers' entries holding comma-separated names)
            and an optional 'cleaned_description' string.

    Returns:
        The combined text, or "" when every field is empty.

    Raises:
        KeyError: If `doc` has no 'metadata' key.
    """
    def _flatten_names(raw_names):
        # A None/empty field contributes nothing to the text.
        if not raw_names:
            return ""
        cleaned = (name.strip().lower() for name in raw_names.split(','))
        # Drop empty pieces (e.g. from "A,,B") so no double spaces appear.
        return " ".join(name for name in cleaned if name)

    metadata = doc['metadata']
    directors = _flatten_names(metadata.get('directors'))
    writers = _flatten_names(metadata.get('writers'))

    # Treat a missing or None description as an empty string.
    description = (doc.get('cleaned_description') or "").lower()

    # Join only the non-empty parts so absent fields leave no padding behind.
    return " ".join(part for part in (directors, writers, description) if part)

# Number of documents to embed. Kept small for a quick trial run; set to
# None to build the corpus from every document (a None slice bound keeps
# the whole list).
CORPUS_SAMPLE_SIZE = 10

# One normalized text per selected document, in the same order as
# `all_documents`, so corpus[i] corresponds to all_documents[i].
corpus = [normalize_text_field(doc) for doc in all_documents[:CORPUS_SAMPLE_SIZE]]

print(f"Corpus length: {len(corpus)}")
print('Test some samples: ')
corpus[:5]

Corpus length: 10
Test some samples: 


['bertram bracken mary elizabeth braddon mary murillo mrs. henry wood isabel carlisl mistakenli believ husband richard love barbara hare leav two children noth correct report kill train wreck richard believ widow marri barbara month isabel long see children disguis get job gover son becom ill call mother isabel throw disguis goe comfort die arm discov isabel boy richard immedi forgiv left children isabel forgiv soon die grief',
 'frank lloyd leo ditrichstein fanny hatton frederic hatton j.e. nash ethel warren return studi europ make debut new york opera compani jean paurel world famou bariton star carlo sonino also member compani fall love ethel warn becom infatu amor singer paurel becom enchant ethel arous jealousi compani prima donna sabotini first act ethel hear paurel suffer attack throat rush backstag carlo urg sabotini follow make scene whereupon ethel indignantli announc engag paurel paurel unabl perform second act carlo take place vault stardom paurel diagnos never abl sing bia

# Bag Of Words

In [16]:
# NOTE(review): disabled exploratory cell. When enabled it would fit a
# standalone BagOfWords model on `corpus`; nothing here runs while it stays
# commented out, so `bow` and `bow_matrix` are never defined.
# from models.bow import BagOfWords

# print(f"Small corpus length: {len(corpus)}")

# # Initialize model
# bow = BagOfWords(
#     min_word_freq=1,
#     max_features=100,
#     tokenizer='whitespace'
# )

# bow_matrix = bow.fit_transform(corpus)

In [17]:
# NOTE(review): disabled. This loop reads `bow` and `bow_matrix`, which are
# only assigned in commented-out code, so it cannot run as the notebook stands.
# for i, doc in enumerate(corpus):
#     print(f"\nDocument {i+1}: '{doc}'")
#     print(f"Tokens: {bow.get_document_tokens(doc)}")
#     print(f"Vector: {bow_matrix[i]}")

In [18]:
# NOTE(review): disabled. `bow` is only assigned in commented-out code.
# bow.print_vocabulary_info()

# Dimensionality Reduction by SVD

In [19]:
# NOTE(review): disabled exploratory cell. It reads `bow_matrix` (only
# assigned in commented-out code) and `small_documents`, which is not defined
# anywhere in the visible notebook; both need fixing before re-enabling.
# from models.dim_reduc import SVDModel

# dim_reduc_model = SVDModel(n_components=50, random_state=11)
# matrix =  dim_reduc_model.fit_transform(bow_matrix)
# matrix.shape, bow_matrix.shape, small_documents.shape

# Final Model

In [20]:
from embedding.bow_svd_model.final_model import BOW_SVD_Embedding

# Settings forwarded to the bag-of-words stage of the embedding model.
bow_settings = dict(
    min_word_freq=1,
    max_features=None,
    tokenizer='whitespace',
)

# Settings forwarded to the dimensionality-reduction stage.
# NOTE(review): 1830 components is far more than the 10-document sample used
# above; verify that the reducer tolerates this. TODO confirm.
dim_reduc_settings = dict(n_components=1830)

model_config = dict(
    bow_args=bow_settings,
    dim_reduc_args=dim_reduc_settings,
)

# A missing section is passed on as None.
model = BOW_SVD_Embedding(
    bow_args=model_config.get('bow_args'),
    dim_reduc_args=model_config.get('dim_reduc_args'),
)

In [21]:
# Fit `model` on the corpus and transform it in a single call.
embedding_reduc_matrix = model.fit_transform(corpus)

# Later cells pair row i with corpus[i] and all_documents[i] by index, so a
# row-count mismatch would silently attach vectors to the wrong documents.
assert len(embedding_reduc_matrix) == len(corpus), (
    f"expected {len(corpus)} embeddings, got {len(embedding_reduc_matrix)}"
)
len(embedding_reduc_matrix)

10

In [22]:
import pickle
# Directory (relative to the working directory) that holds pickled models.
SAVED_MODEL_ROOT = "./embedding/trained_models/"
os.makedirs(SAVED_MODEL_ROOT, exist_ok=True)

# The file is named after the model class, e.g. "<ClassName>.pkl".
model_filename = f"{BOW_SVD_Embedding.__name__}.pkl"
model_path = os.path.join(SAVED_MODEL_ROOT, model_filename)

# Serialize the fitted model; the context manager closes the file afterwards.
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

In [23]:
from preprocessing.data_models import QdrantPoint

# Build one point per embedding row. Row i is paired with corpus[i] and with
# the metadata of all_documents[i], so all three must share the same order.
list_of_points = [
    QdrantPoint(
        text=corpus[row],
        vector=embedding_reduc_matrix[row],
        metadata=all_documents[row]['metadata'],
    )
    for row in range(len(embedding_reduc_matrix))
]

list_of_points

[QdrantPoint(id='0feff71e-1b18-4902-b156-3175eac71153', text='bertram bracken mary elizabeth braddon mary murillo mrs. henry wood isabel carlisl mistakenli believ husband richard love barbara hare leav two children noth correct report kill train wreck richard believ widow marri barbara month isabel long see children disguis get job gover son becom ill call mother isabel throw disguis goe comfort die arm discov isabel boy richard immedi forgiv left children isabel forgiv soon die grief', metadata=FilmMetadata(film_name='East Lynne', image_link=HttpUrl('https://m.media-amazon.com/images/M/MV5BM2E3MDgzNjItZjMzYi00ZWZiLTgwMWMtNWM2ZDAwOTg0MzBhXkEyXkFqcGc@.jpg'), is_adult=0, start_year=1916, runtime_minutes=50, genres='Drama', rating=5.5, votes=51, directors='Bertram Bracken', writers='Mary Elizabeth Braddon, Mary Murillo, Mrs. Henry Wood'), vector=[2.663046548657961, 0.11267661880407047, 1.8767893606708848, 1.7631481455839306, 0.8192620800186554, 1.9602964836172254, 0.12196189580306395, 7.9

In [24]:
# NOTE(review): upload step is disabled. When enabled it would pass
# `list_of_points` to insert_points_batch_to_qdrant for the collection named
# by the QDRANT_BOW_COLLECTION environment variable; confirm that variable is
# set before re-enabling.
# from database_connector.qdrant_connector import insert_points_batch_to_qdrant

# insert_points_batch_to_qdrant(
#     qdrant_client=qdrant_client,
#     collection_name=os.getenv("QDRANT_BOW_COLLECTION"),
#     qdrant_points=list_of_points
# )   