In [5]:
import os
from database_connector.qdrant_connector import connect_to_qdrant
from dotenv import load_dotenv
# Read the local .env file into the process environment so that the
# connection settings never have to be written into the notebook itself.
load_dotenv()

# os.getenv returns None for an unset variable, so a missing setting is
# passed on to connect_to_qdrant as None instead of failing on this line.
qdrant_client = connect_to_qdrant(
    os.getenv("QDRANT_URI"),
    os.getenv("QDRANT_API_KEY"),
)

Successfully connected to Qdrant!


In [6]:
from database_connector.mongodb_connector import connect_to_mongodb, get_all_documents, get_collection, get_database

# Resolve every connection setting up front; each is None when the
# corresponding environment variable is not set.
mongo_uri = os.getenv("MONGO_URI")
database_name = os.getenv("DATABASE_NAME")
collection_name = os.getenv("LSA_PRE_COLLECTION")

# Walk client -> database -> collection, then pull the documents into memory.
mongo_client = connect_to_mongodb(mongo_uri)
database = get_database(mongo_client, database_name)
collection = get_collection(database, collection_name)
all_documents = get_all_documents(collection)
print(f"Total documents: {len(all_documents)}")

Pinged your deployment. You successfully connected to MongoDB!
Database 'Film' connected successfully!
Collection 'lsa_svd_preprocessed' connected successfully!
Total documents: 9504


In [7]:
# Peek at the first record to see which fields each document carries.
all_documents[0]

{'_id': ObjectId('6839edbcdba46ba351843140'),
 'id': 'tt0006621',
 'cleaned_description': 'isabel carlisl mistakenli believ husband richard love barbara hare leav two children noth correct report kill train wreck richard believ widow marri barbara month isabel long see children disguis get job gover son becom ill call mother isabel throw disguis goe comfort die arm discov isabel boy richard immedi forgiv left children isabel forgiv soon die grief',
 'metadata': {'film_name': 'East Lynne',
  'image_link': 'https://m.media-amazon.com/images/M/MV5BM2E3MDgzNjItZjMzYi00ZWZiLTgwMWMtNWM2ZDAwOTg0MzBhXkEyXkFqcGc@.jpg',
  'is_adult': 0,
  'start_year': 1916,
  'runtime_minutes': 50,
  'genres': 'Drama',
  'rating': 5.5,
  'votes': 51,
  'directors': 'Bertram Bracken',
  'writers': 'Mary Elizabeth Braddon, Mary Murillo, Mrs. Henry Wood'},
 'original_description': 'When Isabel Carlisle mistakenly believes that her husband Richard loves Barbara Hare, she leaves him and their two children. She does 

In [8]:
def normalize_text_field(doc):
    """Build one lowercase text string from a film document.

    The result is the directors, the writers and the cleaned description,
    in that order, separated by single spaces. Parts that are missing or
    empty are skipped, so the result never carries leading, trailing or
    doubled separator spaces.

    Args:
        doc: Mapping with an optional ``'metadata'`` mapping (optional
            comma-separated ``'directors'`` / ``'writers'`` strings) and an
            optional ``'cleaned_description'`` string.

    Returns:
        The combined lowercase text; ``""`` when no part is available.
    """
    # A missing or None metadata entry is treated like an empty one instead
    # of raising.
    metadata = doc.get('metadata') or {}

    def _normalize_names(raw):
        """Turn 'A, B' into 'a b', ignoring empty comma-separated fragments."""
        # If the value is None (or empty), fall back to an empty string.
        if not raw:
            return ""
        return " ".join(
            name.strip().lower() for name in raw.split(',') if name.strip()
        )

    directors = _normalize_names(metadata.get('directors'))
    writers = _normalize_names(metadata.get('writers'))
    description = (doc.get('cleaned_description') or "").lower().strip()

    # Join only the parts that have content so an absent field does not
    # leave a stray space behind.
    return " ".join(part for part in (directors, writers, description) if part)

# Work on the first 10 documents only, to keep this walkthrough small.
small_documents = all_documents[:10]

# One normalized text string per document, in the same order as small_documents.
corpus = [normalize_text_field(film) for film in small_documents]

print(f"Corpus length: {len(corpus)}")
print(f'Test some samples: ')
corpus[:5]

Corpus length: 10
Test some samples: 


['bertram bracken mary elizabeth braddon mary murillo mrs. henry wood isabel carlisl mistakenli believ husband richard love barbara hare leav two children noth correct report kill train wreck richard believ widow marri barbara month isabel long see children disguis get job gover son becom ill call mother isabel throw disguis goe comfort die arm discov isabel boy richard immedi forgiv left children isabel forgiv soon die grief',
 'frank lloyd leo ditrichstein fanny hatton frederic hatton j.e. nash ethel warren return studi europ make debut new york opera compani jean paurel world famou bariton star carlo sonino also member compani fall love ethel warn becom infatu amor singer paurel becom enchant ethel arous jealousi compani prima donna sabotini first act ethel hear paurel suffer attack throat rush backstag carlo urg sabotini follow make scene whereupon ethel indignantli announc engag paurel paurel unabl perform second act carlo take place vault stardom paurel diagnos never abl sing bia

In [9]:
from models.bow import BagOfWords

print(f"Small corpus length: {len(corpus)}")

# Configure the bag-of-words model, then fit it on the small corpus and
# transform that same corpus in a single call.
bow = BagOfWords(tokenizer='whitespace', max_features=100, min_word_freq=1)
bow_matrix = bow.fit_transform(corpus)

Small corpus length: 10


In [10]:
# For every document show its text, the tokens the model extracts from it,
# and the matching row of the bag-of-words matrix.
for i, doc in enumerate(corpus):
    print(
        f"\nDocument {i+1}: '{doc}'",
        f"Tokens: {bow.get_document_tokens(doc)}",
        f"Vector: {bow_matrix[i]}",
        sep="\n",
    )


Document 1: 'bertram bracken mary elizabeth braddon mary murillo mrs. henry wood isabel carlisl mistakenli believ husband richard love barbara hare leav two children noth correct report kill train wreck richard believ widow marri barbara month isabel long see children disguis get job gover son becom ill call mother isabel throw disguis goe comfort die arm discov isabel boy richard immedi forgiv left children isabel forgiv soon die grief'
Tokens: ['bertram', 'bracken', 'mary', 'elizabeth', 'braddon', 'mary', 'murillo', 'mrs.', 'henry', 'wood', 'isabel', 'carlisl', 'mistakenli', 'believ', 'husband', 'richard', 'love', 'barbara', 'hare', 'leav', 'two', 'children', 'noth', 'correct', 'report', 'kill', 'train', 'wreck', 'richard', 'believ', 'widow', 'marri', 'barbara', 'month', 'isabel', 'long', 'see', 'children', 'disguis', 'get', 'job', 'gover', 'son', 'becom', 'ill', 'call', 'mother', 'isabel', 'throw', 'disguis', 'goe', 'comfort', 'die', 'arm', 'discov', 'isabel', 'boy', 'richard', 'im

In [11]:
# Print the fitted model's vocabulary summary (sizes and most common words).
bow.print_vocabulary_info()

Vocabulary size: 100
Total words in corpus: 1200
Unique words in corpus: 709
Top 10 most common words:
   1. 'naomi          ':   12
   2. 'becom          ':   10
   3. 'two            ':    8
   4. 'paurel         ':    8
   5. 'girl           ':    8
   6. 'home           ':    8
   7. 'life           ':    8
   8. 'father         ':    8
   9. 'dori           ':    8
  10. 'harmon         ':    8


# Dimensionality Reduction by SVD

In [12]:
from models.svd import SVDModel

# Reduce the bag-of-words matrix with SVD; random_state is fixed for
# repeatable results.
# NOTE(review): 50 components are requested, but in the recorded run the
# reduced matrix came out as (10, 10) for the 10-document sample. Confirm
# that the vector size expected downstream matches what is produced here.
dim_reduc_model = SVDModel(random_state=11, n_components=50)
matrix = dim_reduc_model.fit_transform(bow_matrix)
(matrix.shape, bow_matrix.shape)

((10, 10), (10, 100))

In [13]:
# Sanity check: the two lengths should agree, because rows of `matrix` are
# paired with entries of `small_documents` by index further down.
len(matrix), len(small_documents)

(10, 10)

In [14]:
from preprocessing.data_models import QdrantPoint

# Build one point per reduced vector; row idx of `matrix`, corpus[idx] and
# small_documents[idx] all describe the same film.
# NOTE(review): Qdrant point IDs must be unsigned integers or UUIDs, while
# the id used here is an IMDb-style string (e.g. 'tt0006621') -- confirm it
# is converted before the points are upserted.
list_of_points = [
    QdrantPoint(
        id=small_documents[idx]['id'],
        text=corpus[idx],
        vector=matrix[idx],
        metadata=small_documents[idx]['metadata'],
    )
    for idx in range(len(matrix))
]

In [15]:
# Display the constructed points for inspection (every vector is shown in full).
list_of_points

[QdrantPoint(id='tt0006621', text='bertram bracken mary elizabeth braddon mary murillo mrs. henry wood isabel carlisl mistakenli believ husband richard love barbara hare leav two children noth correct report kill train wreck richard believ widow marri barbara month isabel long see children disguis get job gover son becom ill call mother isabel throw disguis goe comfort die arm discov isabel boy richard immedi forgiv left children isabel forgiv soon die grief', metadata=FilmMetadata(film_name='East Lynne', image_link=HttpUrl('https://m.media-amazon.com/images/M/MV5BM2E3MDgzNjItZjMzYi00ZWZiLTgwMWMtNWM2ZDAwOTg0MzBhXkEyXkFqcGc@.jpg'), is_adult=0, start_year=1916, runtime_minutes=50, genres='Drama', rating=5.5, votes=51, directors='Bertram Bracken', writers='Mary Elizabeth Braddon, Mary Murillo, Mrs. Henry Wood'), vector=[2.226082771608802, -0.1746777588553621, -1.6674290454267546, 0.6733490977776098, -1.2844510658153454, 4.052384376210664, -4.59875629141452, 3.9563347576488432, 0.691047559