In [1]:
from mongodb_connector import connect_to_mongodb, get_database, get_collection, get_all_documents
import os
from dotenv import load_dotenv

# Load environment variables through python-dotenv before reading them below
load_dotenv()

# Source data handle: client -> database -> collection, all named by env vars
client = connect_to_mongodb(
    mongo_uri=os.getenv("MONGO_URI"),
)
database = get_database(
    mongo_client=client,
    database_name=os.getenv("DATABASE_NAME"),
)
collection = get_collection(
    database=database,
    collection_name=os.getenv("COLLECTION_NAME"),
)

Pinged your deployment. You successfully connected to MongoDB!
Database 'Film' connected successfully!
Collection 'Data' connected successfully!


In [2]:
# Check documents: fetch the raw film documents from the source collection,
# print how many were returned, and display the first three as the cell output.
# NOTE(review): "lenght" in the printed label is a typo for "length"; when
# fixing it, re-run the cell so the saved output stays in sync.
all_documents = get_all_documents(collection)
print("Data lenght: ", len(all_documents))
print("Some samples")
all_documents[:3]

Data lenght:  10563
Some samples


[{'_id': ObjectId('6836de6f2522369bf29e1b7d'),
  'id': 'tt0006621',
  'film_name': 'East Lynne',
  'description': 'When Isabel Carlisle mistakenly believes that her husband Richard loves Barbara Hare, she leaves him and their two children. She does nothing to correct the report that she has been killed in a train wreck, and so Richard, believing himself to be a widower, marries Barbara. After a few months, Isabel longs to see her children and so, disguising herself, gets a job as their governess. Then, when her son becomes ill and calls out for his mother, Isabel throws off her disguise and goes to comfort him, but he dies in her arms. Discovering Isabel with the boy, Richard immediately forgives her for having left him and the children, but Isabel cannot forgive herself, and soon dies of grief.',
  'image_link': 'https://m.media-amazon.com/images/M/MV5BM2E3MDgzNjItZjMzYi00ZWZiLTgwMWMtNWM2ZDAwOTg0MzBhXkEyXkFqcGc@.jpg',
  'isAdult': 0,
  'startYear': 1916,
  'runtimeMinutes': 50,
  'gen

In [9]:
from typing import List
from data_models import RawFilm, CleanFilm, FilmMetadata
from pydantic import ValidationError
from pymongo import UpdateOne

# Main process
def process_and_upsert(films: List[dict], pipeline, dest_collection_name):
    """Validate raw film documents, clean their descriptions and upsert them.

    Each raw document is validated against ``RawFilm``, its description is run
    through ``pipeline``, and the result is wrapped in a ``CleanFilm`` and
    upserted (matched on ``id``) into the destination collection with a single
    ``bulk_write``. Progress and per-record problems are reported with
    ``print``; nothing is returned.

    Args:
        films: Raw film documents as dicts. They are read only; missing
            fields are filled in on a per-record copy.
        pipeline: Object exposing ``preprocess_single_text(text)`` or,
            failing that, ``preprocess(text)``. The former wins when both
            exist.
        dest_collection_name: Name of the destination collection inside the
            database named by the ``DATABASE_NAME`` environment variable.

    Raises:
        TypeError: If ``pipeline`` offers neither preprocessing method. This
            is checked up front, before any connection is opened.
    """
    # Resolve the preprocessing callable once and fail fast on a bad pipeline,
    # instead of re-checking per record after the connection is already open.
    if hasattr(pipeline, "preprocess_single_text"):
        preprocess = pipeline.preprocess_single_text
    elif hasattr(pipeline, "preprocess"):
        preprocess = pipeline.preprocess
    else:
        raise TypeError("Invalid pipeline!!!")

    # Connect to MongoDB client
    # NOTE(review): the client is never closed here. If connect_to_mongodb
    # returns a fresh client on every call, close it once the write is done.
    mongo_client = connect_to_mongodb(mongo_uri=os.getenv("MONGO_URI"))
    database = get_database(mongo_client=mongo_client, database_name=os.getenv("DATABASE_NAME"))
    dest_collection = get_collection(database=database, collection_name=dest_collection_name)

    ops = []
    validation_errors = 0

    # One None default per field declared on RawFilm
    missing_defaults = dict.fromkeys(RawFilm.model_fields)

    for raw in films:
        # Fields absent from the raw data become None (so the record is not
        # lost to a "missing field" error). Build a new dict rather than
        # writing into `raw`, so the caller's documents stay untouched.
        record = {**missing_defaults, **raw}

        try:
            rf = RawFilm(**record)
        except ValidationError as e:
            validation_errors += 1
            print(f"Validation error on {record.get('id')}: {e}")
            continue

        cleaned = preprocess(rf.description)

        # Nothing useful to store when preprocessing yields an empty result
        if not cleaned:
            print(f"Warning: cleaned data empty or None for id {rf.id}")
            continue

        cf = CleanFilm(
            id=rf.id,
            original_description=rf.description,
            cleaned_description=cleaned,
            metadata=FilmMetadata(
                film_name=rf.film_name,
                image_link=rf.image_link,
                is_adult=rf.isAdult,
                start_year=rf.startYear,
                runtime_minutes=rf.runtimeMinutes,
                genres=rf.genres,
                rating=rf.rating,
                votes=rf.votes,
                directors=rf.directors,
                writers=rf.writers
            )
        )

        # Upsert keyed on the film id so re-running updates instead of duplicating
        ops.append(
            UpdateOne(
                {"id": cf.id},
                {"$set": cf.model_dump(mode="json")},
                upsert=True
            )
        )

    print(f"Validation errors skipped: {validation_errors}")
    if ops:
        result = dest_collection.bulk_write(ops)
        print(f"Matched: {result.matched_count}, Inserted: {result.upserted_count}, Modified: {result.modified_count}")
    else:
        print("No operations to write.")

In [10]:
# Demo
from preprocessing import LSASVDPipeline, WordEmbeddingPipeline

# Smoke test: push the same three raw documents through each pipeline and
# write the results to a separate *_test collection per pipeline.

# 1) LSA/SVD pipeline
lsa = LSASVDPipeline()
process_and_upsert(films=all_documents[:3], pipeline=lsa, dest_collection_name='lsa_collection_test')

# 2) Word-embedding pipeline
wemb = WordEmbeddingPipeline()
process_and_upsert(films=all_documents[:3], pipeline=wemb, dest_collection_name='wemb_collection_test')

Pinged your deployment. You successfully connected to MongoDB!
Database 'Film' connected successfully!
Collection 'lsa_collection_test' connected successfully!
Validation errors skipped: 0
Matched: 0, Inserted: 3, Modified: 0
Pinged your deployment. You successfully connected to MongoDB!
Database 'Film' connected successfully!
Collection 'wemb_collection_test' connected successfully!
Validation errors skipped: 0
Matched: 0, Inserted: 3, Modified: 0


In [13]:
# Verify the writes: reconnect and take a handle on each *_test collection
client = connect_to_mongodb(
    mongo_uri=os.getenv("MONGO_URI"),
)
database = get_database(
    mongo_client=client,
    database_name=os.getenv("DATABASE_NAME"),
)
lsa_svd_collection = get_collection(
    database=database,
    collection_name='lsa_collection_test',
)
wemb_collection = get_collection(
    database=database,
    collection_name='wemb_collection_test',
)

Pinged your deployment. You successfully connected to MongoDB!
Database 'Film' connected successfully!
Collection 'lsa_collection_test' connected successfully!
Collection 'wemb_collection_test' connected successfully!


In [12]:
# Display what was written to the LSA/SVD test collection. In the saved
# output, cleaned_description is a single string of stemmed tokens.
get_all_documents(lsa_svd_collection)

[{'_id': ObjectId('683eb7afdba46ba351847cbb'),
  'id': 'tt0006621',
  'cleaned_description': 'isabel carlisl mistakenli believ husband richard love barbara hare leav two children noth correct report kill train wreck richard believ widow marri barbara month isabel long see children disguis get job gover son becom ill call mother isabel throw disguis goe comfort die arm discov isabel boy richard immedi forgiv left children isabel forgiv soon die grief',
  'metadata': {'film_name': 'East Lynne',
   'image_link': 'https://m.media-amazon.com/images/M/MV5BM2E3MDgzNjItZjMzYi00ZWZiLTgwMWMtNWM2ZDAwOTg0MzBhXkEyXkFqcGc@.jpg',
   'is_adult': 0,
   'start_year': 1916,
   'runtime_minutes': 50,
   'genres': 'Drama',
   'rating': 5.5,
   'votes': 51,
   'directors': 'Bertram Bracken',
   'writers': 'Mary Elizabeth Braddon, Mary Murillo, Mrs. Henry Wood'},
  'original_description': 'When Isabel Carlisle mistakenly believes that her husband Richard loves Barbara Hare, she leaves him and their two child

In [14]:
# Display what was written to the word-embedding test collection. In the
# saved output, cleaned_description is a list of tokens rather than a string.
# NOTE(review): the saved output contains 'leaf', 'doe', 'ha' and 'a' where
# the description has "leaves", "does", "has" and "as" - this looks like
# noun-only lemmatization inside WordEmbeddingPipeline; confirm it is intended.
get_all_documents(wemb_collection)

[{'_id': ObjectId('683eb7b0dba46ba351847cbe'),
  'id': 'tt0006621',
  'cleaned_description': ['when',
   'isabel',
   'carlisle',
   'mistakenly',
   'belief',
   'that',
   'her',
   'husband',
   'richard',
   'love',
   'barbara',
   'hare',
   'she',
   'leaf',
   'him',
   'their',
   'two',
   'child',
   'she',
   'doe',
   'nothing',
   'to',
   'correct',
   'report',
   'that',
   'she',
   'ha',
   'been',
   'killed',
   'in',
   'train',
   'wreck',
   'so',
   'richard',
   'believing',
   'himself',
   'to',
   'be',
   'widower',
   'marries',
   'barbara',
   'after',
   'few',
   'month',
   'isabel',
   'longs',
   'to',
   'see',
   'her',
   'child',
   'so',
   'disguising',
   'herself',
   'get',
   'job',
   'a',
   'their',
   'governess',
   'then',
   'when',
   'her',
   'son',
   'becomes',
   'ill',
   'call',
   'out',
   'for',
   'his',
   'mother',
   'isabel',
   'throw',
   'off',
   'her',
   'disguise',
   'go',
   'to',
   'comfort',
   'him',
  