In [34]:
# Load IPython's autoreload extension so that edits to imported modules are picked up.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Completeness

In [35]:
import utils
from pymongo import MongoClient
# Settings file read by the project helper; the MongoDB connection
# parameters used further down are looked up in the returned mapping.
CONFIG_PATH = "../configs.yaml"
configs = utils.get_configs(CONFIG_PATH)

Connect to MongoDB and get the collection

In [36]:
# Pull the connection settings out of the configuration first, then open the
# client and resolve the configured database and collection.
mongo_host = configs['MONGODB_HOST']
mongo_port = configs['MONGODB_PORT']
database_name = configs['MONGODB_DB_NAME']
collection_name = configs['MONGODB_COLLECTION_NAME']

client = MongoClient(mongo_host, mongo_port)
db = client[database_name]
collection = db.get_collection(collection_name)

Count total documents in the collection

In [37]:
# Total number of documents (empty filter matches everything). This value is
# the denominator of every percentage computed in the rest of the notebook.
total_documents = collection.count_documents({})
print(f"Total documents in collection: {total_documents}")

# Fail fast on an empty collection: each later percentage divides by
# total_documents and would otherwise stop with a bare ZeroDivisionError.
if total_documents == 0:
    raise ValueError("The collection is empty: completeness percentages cannot be computed.")

Total documents in collection: 2010


Count documents where all review sources (ComingSoon, IMDb, and MyMovies) are filled.

In [38]:
# A document is counted only when it carries a review from every source.
review_sources = ("ComingSoon", "IMDb", "MyMovies")
all_reviews_filter = {
    "$and": [{f"review.{source}": {"$exists": True}} for source in review_sources]
}
all_reviews_count = collection.count_documents(all_reviews_filter)
all_reviews_pct = round((all_reviews_count / total_documents) * 100, 2)
print(f"{all_reviews_count} documents have all three reviews, equal to {all_reviews_pct} % of total documents.")

1740 documents have all three reviews, equal to 86.57 % of total documents.


Count complete documents, i.e. documents that contain every required attribute (`original_title` is not required)

In [39]:
# A document is "complete" when every attribute listed below is present.
# `$exists: True` only checks that the field is there; a field holding null
# still counts as present.
# NOTE(review): "original_title" is left out of the check (it was commented
# out) — presumably because it is optional; confirm before requiring it.
query = {
        "title": {"$exists": True},
        "register": {"$exists": True},
        "genere": {"$exists": True},
        "region": {"$exists": True},
        "year": {"$exists": True},
        "box_office": {"$exists": True},
        "review": {"$exists": True}
    }

# Run the count once and reuse it: this avoids a second database round trip
# and guarantees the absolute number and the percentage describe the same state.
complete_count = collection.count_documents(query)
complete_pct = round((complete_count / total_documents) * 100, 2)
print(f"{complete_count} documents are complete, equal to {complete_pct} % of total documents.")

2005 documents are complete, equal to 99.75 % of total documents.


Count the documents that have a review from each individual source (ComingSoon, IMDb, MyMovies)

In [40]:
# Count, for each review source on its own, the documents that contain it.
review_counts = {
    source: collection.count_documents({f"review.{source}": {"$exists": True}})
    for source in ("ComingSoon", "IMDb", "MyMovies")
}

# Keep the per-source totals available under their individual names.
comingsoon_count = review_counts["ComingSoon"]
imdb_count = review_counts["IMDb"]
mymovies_count = review_counts["MyMovies"]

# Report each source as an absolute count and as a share of all documents.
for source, source_count in review_counts.items():
    source_pct = round((source_count / total_documents) * 100, 2)
    print(f"{source} review is present in {source_count} documents, equal to {source_pct} %")

ComingSoon review is present in 2010 documents, equal to 100.0 %
IMDb review is present in 1740 documents, equal to 86.57 %
MyMovies review is present in 2010 documents, equal to 100.0 %


Completeness for each attribute

In [41]:
# Top-level attributes whose presence is measured one by one.
attributes = [
    "title",
    "original_title",
    "register",
    "genere",
    "region",
    "year",
    "box_office",
    "review"
]

# Maps each attribute name to the percentage of documents that contain it.
# (Previously this dict was created but never filled.)
completeness = {}
for attribute in attributes:
    # The filter is built inline so the notebook-level `query` from the
    # "complete documents" cell is not overwritten by this loop.
    count = collection.count_documents({attribute: {"$exists": True}})
    completeness[attribute] = round((count / total_documents) * 100, 2)
    print(f"<{attribute}> is present in {count} documents, equal to {completeness[attribute]} % of total documents.")

<title> is present in 2010 documents, equal to 100.0 % of total documents.
<original_title> is present in 1494 documents, equal to 74.33 % of total documents.
<register> is present in 2009 documents, equal to 99.95 % of total documents.
<genere> is present in 2010 documents, equal to 100.0 % of total documents.
<region> is present in 2005 documents, equal to 99.75 % of total documents.
<year> is present in 2010 documents, equal to 100.0 % of total documents.
<box_office> is present in 2010 documents, equal to 100.0 % of total documents.
<review> is present in 2010 documents, equal to 100.0 % of total documents.


In [42]:
# Close the MongoDB client opened at the top of the notebook.
client.close()