In [1]:
# Build the images and start the docker-compose services in detached mode
# (the recorded output below shows a MongoDB container being created).
!docker-compose up -d --build

Creating network "research_default" with the default driver
Creating volume "research_mongodb_data_container" with default driver
Creating mongodb_container ... 

Creating mongodb_container ... done



In [2]:
import time
from typing import Callable
from uuid import uuid4

from pymongo import MongoClient

In [3]:
# Connection settings for the MongoDB instance used by the benchmarks
# (presumably the container started by docker-compose above — confirm).
MONGO_HOST = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DB = "ugc_db"
# Collection names, one per event kind.
MONGO_COLLECTION_LIKE = "likedFilms"
MONGO_COLLECTION_REVIEW = "reviews"
MONGO_COLLECTION_BOOKMARK = "bookmarks"
ITERATIONS_NUMBER = 10  # default number of timed insert_many calls per batch size
USERS_IN_BATCH = 10  # number of distinct user ids generated for each insert batch
OPTIMAL_BATCH_SIZE = 200  # batch size used when seeding data for the read test
TEST_RECORDS_SIZE = 10000  # amount of data seeded for the read test (range upper bound)

# Shared client and database handle used by every benchmark function below.
client = MongoClient(MONGO_HOST, MONGO_PORT, connect=True)
mongo_db = client[MONGO_DB]

In [4]:

"""Генерация фейковых данных для исслдеования."""
from random import choice, randint
from typing import Callable
from uuid import uuid4

from faker import Faker

# Values stored in the "type" field of a like event.
LIKE = 1
DISLIKE = 0
# Bounds passed to Faker.date_time_between for every event timestamp.
START_DATE = "-30d"
END_DATE = "now"
# Bounds passed to randint for the review rating (randint includes both ends).
MIN_RATING = 1
MAX_RATING = 10
# Single Faker instance shared by all event generators.
fake = Faker()

In [5]:
def fake_like_event(user_id: str = None, film_id: str = None) -> dict:
    """Build one random like/dislike event.

    Args:
        user_id: Identifier stored in the event; a fresh UUID4 string is
            generated when the value is falsy (``None`` or empty).
        film_id: Film identifier, with the same fallback as ``user_id``.

    Returns:
        A dict with the keys ``user_id``, ``film_id``, ``type`` (randomly
        ``LIKE`` or ``DISLIKE``) and ``datetime`` (drawn by Faker between
        ``START_DATE`` and ``END_DATE``).
    """
    event = {}
    event["user_id"] = user_id or str(uuid4())
    event["film_id"] = film_id or str(uuid4())
    event["type"] = choice([LIKE, DISLIKE])
    event["datetime"] = fake.date_time_between(start_date=START_DATE, end_date=END_DATE)
    return event

In [6]:
def fake_review_event(user_id: str = None, film_id: str = None) -> dict:
    """Build one random review event.

    Args:
        user_id: Identifier stored in the event; a fresh UUID4 string is
            generated when the value is falsy (``None`` or empty).
        film_id: Film identifier, with the same fallback as ``user_id``.

    Returns:
        A dict with the keys ``user_id``, ``film_id``, ``text`` (Faker text),
        ``rating`` (random integer from ``MIN_RATING`` to ``MAX_RATING``) and
        ``datetime`` (drawn by Faker between ``START_DATE`` and ``END_DATE``).
    """
    reviewer = user_id or str(uuid4())
    film = film_id or str(uuid4())
    body = fake.text()
    score = randint(MIN_RATING, MAX_RATING)
    created_at = fake.date_time_between(start_date=START_DATE, end_date=END_DATE)
    return {
        "user_id": reviewer,
        "film_id": film,
        "text": body,
        "rating": score,
        "datetime": created_at,
    }

In [7]:
def fake_bookmark_event(user_id: str = None, film_id: str = None) -> dict:
    """Build one random bookmark event.

    Args:
        user_id: Identifier stored in the event; a fresh UUID4 string is
            generated when the value is falsy (``None`` or empty).
        film_id: Film identifier, with the same fallback as ``user_id``.

    Returns:
        A dict with the keys ``user_id``, ``film_id`` and ``datetime`` (drawn
        by Faker between ``START_DATE`` and ``END_DATE``).
    """
    return dict(
        user_id=user_id or str(uuid4()),
        film_id=film_id or str(uuid4()),
        datetime=fake.date_time_between(start_date=START_DATE, end_date=END_DATE),
    )

In [8]:
def fake_batch(event_faker: Callable, user_size: int, batch_size: int) -> list[dict]:
    """Generate a batch of events spread over a fresh pool of random users.

    Args:
        event_faker: Event generator; called with a ``user_id`` keyword.
        user_size: Number of UUID4 user ids to create for this batch.
        batch_size: Number of events to generate.

    Returns:
        A list of ``batch_size`` events, each assigned to a user picked at
        random from the generated pool.
    """
    user_pool = [str(uuid4()) for _ in range(user_size)]
    events = []
    for _ in range(batch_size):
        events.append(event_faker(user_id=choice(user_pool)))
    return events

In [9]:
def fake_users_batch(event_faker: Callable, users: list, batch_size: int) -> list[dict]:
    """Generate a batch of events for a fixed, caller-supplied set of users.

    Args:
        event_faker: Event generator; called with a ``user_id`` keyword.
        users: User ids to pick from at random for each event.
        batch_size: Number of events to generate.

    Returns:
        A list of ``batch_size`` events.
    """
    # The generator is lazy, so each user is picked right before its event is built.
    picked_users = (choice(users) for _ in range(batch_size))
    return [event_faker(user_id=picked) for picked in picked_users]

In [10]:
def test_insert_step(
        faker: Callable,
        collection_name: str,
        batch_size: int,
        iterations: int = ITERATIONS_NUMBER,
) -> None:
    """Measure the mean ``insert_many`` latency for one batch size.

    A fresh batch of fake events is generated for every iteration, inserted
    into the collection, and only the insert call is timed. The mean time per
    batch and per item is printed in seconds.

    Args:
        faker: Event generator handed to ``fake_batch``.
        collection_name: Name of the collection in ``mongo_db`` to insert into.
        batch_size: Number of events per ``insert_many`` call.
        iterations: Number of timed inserts to average over.

    Raises:
        ValueError: If ``batch_size`` or ``iterations`` is less than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (or a
    # driver error on an empty batch) further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    collection = mongo_db.get_collection(collection_name)
    durations = []
    for _ in range(iterations):
        # Batch generation stays outside the timed region.
        batch = fake_batch(faker, USERS_IN_BATCH, batch_size)
        # perf_counter is monotonic and high-resolution, unlike time.time().
        started = time.perf_counter()
        collection.insert_many(batch)
        durations.append(time.perf_counter() - started)
    mean_batch = sum(durations) / len(durations)
    print(
        f"Statistics for {collection_name} batch_size={batch_size}: batch={mean_batch} sec, "
        f"item={mean_batch/batch_size} sec.",
    )

In [11]:
def test_insert(faker: Callable, collection_name: str) -> None:
    """Тестирование вставки с разным размером батча."""
    batch_sizes = [1, 10, 50, 100, 200, 500, 1000, 2000, 5000]
    for batch_size in batch_sizes:
        test_insert_step(faker, collection_name, batch_size)

In [12]:
 test_insert(
        fake_like_event,
        MONGO_COLLECTION_LIKE,
    )

Statistics for likedFilms batch_size=1: batch=0.007854580879211426 sec, item=0.007854580879211426 sec.
Statistics for likedFilms batch_size=10: batch=0.003737664222717285 sec, item=0.00037376642227172853 sec.
Statistics for likedFilms batch_size=50: batch=0.005108976364135742 sec, item=0.00010217952728271483 sec.
Statistics for likedFilms batch_size=100: batch=0.006158351898193359 sec, item=6.15835189819336e-05 sec.
Statistics for likedFilms batch_size=200: batch=0.01172327995300293 sec, item=5.8616399765014646e-05 sec.
Statistics for likedFilms batch_size=500: batch=0.023988509178161622 sec, item=4.797701835632325e-05 sec.
Statistics for likedFilms batch_size=1000: batch=0.03752453327178955 sec, item=3.752453327178955e-05 sec.
Statistics for likedFilms batch_size=2000: batch=0.05679442882537842 sec, item=2.839721441268921e-05 sec.
Statistics for likedFilms batch_size=5000: batch=0.11802396774291993 sec, item=2.3604793548583984e-05 sec.


In [12]:
# Insert benchmark for review events.
test_insert(faker=fake_review_event, collection_name=MONGO_COLLECTION_REVIEW)

Statistics for reviews batch_size=1: batch=0.004799699783325196 sec, item=0.004799699783325196 sec.
Statistics for reviews batch_size=10: batch=0.07557168006896972 sec, item=0.007557168006896972 sec.
Statistics for reviews batch_size=50: batch=0.1677483558654785 sec, item=0.0033549671173095704 sec.
Statistics for reviews batch_size=100: batch=0.4014437198638916 sec, item=0.004014437198638916 sec.
Statistics for reviews batch_size=200: batch=0.5239423990249634 sec, item=0.002619711995124817 sec.
Statistics for reviews batch_size=500: batch=1.2240280866622926 sec, item=0.0024480561733245853 sec.
Statistics for reviews batch_size=1000: batch=2.921845269203186 sec, item=0.0029218452692031864 sec.
Statistics for reviews batch_size=2000: batch=5.708000755310058 sec, item=0.002854000377655029 sec.
Statistics for reviews batch_size=5000: batch=12.710458731651306 sec, item=0.0025420917463302613 sec.


In [13]:
# Insert benchmark for bookmark events.
test_insert(faker=fake_bookmark_event, collection_name=MONGO_COLLECTION_BOOKMARK)

Statistics for bookmarks batch_size=1: batch=0.003978061676025391 sec, item=0.003978061676025391 sec.
Statistics for bookmarks batch_size=10: batch=0.07902326583862304 sec, item=0.007902326583862303 sec.
Statistics for bookmarks batch_size=50: batch=0.08672583103179932 sec, item=0.0017345166206359864 sec.
Statistics for bookmarks batch_size=100: batch=0.43888702392578127 sec, item=0.004388870239257813 sec.
Statistics for bookmarks batch_size=200: batch=0.4062222480773926 sec, item=0.002031111240386963 sec.
Statistics for bookmarks batch_size=500: batch=0.5381228446960449 sec, item=0.0010762456893920898 sec.
Statistics for bookmarks batch_size=1000: batch=1.0303995847702025 sec, item=0.0010303995847702025 sec.
Statistics for bookmarks batch_size=2000: batch=2.6336779832839965 sec, item=0.0013168389916419982 sec.
Statistics for bookmarks batch_size=5000: batch=6.216507315635681 sec, item=0.0012433014631271363 sec.


In [17]:
def test_read_data(faker: Callable, collection_name: str, users_size: int) -> None:
    """Тестирование чтения."""
    statistics = []
    collection = mongo_db.get_collection(collection_name)
    users = [str(uuid4()) for _ in range(users_size)]

    for i in range(0, TEST_RECORDS_SIZE, OPTIMAL_BATCH_SIZE):
        batch = fake_users_batch(faker, users, batch_size=OPTIMAL_BATCH_SIZE)
        collection.insert_many(batch)

    for user in users:
        start = time.time()
        _ = list(collection.find({"user_id": user}))
        statistics.append(time.time() - start)

    mean_batch = sum(statistics) / len(statistics)
    print(
        f"Statistics read for {collection_name} for ~{int(TEST_RECORDS_SIZE/users_size)} records: {mean_batch} sec",
    )

In [18]:
# Read benchmark for like events, spread over 20 users.
test_read_data(faker=fake_like_event, collection_name=MONGO_COLLECTION_LIKE, users_size=20)

Statistics read for likedFilms for ~500 records: 0.2809668183326721 sec


In [19]:
# Read benchmark for review events, spread over 20 users.
test_read_data(faker=fake_review_event, collection_name=MONGO_COLLECTION_REVIEW, users_size=20)

Statistics read for reviews for ~500 records: 0.34582369327545165 sec


In [20]:
# NOTE(review): this repeats the reviews read benchmark from the previous cell
# verbatim, and the bookmarks collection is never read-tested — possibly
# fake_bookmark_event / MONGO_COLLECTION_BOOKMARK was intended here; confirm.
test_read_data(fake_review_event, MONGO_COLLECTION_REVIEW, 20)

Statistics read for reviews for ~500 records: 0.5513761401176452 sec
