In [6]:
from pymongo import MongoClient
from mimesis import Person
from uuid import uuid4
from collections import Counter
import random
from clickhouse_driver import Client

# Connection handles shared by every cell below; both services use the same host.
client = MongoClient(host='ctube-study.ru', port=27019)
db = client['docker']
clickhouse_client = Client(host='ctube-study.ru')


In [7]:
import time
import statistics

class profile_code():
    """Context manager that times repeated steps and prints summary statistics.

    Call ``setup_start_time()`` right before the code being measured and
    ``checkpoint()`` right after it; every checkpoint appends one duration
    (in seconds) to ``all_durations``. Leaving the ``with`` block prints the
    median, the average and the sum of the recorded durations.
    """

    def __init__(self):
        # perf_counter() is meant for measuring intervals: unlike time.time()
        # it is not affected by system clock adjustments.
        self.start = time.perf_counter()
        self.all_durations = []

    def setup_start_time(self):
        """Restart the clock so that untimed work before this call is excluded."""
        self.start = time.perf_counter()

    def checkpoint(self):
        """Record the time elapsed since the last start/checkpoint and restart the clock."""
        end_time = time.perf_counter()
        self.all_durations.append(end_time - self.start)
        self.start = end_time

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        """Print the summary; an exception raised inside the block is never suppressed."""
        if not self.all_durations:
            # statistics.median() raises StatisticsError on empty data and the
            # average would divide by zero. Raising here would also replace
            # the exception that made the block exit before its first checkpoint.
            print("No checkpoints recorded")
            return False
        total = sum(self.all_durations)
        print(f"Median - {statistics.median(self.all_durations)}")
        print(f"Average - {total / len(self.all_durations)}")
        print(f"Summary - {total}")
        return False
        
# Shared fake-data provider and id pools that the generator functions sample from.
person = Person()
# 200 random movie ids in total.
movies_rand = [str(uuid4()) for _ in range(200)]
# 499 user ids drawn from 1..500; duplicates are possible.
user_rand = [random.randint(1, 500) for _ in range(499)]

def gen_person_data(count=10000):
    """Generate fake user profiles for the insert benchmarks.

    Args:
        count: number of profiles to build. Defaults to 10000, the size the
            benchmarks in this notebook were run with; pass a smaller value
            for a quick dry run.

    Returns:
        list[dict]: one dict per profile with the keys ``uuid``,
        ``first_name``, ``age``, ``email``, ``political``, ``phone`` and
        ``username``. ``uuid`` is a fresh random UUID string; the other
        values come from the module-level ``person`` provider.
    """
    def _one_profile():
        return {
            'uuid': str(uuid4()),
            'first_name': person.name(),
            'age': person.age(),
            'email': person.email(),
            'political': person.political_views(),
            'phone': person.telephone(),
            'username': person.username(),
        }

    return [_one_profile() for _ in range(count)]


def gen_movie_likes(count=10000):
    """Generate random like/dislike records for the insert and query benchmarks.

    Args:
        count: number of records to build. Defaults to 10000, the size the
            benchmarks in this notebook were run with.

    Returns:
        list[dict]: records with ``rating`` (0 or 1), ``user_id`` drawn from
        ``user_rand``, ``movie_uuid`` drawn from ``movies_rand`` and integer
        ``created_at`` / ``updated_at`` values (presumably Unix epoch
        seconds, since the ClickHouse cells store them in DateTime columns).
    """
    ts_min, ts_max = 633801229, 1675180434
    likes = []
    for _ in range(count):
        rating = random.randint(0, 1)
        user_id = random.choice(user_rand)
        movie_uuid = random.choice(movies_rand)
        created_at = random.randint(ts_min, ts_max)
        likes.append({
            'rating': rating,
            'user_id': user_id,
            'movie_uuid': movie_uuid,
            'created_at': created_at,
            # Drawn from [created_at, ts_max] so that a record is never
            # "updated" before it was created.
            'updated_at': random.randint(created_at, ts_max),
        })
    return likes


def gen_bookmarks(count=10000):
    """Generate random bookmark records for the insert and query benchmarks.

    Args:
        count: number of records to build. Defaults to 10000, the size the
            benchmarks in this notebook were run with.

    Returns:
        list[dict]: records with ``user_id`` drawn from ``user_rand``,
        ``movie_uuid`` drawn from ``movies_rand`` and integer ``created_at`` /
        ``updated_at`` values (presumably Unix epoch seconds, since the
        ClickHouse cells store them in DateTime columns).
    """
    ts_min, ts_max = 633801229, 1675180434
    bookmarks = []
    for _ in range(count):
        user_id = random.choice(user_rand)
        movie_uuid = random.choice(movies_rand)
        created_at = random.randint(ts_min, ts_max)
        bookmarks.append({
            'user_id': user_id,
            'movie_uuid': movie_uuid,
            'created_at': created_at,
            # Drawn from [created_at, ts_max] so that a record is never
            # "updated" before it was created.
            'updated_at': random.randint(created_at, ts_max),
        })
    return bookmarks

In [36]:
# ClickHouse tables used by the insert benchmarks. Every statement is
# CREATE TABLE IF NOT EXISTS, so re-running the cell is harmless, but it also
# means an already existing table keeps its old definition.

# Person profiles; mirrors the dicts built by gen_person_data().
clickhouse_client.execute("""
CREATE TABLE if not exists default.tests (
id UUID,
first_name VARCHAR,
age INT,
email VARCHAR,
political VARCHAR,
phone VARCHAR,
username VARCHAR
) Engine=MergeTree() ORDER BY id;
""")

# Likes/dislikes. The movie column used to be misspelled `movie_uud`; a table
# created with the old name has to be dropped (or the column renamed) for the
# corrected name to take effect.
clickhouse_client.execute("""
CREATE TABLE if not exists movie_likes (
uuid UUID,
rating TINYINT,
user_id BIGINT,
movie_uuid UUID,
created_at DateTime,
updated_at DateTime
) Engine=MergeTree() ORDER BY uuid;
""")

# Bookmarks; same shape as movie_likes without the rating.
clickhouse_client.execute("""
CREATE TABLE if not exists bookmarks (
uuid UUID,
user_id BIGINT,
movie_uuid UUID,
created_at DateTime,
updated_at DateTime
) Engine=MergeTree() ORDER BY uuid;
""")

[]

In [24]:
# Person records for the insert benchmarks below; the `test` collection is
# emptied first so the benchmarks start from zero documents.
test_data = gen_person_data()
newcol = db['test']
newcol.delete_many({})

<pymongo.results.DeleteResult at 0x7f760b156770>

**Пакетная вставка данных в MongoDB**

In [25]:
%%time
with profile_code() as timer:
    for _ in range(2):
        # Only the bulk insert sits between start and checkpoint; the cleanup
        # after the checkpoint is not part of the recorded duration.
        timer.setup_start_time()
        newcol.insert_many(test_data)
        timer.checkpoint()
        newcol.delete_many({})

Median - 0.34908080101013184
Average - 0.34908080101013184
Summary - 0.6981616020202637
CPU times: user 209 ms, sys: 12.3 ms, total: 222 ms
Wall time: 1.75 s


**Одиночная вставка данных в MongoDB**

In [27]:
%%time
with profile_code() as timer:
    for _ in range(2):
        timer.setup_start_time()
        # One insert_one call per document, in contrast to the insert_many cell.
        for doc in test_data:
            newcol.insert_one(doc)
        timer.checkpoint()

        # Cleanup happens after the checkpoint, so it is not timed.
        newcol.delete_many({})

Median - 83.11603128910065
Average - 83.11603128910065
Summary - 166.2320625782013
CPU times: user 6.96 s, sys: 943 ms, total: 7.9 s
Wall time: 2min 47s


In [28]:
# Like/dislike records for the insert benchmarks below; the `movie_likes`
# collection is emptied first so the benchmarks start from zero documents.
movie_likes_dataset = gen_movie_likes()
newcol = db['movie_likes']
newcol.delete_many({})

<pymongo.results.DeleteResult at 0x7f760b337c10>

**Пакетная вставка данных в MongoDB**

In [29]:
%%time
with profile_code() as timer:
    for _ in range(2):
        # Only the bulk insert sits between start and checkpoint; the cleanup
        # after the checkpoint is not part of the recorded duration.
        timer.setup_start_time()
        newcol.insert_many(movie_likes_dataset)
        timer.checkpoint()
        newcol.delete_many({})

Median - 0.31899750232696533
Average - 0.31899750232696533
Summary - 0.6379950046539307
CPU times: user 250 ms, sys: 3.1 ms, total: 253 ms
Wall time: 1.65 s


**Одиночная вставка данных в MongoDB**

In [30]:
%%time
with profile_code() as timer:
    for _ in range(2):
        timer.setup_start_time()
        # One insert_one call per document, in contrast to the insert_many cell.
        for doc in movie_likes_dataset:
            newcol.insert_one(doc)
        timer.checkpoint()

        # Cleanup happens after the checkpoint, so it is not timed.
        newcol.delete_many({})

Median - 83.62954795360565
Average - 83.62954795360565
Summary - 167.2590959072113
CPU times: user 7.18 s, sys: 946 ms, total: 8.13 s
Wall time: 2min 48s


In [31]:
# Bookmark records for the insert benchmarks below; the `bookmarks`
# collection is emptied first so the benchmarks start from zero documents.
bookmarks_dataset = gen_bookmarks()
newcol = db['bookmarks']
newcol.delete_many({})

<pymongo.results.DeleteResult at 0x7f7608112800>

**Пакетная вставка данных в MongoDB**

In [32]:
%%time
with profile_code() as timer:
    for _ in range(2):
        # Only the bulk insert sits between start and checkpoint; the cleanup
        # after the checkpoint is not part of the recorded duration.
        timer.setup_start_time()
        newcol.insert_many(bookmarks_dataset)
        timer.checkpoint()
        newcol.delete_many({})

Median - 0.26420676708221436
Average - 0.26420676708221436
Summary - 0.5284135341644287
CPU times: user 159 ms, sys: 7.54 ms, total: 167 ms
Wall time: 1.51 s


**Одиночная вставка данных в MongoDB**

In [33]:
%%time
with profile_code() as timer:
    for _ in range(2):
        timer.setup_start_time()
        # One insert_one call per document, in contrast to the insert_many cell.
        for doc in bookmarks_dataset:
            newcol.insert_one(doc)
        timer.checkpoint()

        # Cleanup happens after the checkpoint, so it is not timed.
        newcol.delete_many({})

Median - 90.14863133430481
Average - 90.14863133430481
Summary - 180.29726266860962
CPU times: user 7.25 s, sys: 1 s, total: 8.25 s
Wall time: 3min 1s


**Одиночная вставка данных в ClickHouse**

In [41]:
%%time
with profile_code() as profiler:
    # The statement ends at VALUES and the row is passed separately, so
    # clickhouse-driver sends the values as data instead of SQL text. The
    # previous f-string version broke on any generated name, e-mail or
    # username that contained a single quote.
    insert_query = (
        "INSERT INTO default.tests "
        "(id, first_name, age, email, political, phone, username) VALUES"
    )
    for i in range(2):
        profiler.setup_start_time()
        for row in test_data:
            # Still one INSERT per record: this cell measures single-row inserts.
            clickhouse_client.execute(insert_query, [(
                row['uuid'],
                row['first_name'],
                row['age'],
                row['email'],
                row['political'],
                row['phone'],
                row['username'],
            )])
        profiler.checkpoint()
        # Cleanup happens after the checkpoint, so it is not timed.
        clickhouse_client.execute("""TRUNCATE TABLE default.tests;""")

Median - 34.4092720746994
Average - 34.4092720746994
Summary - 68.8185441493988
CPU times: user 4.3 s, sys: 2.28 s, total: 6.58 s
Wall time: 1min 8s


In [42]:
# Build a fresh likes dataset for the ClickHouse insert cells that follow.
movie_likes_dataset = gen_movie_likes()

**Одиночная вставка данных в ClickHouse**

In [44]:
%%time
with profile_code() as timer:
    for _ in range(2):
        timer.setup_start_time()
        for row in movie_likes_dataset:
            # One INSERT statement per record; the row id is a fresh UUID
            # generated at insert time.
            statement = f"""INSERT INTO movie_likes 
            VALUES ('{str(uuid4())}', '{row['rating']}', '{row['user_id']}', '{row['movie_uuid']}', 
            '{row['created_at']}', '{row['updated_at']}');"""
            clickhouse_client.execute(statement)
        timer.checkpoint()
        # Cleanup happens after the checkpoint, so it is not timed.
        clickhouse_client.execute("""TRUNCATE TABLE movie_likes;""")

Median - 50.92217040061951
Average - 50.92217040061951
Summary - 101.84434080123901
CPU times: user 4.87 s, sys: 2.62 s, total: 7.5 s
Wall time: 1min 42s


**Одиночная вставка данных в ClickHouse**

In [45]:
%%time
with profile_code() as profiler:
    for i in range(2):
        profiler.setup_start_time()
        # Iterate the bookmarks dataset: the previous version looped over
        # movie_likes_dataset, a copy-paste leftover from the likes benchmark.
        for row in bookmarks_dataset:
            # One INSERT statement per record; the row id is a fresh UUID
            # generated at insert time.
            clickhouse_client.execute(f"""INSERT INTO bookmarks 
            VALUES ('{str(uuid4())}', '{row['user_id']}', '{row['movie_uuid']}', '{row['created_at']}', 
            '{row['updated_at']}');""")
        profiler.checkpoint()
        # Cleanup happens after the checkpoint, so it is not timed.
        clickhouse_client.execute("""TRUNCATE TABLE bookmarks;""")

Median - 41.97558069229126
Average - 41.97558069229126
Summary - 83.95116138458252
CPU times: user 4.66 s, sys: 2.56 s, total: 7.22 s
Wall time: 1min 23s


In [46]:
# Fill both collections with generated data for the read benchmarks below.
movies_col, bookmarks_col = db['movie_likes'], db['bookmarks']
movies_col.insert_many(gen_movie_likes())
bookmarks_col.insert_many(gen_bookmarks())

<pymongo.results.InsertManyResult at 0x7f760979d300>

**Выборка списка фильмов, понравившихся пользователю, MongoDB**

In [47]:
%%time
with profile_code() as profiler:
    for i in range(10):
        profiler.setup_start_time()
        # Take the user of an arbitrary document as the sample user.
        user = movies_col.find_one().get('user_id')
        # rating == 1 is a like and 0 a dislike; without this condition the
        # result also contained the movies the user disliked.
        mydoc = movies_col.find({"user_id": user, "rating": 1})
        # Materialise the cursor so fetching the documents is part of the timing.
        result = list(mydoc)
        profiler.checkpoint()

Median - 0.008145451545715332
Average - 0.009596562385559082
Summary - 0.09596562385559082
CPU times: user 8.95 ms, sys: 1.5 ms, total: 10.5 ms
Wall time: 96.1 ms


**Вывод количества лайков и дизлайков фильма, MongoDB**

In [48]:
%%time
with profile_code() as profiler:
    for i in range(10):
        profiler.setup_start_time()
        # Take the movie of an arbitrary document as the sample movie.
        movie = movies_col.find_one().get('movie_uuid')
        mydoc = movies_col.find({'movie_uuid': movie})
        # Materialise the cursor so fetching the documents is part of the timing.
        result = list(mydoc)
        counter = Counter(x['rating'] for x in result)
        # Indexing a Counter yields 0 for a missing key, whereas .get() returned
        # None when the movie had no likes (or no dislikes) at all.
        likes = counter[1]
        dislikes = counter[0]
        profiler.checkpoint()

Median - 0.008039474487304688
Average - 0.008637690544128418
Summary - 0.08637690544128418
CPU times: user 8.76 ms, sys: 1.45 ms, total: 10.2 ms
Wall time: 86.5 ms


**Выбор списка закладок пользователя, MongoDB**

In [49]:
%%time
with profile_code() as timer:
    for _ in range(10):
        timer.setup_start_time()
        # Take the user of an arbitrary bookmark as the sample user.
        user = bookmarks_col.find_one().get('user_id')
        mydoc = bookmarks_col.find({'user_id': user})
        # Materialise the cursor so fetching the documents is part of the timing.
        result = list(mydoc)
        timer.checkpoint()

Median - 0.017369389533996582
Average - 0.01831841468811035
Summary - 0.18318414688110352
CPU times: user 9.15 ms, sys: 3.21 ms, total: 12.4 ms
Wall time: 183 ms


**Подсчет среднего рейтинга фильма, MongoDB**

In [50]:
%%time
with profile_code() as profiler:
    for i in range(10):
        profiler.setup_start_time()
        # Take the movie of an arbitrary document as the sample movie.
        movie = movies_col.find_one().get('movie_uuid')
        mydoc = movies_col.find({'movie_uuid': movie})
        # Materialise the cursor so fetching the documents is part of the timing.
        result = list(mydoc)
        counter = Counter(x['rating'] for x in result)
        # Indexing a Counter yields 0 for a missing key; .get() returned None
        # for a movie with only likes or only dislikes, which made the
        # arithmetic below raise TypeError.
        likes = counter[1]
        dislikes = counter[0]
        # Ratings are 0 or 1, so the mean rating equals the share of likes.
        average_rating = likes / (likes + dislikes)
        profiler.checkpoint()

Median - 0.008525967597961426
Average - 0.009981942176818848
Summary - 0.09981942176818848
CPU times: user 9.8 ms, sys: 1.77 ms, total: 11.6 ms
Wall time: 100 ms


In [4]:
import requests

# Root URL of the views API; the real-time test posts to f'{base_url}/api/v1/views/set'.
base_url = 'http://ctube-study.ru:8000'

**Тестирование скорости обработки данных, поступающих в реальном времени (запрос на API → FastAPI backend → Kafka topic → ETL → MongoDB → чтение из MongoDB)**

In [26]:
%%time
with profile_code() as profiler:
    # Reach the collection without rebinding the notebook-wide `db` handle
    # (client['docker']): the previous `db = client['dataBase']` silently
    # redirected every cell executed afterwards to another database.
    topic = client['dataBase']['ugcViews']
    # Upper bound, in seconds, for the HTTP call and for the wait until the
    # document appears; without it a stalled pipeline hung the cell forever.
    wait_limit = 30
    for i in range(10):
        movie_id = str(uuid4())
        user_id = random.randint(1,500)
        view_filter = {'movie_id': movie_id, 'user_id': user_id}

        profiler.setup_start_time()
        new_view = requests.post(f'{base_url}/api/v1/views/set', json = {
              "value": {
                "movie_timestamp": random.randint(1, 86400),
                "event_timestamp": random.randint(633801229, 1675180434),
                "movie_id": movie_id,
                "user_id": user_id
              },
              "topic": "ugcViews"
        }, timeout=wait_limit)
        # Fail fast on a rejected request instead of polling for a document
        # that will never arrive.
        new_view.raise_for_status()

        # Poll without sleeping so the measured latency is not rounded up to a
        # sleep interval.
        deadline = time.monotonic() + wait_limit
        while topic.find_one(view_filter) is None:
            if time.monotonic() > deadline:
                raise TimeoutError(
                    f"view {view_filter} did not reach MongoDB within {wait_limit} s"
                )
        profiler.checkpoint()
        # Remove only the document created by this iteration rather than
        # wiping the whole collection.
        topic.delete_many(view_filter)

Median - 0.019607186317443848
Average - 0.019745945930480957
Summary - 0.19745945930480957
CPU times: user 56.1 ms, sys: 105 µs, total: 56.2 ms
Wall time: 266 ms
