In [1]:
from datetime import datetime
from time import sleep
from urllib.parse import quote_plus
from uuid import uuid4

In [2]:
from bson import ObjectId
from pymongo import ASCENDING, DESCENDING, TEXT, MongoClient

In [3]:
MONGO_SERVICE_DOMAIN = "mongo"
MONGO_SERVICE_PORT = 27017
MONGO_USER = "mongo"
MONGO_PASS = "mongo"
MONGO_DB = "data_playground"
MONGO_COLLECTION = "user_events"

In [4]:
client = MongoClient(
    f"mongodb://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASS)}@{MONGO_SERVICE_DOMAIN}",
    MONGO_SERVICE_PORT,
    uuidRepresentation="standard",
)
db = client[MONGO_DB]
collection = db[MONGO_COLLECTION]

In [5]:
def print_cursor_head(cursor):
    for i, doc in enumerate(cursor):
        print(doc)
        if i >= 2:
            break

In [6]:
def print_cursor_all(cursor):
    for doc in cursor:
        print(doc)

In [7]:
# List all mongo dbs
client.list_database_names()

['admin', 'config', 'data_playground', 'local']

In [8]:
# List all collections of MONGO_DB
db.list_collection_names()

['user_events', 'categories']

In [9]:
# Get all documents
all_docs = collection.find()
print_cursor_head(all_docs)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [10]:
# Get all documents sorted by ascending event_time
all_docs = collection.find().sort(
    "event_time",
    ASCENDING,
)
print_cursor_head(all_docs)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [11]:
# Get all documents sorted by descending event_time
all_docs = collection.find().sort(
    "event_time",
    DESCENDING,
)
print_cursor_head(all_docs)

{'_id': ObjectId('66897a60207ae2581117ee40'), 'event_time': datetime.datetime(2021, 2, 28, 23, 59, 9), 'event_type': 'view', 'product_id': 743182, 'category_id': 2144415935631458761, 'category_code': 'construction.tools.soldering', 'brand': 'kada', 'price': 65.08, 'user_id': 1515915625556087775, 'user_session': 'BejOXRngEW'}
{'_id': ObjectId('66897a60207ae2581117ee3f'), 'event_time': datetime.datetime(2021, 2, 28, 23, 58, 14), 'event_type': 'view', 'product_id': 888273, 'category_id': 2144415921932861531, 'category_code': 'electronics.telephone', 'brand': None, 'price': 10.16, 'user_id': 1515915625611024030, 'user_session': '9pCbKMIcSx'}
{'_id': ObjectId('66897a60207ae2581117ee3e'), 'event_time': datetime.datetime(2021, 2, 28, 23, 58, 9), 'event_type': 'view', 'product_id': 4170534, 'category_id': 2144415939364389423, 'category_code': 'electronics.clocks', 'brand': 'amazfit', 'price': 64.92, 'user_id': 1515915625611024020, 'user_session': 'xNIJBqZdkd'}


In [12]:
# Get only one document
doc = collection.find_one()
doc

{'_id': ObjectId('66897a54207ae258110a6cb8'),
 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6),
 'event_type': 'view',
 'product_id': 1996170,
 'category_id': 2144415922528452715,
 'category_code': 'electronics.telephone',
 'brand': None,
 'price': 31.9,
 'user_id': 1515915625519388267,
 'user_session': 'LJuJVLEjPT'}

In [13]:
# Find documents where event_type == view
docs = collection.find({"event_type": "view"})
print_cursor_head(docs)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [14]:
# Find documents where event_type == view
#   but return only product_id field
docs = collection.find(
    {"event_type": "view"},
    projection={
        # "_id": False,
        "product_id": True,
    },
)
print_cursor_head(docs)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'product_id': 1996170}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'product_id': 139905}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'product_id': 215454}


In [15]:
# Find all distinct values for event_type
distinct_event_types = collection.find(projection={"event_type": True}).distinct(
    key="event_type",
)
print_cursor_all(distinct_event_types)

cart
purchase
view


In [16]:
# Count the total number of documents
count = collection.count_documents({})
count

885129

In [17]:
# Alternative way to count the total number of documents
db.command("count", MONGO_COLLECTION)

{'n': 885129, 'ok': 1.0}

In [18]:
# Count the total number of documents
#   filtered by event_type
for etype in distinct_event_types:
    count = collection.count_documents({"event_type": etype})
    print(f"{etype}: {count}")

cart: 54035
purchase: 37346
view: 793748


In [19]:
# Alternative way to count the total number of documents
#   filtered by event_type
for etype in distinct_event_types:
    count = db.command(
        "count",
        MONGO_COLLECTION,
        query={"event_type": etype},
    )
    print(f"{etype}: {count}")

cart: {'n': 54035, 'ok': 1.0}
purchase: {'n': 37346, 'ok': 1.0}
view: {'n': 793748, 'ok': 1.0}


In [20]:
# Count the total number of documents
#   grouped by event_type
pipeline = [{"$group": {"_id": "$event_type", "count": {"$sum": 1}}}]
result = collection.aggregate(pipeline)
print_cursor_all(result)

{'_id': 'view', 'count': 793748}
{'_id': 'cart', 'count': 54035}
{'_id': 'purchase', 'count': 37346}


In [21]:
# Count the total number of documents
#   grouped by category_code and event_type
pipeline = [
    {
        "$group": {
            "_id": ["$category_code", "$event_type"],
            "count": {"$sum": 1},
            "average_price": {"$avg": "$price"},
        },
    },
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': ['computers.peripherals.keyboard', 'cart'], 'count': 146, 'average_price': 94.99308219178081}
{'_id': ['construction.tools.welding', 'view'], 'count': 3048, 'average_price': 172.22666994750656}
{'_id': ['apparel.glove', 'cart'], 'count': 1, 'average_price': 12.06}


In [22]:
# Sort the count of total number of documents
#   grouped by category_code and event_type
pipeline = [
    {
        "$group": {
            "_id": ["$category_code", "$event_type"],
            "count": {"$sum": 1},
            "average_price": {"$avg": "$price"},
        },
    },
    {"$sort": {"_id.0": 1, "_id.1": 1}},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': [None, 'cart'], 'count': 10409, 'average_price': 71.28231146123547}
{'_id': [None, 'purchase'], 'count': 7568, 'average_price': 65.26603065539112}
{'_id': [None, 'view'], 'count': 218242, 'average_price': 84.18748591013646}


In [23]:
# Sort the count of total number of documents
#   grouped by category_code and event_type
#   where category_code is not None
pipeline = [
    {"$match": {"category_code": {"$ne": None}}},
    {
        "$group": {
            "_id": ["$category_code", "$event_type"],
            "count": {"$sum": 1},
            "average_price": {"$avg": "$price"},
        },
    },
    {"$sort": {"_id.0": 1, "_id.1": 1}},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': ['accessories.bag', 'cart'], 'count': 64, 'average_price': 38.75234375}
{'_id': ['accessories.bag', 'purchase'], 'count': 56, 'average_price': 37.22160714285714}
{'_id': ['accessories.bag', 'view'], 'count': 1947, 'average_price': 55.13916281458654}


In [24]:
# Find the total quantity and revenue by category_code
#   and sort by descending revenue
# Limit the result to the top 3 categories that generated most revenue
# Also rename _id filed to category_code
pipeline = [
    {"$match": {"event_type": "purchase"}},
    {
        "$group": {
            "_id": "$category_code",
            "qty": {"$sum": 1},
            "revenue": {"$sum": "$price"},
        },
    },
    {"$sort": {"revenue": -1}},
    {"$limit": 3},
    {"$addFields": {"category_code": "$_id"}},
    {"$unset": "_id"},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'qty': 6888, 'revenue': 2604764.8, 'category_code': 'computers.components.videocards'}
{'qty': 7568, 'revenue': 493933.32, 'category_code': None}
{'qty': 2557, 'revenue': 364566.8, 'category_code': 'computers.peripherals.printer'}


In [25]:
# Find the total quantity and revenue by category_code
#   and sort by descending revenue
# Get only the 3rd category that generated most revenue
pipeline = [
    {"$match": {"event_type": "purchase"}},
    {
        "$group": {
            "_id": "$category_code",
            "qty": {"$sum": 1},
            "revenue": {"$sum": "$price"},
        },
    },
    {"$sort": {"revenue": -1}},
    {"$skip": 2},
    {"$limit": 1},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': 'computers.peripherals.printer', 'qty': 2557, 'revenue': 364566.8}


In [26]:
start_date = datetime(2020, 9, 24)
end_date = datetime(2020, 9, 25)
ets = collection.find({"event_time": {"$gte": start_date, "$lte": end_date}})
print_cursor_head(ets)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [27]:
# Calculate the average price of the all distinct products
pipeline = [
    {
        "$group": {
            "_id": {"product_id": "$product_id", "price": "$price"},
            "count": {"$sum": 1},
        },
    },
    {
        "$project": {
            "product_id": "$_id.product_id",
            "price": "$_id.price",
        },
    },
    {
        "$group": {
            "_id": None,
            "avg_price": {"$avg": "$price"},
        },
    },
    {"$unset": ["_id", "count"]},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'avg_price': 105.34616130058183}


In [28]:
# Find products with more than 1 price
pipeline = [
    {
        "$group": {
            "_id": {"product_id": "$product_id", "price": "$price"},
            "count": {"$sum": 1},
        },
    },
    {
        "$project": {
            "product_id": "$_id.product_id",
            "price": "$_id.price",
        },
    },
    {
        "$group": {
            "_id": "$product_id",
            "count": {"$sum": 1},
        },
    },
    {"$match": {"count": {"$gt": 1}}},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

In [29]:
# Find min and max price
pipeline = [
    {
        "$group": {
            "_id": None,
            "min_price": {"$min": "$price"},
            "max_price": {"$max": "$price"},
        },
    },
    {"$unset": "_id"},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'min_price': 0.22, 'max_price': 64771.06}


In [30]:
# update document with hard-coded values
result = collection.update_one(
    {"product_id": 1996170},
    {
        "$set": {
            "category_id": 2144415926932472027,
            "category_code": "computers.components.cooler",
        },
    },
)
print(
    collection.find_one(
        {"product_id": 1996170},
        projection=["product_id", "category_id", "category_code"],
    ),
)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'product_id': 1996170, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler'}


In [31]:
# update document with other product data
result = collection.update_one(
    {"product_id": 1996170},
    {
        "$set": collection.find_one(
            {"category_id": 2227847332769039290},
            projection={"_id": False, "category_id": True, "category_code": True},
        ),
    },
)
print(
    collection.find_one(
        {"product_id": 1996170},
        projection=["product_id", "category_id", "category_code"],
    ),
)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'product_id': 1996170, 'category_id': 2227847332769039290, 'category_code': 'auto.accessories.light'}


In [32]:
# update document with other product data
result = collection.delete_many({"product_id": 4183853})
print(collection.find_one({"product_id": 4183853}))

None


In [33]:
# unwind array
result = collection.update_one(
    {"_id": ObjectId("66815bc07748e4801174481d")},
    update={"$set": {"tmp_sizes": ["Small", "Medium", "Large"]}},
)
print(
    collection.find_one(
        {"_id": ObjectId("66815bc07748e4801174481d")},
        projection=["tmp_sizes"],
    ),
)
result_unwind = collection.aggregate([{"$unwind": "$tmp_sizes"}])
print_cursor_all(result_unwind)

None


In [34]:
# collection lookup
result = collection.aggregate(
    [
        {
            "$lookup": {
                "from": "categories",
                "localField": "category_code",
                "foreignField": "category_code",
                "as": "category_details",
            },
        },
    ],
)
print_cursor_head(result)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2227847332769039290, 'category_code': 'auto.accessories.light', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT', 'category_details': [{'_id': ObjectId('668955ef49d62c0f29f74464'), 'category_code': 'auto.accessories.light', 'category_description': 'Foo'}]}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY', 'category_details': []}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, '

In [35]:
# conditional aggregation
result = collection.aggregate(
    [
        {
            "$project": {
                "price": True,
                "is_expensive": {
                    "$cond": [{"$gte": ["$price", 15]}, True, False],
                },
            },
        },
    ],
)
print_cursor_head(result)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'price': 31.9, 'is_expensive': True}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'price': 17.16, 'is_expensive': True}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'price': 9.81, 'is_expensive': False}


In [36]:
# misc array operations: concat, size, diff, intersection, zip
result = collection.update_one(
    {"_id": ObjectId("66815bc07748e4801174481e")},
    update={
        "$set": {
            "tmp_test_array_1": [1, 2, 3],
            "tmp_test_array_2": [1, 4, 5],
        },
    },
)
print(
    collection.find_one(
        {"_id": ObjectId("66815bc07748e4801174481e")},
        projection=["tmp_test_array_1", "tmp_test_array_2"],
    ),
)
result = collection.aggregate(
    [
        {"$match": {"_id": ObjectId("66815bc07748e4801174481e")}},
        {
            "$project": {
                "concat_tmp_test": {
                    "$concatArrays": ["$tmp_test_array_1", "$tmp_test_array_2"],
                },
                "tmp_test_array_1_size": {"$size": "$tmp_test_array_1"},
                "tmp_test_array_2_size": {"$size": "$tmp_test_array_2"},
                "items_in_1_not_in_2": {
                    "$setDifference": ["$tmp_test_array_1", "$tmp_test_array_2"],
                },
                "items_in_2_not_in_1": {
                    "$setDifference": ["$tmp_test_array_2", "$tmp_test_array_1"],
                },
                "items_in_1_and_2": {
                    "$setIntersection": ["$tmp_test_array_1", "$tmp_test_array_2"],
                },
                "1_and_2_zip": {
                    "$zip": {"inputs": ["$tmp_test_array_1", "$tmp_test_array_2"]},
                },
            },
        },
    ],
)
print_cursor_head(result)

None


In [37]:
# datetime operations
result = collection.aggregate(
    [
        {
            "$project": {
                "event_time": True,
                "event_time_year": {"$year": "$event_time"},
                "event_time_month": {"$month": "$event_time"},
                "event_time_week": {"$week": "$event_time"},
                "event_time_day_of_year": {"$dayOfYear": "$event_time"},
                "event_time_day_of_month": {"$dayOfMonth": "$event_time"},
                "event_time_iso_day_of_week": {"$isoDayOfWeek": "$event_time"},
                "event_time_hour": {"$hour": "$event_time"},
                "event_time_minutes": {"$minute": "$event_time"},
                "event_time_seconds": {"$second": "$event_time"},
                "event_time_milliseconds": {"$millisecond": "$event_time"},
            },
        },
    ],
)
print_cursor_head(result)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_time_year': 2020, 'event_time_month': 9, 'event_time_week': 38, 'event_time_day_of_year': 268, 'event_time_day_of_month': 24, 'event_time_iso_day_of_week': 4, 'event_time_hour': 11, 'event_time_minutes': 57, 'event_time_seconds': 6, 'event_time_milliseconds': 0}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_time_year': 2020, 'event_time_month': 9, 'event_time_week': 38, 'event_time_day_of_year': 268, 'event_time_day_of_month': 24, 'event_time_iso_day_of_week': 4, 'event_time_hour': 11, 'event_time_minutes': 57, 'event_time_seconds': 26, 'event_time_milliseconds': 0}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_time_year': 2020, 'event_time_month': 9, 'event_time_week': 38, 'event_time_day_of_year': 268, 'event_time_day_of_month': 24, 'event_time_iso

In [38]:
# string manipulation
result = collection.aggregate(
    [
        {"$match": {"category_code": {"$ne": None}}},
        {
            "$project": {
                "category_code": True,
                "category_code_upper": {"$toUpper": "$category_code"},
            },
        },
    ],
)
print_cursor_head(result)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'category_code': 'auto.accessories.light', 'category_code_upper': 'AUTO.ACCESSORIES.LIGHT'}
{'_id': ObjectId('66897a54207ae258110a6cb9'), 'category_code': 'computers.components.cooler', 'category_code_upper': 'COMPUTERS.COMPONENTS.COOLER'}
{'_id': ObjectId('66897a54207ae258110a6cbb'), 'category_code': 'computers.peripherals.printer', 'category_code_upper': 'COMPUTERS.PERIPHERALS.PRINTER'}


In [39]:
# single index
collection.create_index({"product_id": 1})

'product_id_1'

In [40]:
# compound index
collection.create_index({"product_id": 1, "event_type": 1})

'product_id_1_event_type_1'

In [41]:
# unique index
for doc in collection.find():
    new_uuid = uuid4()
    collection.update_one(
        {"_id": ObjectId(doc["_id"])},
        {"$set": {"unique_mock_field": uuid4()}},
    )
collection.create_index({"unique_mock_field": 1}, unique=True)

'unique_mock_field_1'

In [42]:
# text index
collection.create_index([("category_code", TEXT)], default_language="english")

'category_code_text'

In [43]:
# Find text using search
results = collection.find({"$text": {"$search": "accessories"}})
print_cursor_head(results)

{'_id': ObjectId('66897a60207ae2581117e8d4'), 'event_time': datetime.datetime(2021, 2, 28, 17, 38, 23), 'event_type': 'view', 'product_id': 941183, 'category_id': 2144415924575273120, 'category_code': 'accessories.bag', 'brand': 'sumdex', 'price': 42.0, 'user_id': 1515915625610939240, 'user_session': 'zWYaqSPGCr', 'unique_mock_field': UUID('72b044ef-d06b-4e99-8026-7249a1f68c3a')}
{'_id': ObjectId('66897a60207ae2581117e63a'), 'event_time': datetime.datetime(2021, 2, 28, 15, 23, 42), 'event_type': 'view', 'product_id': 4102595, 'category_id': 2144415952853271466, 'category_code': 'accessories.bag', 'brand': 'thermos', 'price': 28.1, 'user_id': 1515915625610897690, 'user_session': 'Nf452Y2SAr', 'unique_mock_field': UUID('1f73fc96-8720-434e-bc9b-92ba87c9402c')}
{'_id': ObjectId('66897a60207ae2581117e394'), 'event_time': datetime.datetime(2021, 2, 28, 13, 16, 53), 'event_type': 'view', 'product_id': 659916, 'category_id': 2144415924575273120, 'category_code': 'accessories.bag', 'brand': 'hp

In [44]:
# geospatial index
collection.update_many({}, {"$set": {"coordinates": [-73.97, 40.77]}})
collection.create_index([("coordinates", "2dsphere")])

'coordinates_2dsphere'

In [45]:
# Find docs proximate to coordinate [-73.9667, 40.78]
results = collection.find(
    {
        "coordinates": {
            "$nearSphere": {
                "$geometry": {
                    "type": "Point",
                    "coordinates": [-73.9667, 40.78],
                },
                "$minDistance": 1000,
                "$maxDistance": 5000,
            },
        },
    },
)
print_cursor_head(results)

{'_id': ObjectId('66897a54207ae258110a6cb8'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2227847332769039290, 'category_code': 'auto.accessories.light', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT', 'unique_mock_field': UUID('d30392a6-ad1a-4e8e-add7-83e33754973d'), 'coordinates': [-73.97, 40.77]}
{'_id': ObjectId('66897a54207ae258110a6cba'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy', 'unique_mock_field': UUID('1c4fc512-8dbd-4bb0-af67-101f1433b8cd'), 'coordinates': [-73.97, 40.77]}
{'_id': ObjectId('66897a54207ae258110a6cbe'), 'event_time': datetime.datetime(2020, 9, 24, 11, 58, 23), 'event_type': 'view', 'product_id': 3791349, 'category_id': 2144415935086199225,

In [46]:
# Find docs proximate to coordinate [3, 6]
results = collection.find(
    {
        "coordinates": {
            "$nearSphere": {
                "$geometry": {
                    "type": "Point",
                    "coordinates": [3, 6],
                },
                "$minDistance": 1000,
                "$maxDistance": 5000,
            },
        },
    },
)
if list(results):
    print_cursor_head(results)
else:
    print("Empty")

Empty


In [47]:
# hashed index
collection.create_index([("_id", "hashed")])

'_id_hashed'

In [48]:
# Find last event_time and count before applying ttl index
print(db.command("count", MONGO_COLLECTION))
result = collection.aggregate(
    [
        {"$group": {"_id": None, "max_event_time": {"$max": "$event_time"}}},
        {"$unset": "_id"},
    ],
)
print_cursor_head(result)

{'n': 885122, 'ok': 1.0}
{'max_event_time': datetime.datetime(2021, 2, 28, 23, 59, 9)}


In [49]:
# ttl index
collection.create_index([("event_time", 1)], expireAfterSeconds=3600)

'event_time_1'

In [None]:
sleep(180)

In [51]:
# Find last event_time and count after applying ttl index
print(db.command("count", MONGO_COLLECTION))
result = collection.aggregate(
    [
        {"$group": {"_id": None, "max_event_time": {"$max": "$event_time"}}},
        {"$unset": "_id"},
    ],
)
print_cursor_head(result)

{'n': 0, 'ok': 1.0}
