In [1]:
import os
from datetime import datetime
from itertools import islice
from urllib.parse import quote_plus

In [2]:
from pymongo import ASCENDING, DESCENDING, MongoClient

In [3]:
# Connection settings for the playground MongoDB instance.
# Every value can be overridden through an environment variable of the same
# name, so real credentials never have to be written into the notebook; the
# fallbacks are the values this notebook originally hard-coded.
MONGO_SERVICE_DOMAIN = os.environ.get("MONGO_SERVICE_DOMAIN", "mongo")
# Environment values are strings, so the port is converted to an integer.
MONGO_SERVICE_PORT = int(os.environ.get("MONGO_SERVICE_PORT", "27017"))
MONGO_USER = os.environ.get("MONGO_USER", "mongo")
MONGO_PASS = os.environ.get("MONGO_PASS", "mongo")
MONGO_DB = os.environ.get("MONGO_DB", "data_playground")
MONGO_COLLECTION = os.environ.get("MONGO_COLLECTION", "user_events")

In [4]:
# Build the connection URI first. The user name and password are
# percent-encoded so reserved characters in them cannot break the URI.
mongo_uri = f"mongodb://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASS)}@{MONGO_SERVICE_DOMAIN}"
client = MongoClient(host=mongo_uri, port=MONGO_SERVICE_PORT)

# Handles to the database and collection queried by the rest of the notebook.
db = client[MONGO_DB]
collection = db[MONGO_COLLECTION]

In [5]:
def print_cursor_head(cursor, limit=3):
    """Print the first ``limit`` documents yielded by ``cursor``.

    Args:
        cursor: Any iterable of documents, such as the result of ``find()``
            or ``aggregate()``.
        limit: Maximum number of documents to print. Defaults to 3, the
            preview size that used to be hard-coded. Values below 1 print
            nothing.

    At most ``limit`` items are consumed from ``cursor``; the rest is left
    untouched.
    """
    # islice rejects negative stops, so clamp to 0 instead of raising.
    for doc in islice(cursor, max(limit, 0)):
        print(doc)

In [6]:
def print_cursor_all(cursor):
    """Print every item yielded by ``cursor``, one per line.

    Args:
        cursor: Any iterable; it is consumed completely.
    """
    # map() is lazy, so each item is still rendered and printed in turn.
    for rendered in map(str, cursor):
        print(rendered)

In [7]:
# List the names of all databases available through this client
client.list_database_names()

['admin', 'config', 'data_playground', 'local']

In [8]:
# List the names of all collections in the MONGO_DB database
db.list_collection_names()

['user_events']

In [9]:
# Fetch every document in the collection (an empty filter matches all)
#   and preview the first ones
all_docs = collection.find(filter={})
print_cursor_head(all_docs)

{'_id': ObjectId('66815bc07748e4801174481d'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66815bc07748e4801174481e'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66815bc07748e4801174481f'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [10]:
# Fetch all documents ordered by event_time, earliest first
all_docs = collection.find().sort([("event_time", ASCENDING)])
print_cursor_head(all_docs)

{'_id': ObjectId('66815bc07748e4801174481d'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66815bc07748e4801174481e'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66815bc07748e4801174481f'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [11]:
# Fetch all documents ordered by event_time, latest first
all_docs = collection.find().sort([("event_time", DESCENDING)])
print_cursor_head(all_docs)

{'_id': ObjectId('66815bcd7748e4801181c9a5'), 'event_time': datetime.datetime(2021, 2, 28, 23, 59, 9), 'event_type': 'view', 'product_id': 743182, 'category_id': 2144415935631458761, 'category_code': 'construction.tools.soldering', 'brand': 'kada', 'price': 65.08, 'user_id': 1515915625556087775, 'user_session': 'BejOXRngEW'}
{'_id': ObjectId('66815bcd7748e4801181c9a4'), 'event_time': datetime.datetime(2021, 2, 28, 23, 58, 14), 'event_type': 'view', 'product_id': 888273, 'category_id': 2144415921932861531, 'category_code': 'electronics.telephone', 'brand': None, 'price': 10.16, 'user_id': 1515915625611024030, 'user_session': '9pCbKMIcSx'}
{'_id': ObjectId('66815bcd7748e4801181c9a3'), 'event_time': datetime.datetime(2021, 2, 28, 23, 58, 9), 'event_type': 'view', 'product_id': 4170534, 'category_id': 2144415939364389423, 'category_code': 'electronics.clocks', 'brand': 'amazfit', 'price': 64.92, 'user_id': 1515915625611024020, 'user_session': 'xNIJBqZdkd'}


In [12]:
# Fetch a single document; the empty filter places no condition on it
doc = collection.find_one(filter={})
doc

{'_id': ObjectId('66815bc07748e4801174481d'),
 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6),
 'event_type': 'view',
 'product_id': 1996170,
 'category_id': 2144415922528452715,
 'category_code': 'electronics.telephone',
 'brand': None,
 'price': 31.9,
 'user_id': 1515915625519388267,
 'user_session': 'LJuJVLEjPT'}

In [13]:
# Find the documents whose event_type equals "view"
docs = collection.find(filter={"event_type": "view"})
print_cursor_head(docs)

{'_id': ObjectId('66815bc07748e4801174481d'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66815bc07748e4801174481e'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66815bc07748e4801174481f'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [14]:
# Find the documents whose event_type equals "view"
#   but return only the product_id field
# Note: _id is still part of each result because it is included by default;
#   add "_id": False to the projection to leave it out
docs = collection.find(
    filter={"event_type": "view"},
    projection={"product_id": True},
)
print_cursor_head(docs)

{'_id': ObjectId('66815bc07748e4801174481d'), 'product_id': 1996170}
{'_id': ObjectId('66815bc07748e4801174481e'), 'product_id': 139905}
{'_id': ObjectId('66815bc07748e4801174481f'), 'product_id': 215454}


In [15]:
# Find all distinct values for event_type
# Collection.distinct asks the server for the distinct values directly, so no
#   find() cursor or projection is needed. The result is a plain list, which
#   later cells iterate over again.
distinct_event_types = collection.distinct("event_type")
print_cursor_all(distinct_event_types)

cart
purchase
view


In [16]:
# Count all documents in the collection (an empty filter matches all)
count = collection.count_documents(filter={})
count

885129

In [17]:
# Alternative way to count the total number of documents:
#   run the "count" database command against the collection.
# The raw command reply is displayed, not just the number.
db.command("count", value=MONGO_COLLECTION)

{'n': 885129, 'ok': 1.0}

In [18]:
# Count the documents of each event type,
#   issuing one filtered count per distinct event_type value
for etype in distinct_event_types:
    count = collection.count_documents(filter={"event_type": etype})
    print(f"{etype}: {count}")

cart: 54035
purchase: 37346
view: 793748


In [19]:
# Alternative way to count the documents of each event type:
#   run the "count" database command with a query per event_type value.
# Here `count` holds the raw command reply (a document), which is printed as is.
for etype in distinct_event_types:
    count = db.command("count", value=MONGO_COLLECTION, query={"event_type": etype})
    print(f"{etype}: {count}")

cart: {'n': 54035, 'ok': 1.0}
purchase: {'n': 37346, 'ok': 1.0}
view: {'n': 793748, 'ok': 1.0}


In [20]:
# Count the documents per event_type in a single aggregation
# Each input document adds 1 to the counter of its event_type group
group_by_event_type = {"$group": {"_id": "$event_type", "count": {"$sum": 1}}}
pipeline = [group_by_event_type]
result = collection.aggregate(pipeline)
print_cursor_all(result)

{'_id': 'cart', 'count': 54035}
{'_id': 'purchase', 'count': 37346}
{'_id': 'view', 'count': 793748}


In [21]:
# Count the documents and compute the average price
#   for each (category_code, event_type) combination
# The group key is a two-element array: [category_code, event_type]
group_stage = {
    "$group": {
        "_id": ["$category_code", "$event_type"],
        "count": {"$sum": 1},
        "average_price": {"$avg": "$price"},
    },
}
pipeline = [group_stage]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': ['appliances.kitchen.fryer', 'cart'], 'count': 23, 'average_price': 63.39173913043478}
{'_id': ['appliances.environment.vacuum', 'view'], 'count': 8483, 'average_price': 166.009002711305}
{'_id': ['appliances.iron', 'purchase'], 'count': 8, 'average_price': 69.64874999999999}


In [22]:
# Count the documents and compute the average price
#   for each (category_code, event_type) combination,
#   then sort the groups by category_code and event_type
group_stage = {
    "$group": {
        "_id": ["$category_code", "$event_type"],
        "count": {"$sum": 1},
        "average_price": {"$avg": "$price"},
    },
}
# "_id.0" and "_id.1" address the two elements of the array-valued group key
sort_stage = {"$sort": {"_id.0": ASCENDING, "_id.1": ASCENDING}}
pipeline = [group_stage, sort_stage]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': [None, 'cart'], 'count': 10409, 'average_price': 71.28231146123547}
{'_id': [None, 'purchase'], 'count': 7568, 'average_price': 65.26603065539112}
{'_id': [None, 'view'], 'count': 218242, 'average_price': 84.18748591013646}


In [23]:
# Count the documents and compute the average price
#   for each (category_code, event_type) combination,
#   sorted by category_code and event_type,
#   leaving out documents whose category_code is None
match_stage = {"$match": {"category_code": {"$ne": None}}}
group_stage = {
    "$group": {
        "_id": ["$category_code", "$event_type"],
        "count": {"$sum": 1},
        "average_price": {"$avg": "$price"},
    },
}
# "_id.0" and "_id.1" address the two elements of the array-valued group key
sort_stage = {"$sort": {"_id.0": ASCENDING, "_id.1": ASCENDING}}
pipeline = [match_stage, group_stage, sort_stage]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': ['accessories.bag', 'cart'], 'count': 64, 'average_price': 38.75234375}
{'_id': ['accessories.bag', 'purchase'], 'count': 56, 'average_price': 37.22160714285714}
{'_id': ['accessories.bag', 'view'], 'count': 1947, 'average_price': 55.13916281458654}


In [24]:
# Find the total quantity and revenue by category_code,
#   considering purchase events only,
#   and sort by descending revenue
# Limit the result to the top 3 categories that generated most revenue
# Also rename the _id field to category_code
purchases_only = {"$match": {"event_type": "purchase"}}
totals_by_category = {
    "$group": {
        "_id": "$category_code",
        "qty": {"$sum": 1},
        "revenue": {"$sum": "$price"},
    },
}
pipeline = [
    purchases_only,
    totals_by_category,
    {"$sort": {"revenue": DESCENDING}},
    {"$limit": 3},
    # Rename in two steps: copy _id into category_code, then drop _id
    {"$addFields": {"category_code": "$_id"}},
    {"$unset": "_id"},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'qty': 6888, 'revenue': 2604764.8, 'category_code': 'computers.components.videocards'}
{'qty': 7568, 'revenue': 493933.32, 'category_code': None}
{'qty': 2557, 'revenue': 364566.8, 'category_code': 'computers.peripherals.printer'}


In [25]:
# Find the total quantity and revenue by category_code,
#   considering purchase events only,
#   and sort by descending revenue
# Keep only the category ranked 3rd by revenue
purchases_only = {"$match": {"event_type": "purchase"}}
totals_by_category = {
    "$group": {
        "_id": "$category_code",
        "qty": {"$sum": 1},
        "revenue": {"$sum": "$price"},
    },
}
pipeline = [
    purchases_only,
    totals_by_category,
    {"$sort": {"revenue": DESCENDING}},
    # Skip the two best categories, then keep exactly one
    {"$skip": 2},
    {"$limit": 1},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'_id': 'computers.peripherals.printer', 'qty': 2557, 'revenue': 364566.8}


In [26]:
# Find the documents whose event_time lies between start_date and end_date
# Both bounds are inclusive ($gte / $lte)
start_date = datetime(year=2020, month=9, day=24)
end_date = datetime(year=2020, month=9, day=25)
event_time_range = {"$gte": start_date, "$lte": end_date}
ets = collection.find({"event_time": event_time_range})
print_cursor_head(ets)

{'_id': ObjectId('66815bc07748e4801174481d'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 6), 'event_type': 'view', 'product_id': 1996170, 'category_id': 2144415922528452715, 'category_code': 'electronics.telephone', 'brand': None, 'price': 31.9, 'user_id': 1515915625519388267, 'user_session': 'LJuJVLEjPT'}
{'_id': ObjectId('66815bc07748e4801174481e'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 26), 'event_type': 'view', 'product_id': 139905, 'category_id': 2144415926932472027, 'category_code': 'computers.components.cooler', 'brand': 'zalman', 'price': 17.16, 'user_id': 1515915625519380411, 'user_session': 'tdicluNnRY'}
{'_id': ObjectId('66815bc07748e4801174481f'), 'event_time': datetime.datetime(2020, 9, 24, 11, 57, 27), 'event_type': 'view', 'product_id': 215454, 'category_id': 2144415927158964449, 'category_code': None, 'brand': None, 'price': 9.81, 'user_id': 1515915625513238515, 'user_session': '4TMArHtXQy'}


In [27]:
# Calculate the average price of all distinct products
# A product is identified by its (product_id, price) pair, so a product that
#   appears with several prices contributes once per price
pipeline = [
    # Collapse the events to one document per distinct (product_id, price) pair
    {"$group": {"_id": {"product_id": "$product_id", "price": "$price"}}},
    # A constant _id puts every pair into one group, whose prices are averaged
    {"$group": {"_id": None, "avg_price": {"$avg": "$_id.price"}}},
    # Only avg_price is of interest in the result
    {"$unset": "_id"},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'avg_price': 105.34616130058183}


In [28]:
# Find products with more than 1 price
pipeline = [
    # Collapse the events to one document per distinct (product_id, price) pair
    {"$group": {"_id": {"product_id": "$product_id", "price": "$price"}}},
    # Count how many distinct prices each product_id has
    {"$group": {"_id": "$_id.product_id", "count": {"$sum": 1}}},
    # Keep only the products seen with at least two different prices
    {"$match": {"count": {"$gt": 1}}},
]
result = collection.aggregate(pipeline)
print_cursor_head(result)

In [29]:
# Find the lowest and the highest price over all documents
# A constant _id puts every document into one single group
price_range_stage = {
    "$group": {
        "_id": None,
        "min_price": {"$min": "$price"},
        "max_price": {"$max": "$price"},
    },
}
pipeline = [price_range_stage, {"$unset": "_id"}]
result = collection.aggregate(pipeline)
print_cursor_head(result)

{'min_price': 0.22, 'max_price': 64771.06}
