# Imports

In [1]:
import datetime as dt
import os
from urllib.parse import quote_plus

import pymongo

# MongoDB Connection

In [2]:
# Requirements:
# - the container running this notebook must be attached to the same network as the container running MongoDB
# - the container running MongoDB must be called "my-mongodb"
#
# Credentials are taken from the MONGO_USER / MONGO_PASSWORD environment variables so that
# real secrets never have to be committed with the notebook; the fallbacks are the
# placeholder values of the local test setup.
user = os.environ.get('MONGO_USER', 'user')
password = os.environ.get('MONGO_PASSWORD', 'password')

# Username and password must be percent-escaped inside a MongoDB URI, otherwise
# reserved characters such as ':', '/' or '@' break the parsing of the connection string.
url = f'mongodb://{quote_plus(user)}:{quote_plus(password)}@my-mongodb:27017/'

# Short server-selection timeout: fail after 2 s instead of the driver default when the
# server is unreachable.
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=2000)

# Forces a round trip to the server, so connection/auth problems surface in this cell.
client.server_info()

{'version': '4.0.23',
 'gitVersion': '07c6611b38d2aacbdb1846b688db70b3273170fb',
 'modules': [],
 'allocator': 'tcmalloc',
 'javascriptEngine': 'mozjs',
 'sysInfo': 'deprecated',
 'versionArray': [4, 0, 23, 0],
 'openssl': {'running': 'OpenSSL 1.0.2g  1 Mar 2016',
  'compiled': 'OpenSSL 1.0.2g  1 Mar 2016'},
 'buildEnvironment': {'distmod': 'ubuntu1604',
  'distarch': 'x86_64',
  'cc': '/opt/mongodbtoolchain/v2/bin/gcc: gcc (GCC) 5.4.0',
  'ccflags': '-fno-omit-frame-pointer -fno-strict-aliasing -ggdb -pthread -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch -Werror -O2 -Wno-unused-local-typedefs -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-but-set-variable -Wno-missing-braces -fstack-protector-strong -fno-builtin-memcmp',
  'cxx': '/opt/mongodbtoolchain/v2/bin/g++: g++ (GCC) 5.4.0',
  'cxxflags': '-Woverloaded-virtual -Wno-maybe-uninitialized -std=c++14',
  'target_arch': 'x86_64',
  'target_os': 'linux'},
 'bits': 64,
 'debug': False,
 'maxBsonObjectSize': 167

# Get Collection

In [5]:
# Names of the database and collection used throughout this notebook.
DB_NAME = 'my-db'
COLLECTION_NAME = 'my-collection'

# Subscripting only creates handles; the server is queried by the calls below.
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# Show what exists on the server and peek at one document (None when the collection is empty).
print('Databases:', client.list_database_names())
print('Collections:', db.list_collection_names())
print('First Document:', collection.find_one())

Databases: ['my-db']
Collections: ['my-collection']
First Document: None


# Check if DB is empty

In [6]:
# Destructive: uncomment the next line to remove every document from the collection.
# res = collection.delete_many({})

In [7]:
# Count the documents currently in the collection; the empty filter matches every document.
collection.count_documents({})

0

# Insert Some Data

In [9]:
# Fixed reference timestamp (1 March 2021, 13:00:00) so the generated test data is reproducible.
now = dt.datetime(year=2021, month=3, day=1, hour=13, minute=0, second=0)
now

datetime.datetime(2021, 3, 1, 13, 0)

In [10]:
# Template for one stored record, assembled from its logical sections.
# The sections are merged in order, so the resulting key order is:
# metadata, category, status, status flags, data.

# When and from where the record was taken.
_metadata_fields = {
    'datetime': now,
    'url': 'https://www.ebuyer.com/store/Components/cat/Graphics-Cards-Nvidia/subcat/GeForce-RTX-3060-Ti',
}

# What the record is about.
_category_fields = {
    'supplier': 'ebuyer',
    'model': '3060ti',
    'page': 'p1',
}

# Textual outcome plus optional diagnostics.
_status_fields = {
    'status': 'ok',
    'traceback': None,
    'content': None,
}

# 0/1 indicator flags for the outcome.
_flag_fields = {
    'is_request_err': 0,
    'is_parse_err': 0,
    'is_ok_not_available': 1,
    'is_ok_available': 0,
}

# Parsed payload.
_data_fields = {
    'num_products': 24,
    'num_available': 0,
    'items_list': None,
}

item_templalte = {
    **_metadata_fields,
    **_category_fields,
    **_status_fields,
    **_flag_fields,
    **_data_fields,
}

In [11]:
# Possible values of 'status':
# - 'ok' - all good
# - 'error_request_exception' - the request raised an exception
# - 'error_request_403' - the request failed for another reason, e.g. HTTP 403
# - 'error_parse_exception' - parsing raised an exception
# - 'error_parse_no_product' - parsing failed for another reason, e.g. no products were found

In [61]:
# Generate ten hours of synthetic records, one per second, starting at `now`, and insert
# them in a single bulk write instead of one round trip per document.
now = dt.datetime(2021, 3, 1, 13, 0, 0)

SECONDS_PER_HOUR = 3600
total_seconds = SECONDS_PER_HOUR * 10           # 36000 documents in total
request_err_at = SECONDS_PER_HOUR * 1 + 10      # single simulated request error (second hour)
parse_err_at = SECONDS_PER_HOUR * 2 + 10        # single simulated parse error (third hour)
available_every = 3511                          # every 3511th ok record reports one available item

# NOTE(review): the template carries 'is_ok_not_available' / 'is_ok_available' while this
# loop writes 'is_ok', and 'status' stays 'ok' on the two error records - confirm which
# schema is intended before relying on those fields.
documents = []
for i in range(total_seconds):
    # Shallow copy is enough: the template holds only immutable values.
    item_dict = item_templalte.copy()
    item_dict['datetime'] = now + dt.timedelta(seconds=i)

    if i == request_err_at:
        print(i)
        item_dict.update(is_request_err=1, is_parse_err=0, is_ok=0)
    elif i == parse_err_at:
        print(i)
        item_dict.update(is_request_err=0, is_parse_err=1, is_ok=0)
    else:
        item_dict.update(is_request_err=0, is_parse_err=0, is_ok=1)
        item_dict['num_available'] = 1 if i % available_every == 0 else 0

    documents.append(item_dict)

# One bulk insert; the driver splits it into server-sized batches as needed.
insert_result = collection.insert_many(documents)
assert len(insert_result.inserted_ids) == total_seconds

3610
7210


In [62]:
# Count all documents after the insert; 36000 (10 h x 3600 s) are expected when the
# collection was empty beforehand.
collection.count_documents({})

36000

# Query The Data

In [102]:
def build_hourly_max_pipeline(field='is_request_err', date_field='datetime'):
    """Build an aggregation pipeline returning the hourly maximum of `field`.

    Stages:
      1. $project - keep `field` and derive 'datetime_hourly', the document's
         `date_field` formatted as a string truncated to the hour
         (no timezone is passed to $dateToString).
      2. $group   - one group per hour, 'value' is the $max of `field` in that hour.
      3. $project - flatten the group key into a top-level 'parsed_utc' field.
      4. $sort    - ascending by 'parsed_utc'.

    Args:
        field: name of the numeric document field to aggregate (e.g. a 0/1 flag,
            for which the maximum means "occurred at least once in that hour").
        date_field: name of the date field used for the hourly bucketing.

    Returns:
        A list of stage dicts suitable for `collection.aggregate(...)`; each
        result document has the keys 'value' and 'parsed_utc'.
    """
    hour_bucket = {
        "$dateToString": {"format": "%Y-%m-%dT%H:00:00", "date": f"${date_field}"}
    }
    return [
        {"$project": {"_id": 0, field: 1, "datetime_hourly": hour_bucket}},
        {
            "$group": {
                "_id": {"parsed_utc": "$datetime_hourly"},
                "value": {"$max": f"${field}"},
            }
        },
        {"$project": {"_id": 0, "parsed_utc": "$_id.parsed_utc", "value": 1}},
        {"$sort": {"parsed_utc": 1}},
    ]


# Hourly "did any request error occur" series.
pipeline = build_hourly_max_pipeline('is_request_err')

In [103]:
# Run the aggregation, materialise the cursor, and display only the first ten rows.
hourly_rows = list(collection.aggregate(pipeline))
hourly_rows[:10]

[{'value': 0, 'parsed_utc': '2021-03-01T13:00:00'},
 {'value': 1, 'parsed_utc': '2021-03-01T14:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T15:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T16:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T17:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T18:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T19:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T20:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T21:00:00'},
 {'value': 0, 'parsed_utc': '2021-03-01T22:00:00'}]

# Delete Test Data

In [51]:
# Destructive: uncomment the next line to delete all test documents from the collection.
# res = collection.delete_many({})

In [12]:
# Count the remaining documents; 0 means the collection is empty.
collection.count_documents({})

0