<a href="https://colab.research.google.com/github/matthewpecsok/data_engineering/blob/main/tutorials/de_nosql_databases_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial will introduce you to  MongoDB a document database. MongoDB deals in data that is effictively a dictionary in Python or JSON data if it's a file. This document-oriented database is very different from the relational database designs we've seen thus far. We'll learn how to populate the database with data, how to query the database for documents we'd like to find, how to extract data for data engineering purposes.

In [1]:
!pip install faker
!pip install pymongo

Collecting faker
  Downloading Faker-22.2.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-22.2.0
Collecting pymongo
  Downloading pymongo-4.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (677 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m677.1/677.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.4.2 pymongo-4.6.1


In [320]:
from faker import Faker
from pymongo import MongoClient
import random
import datetime
import json

# install MongoDB

In [3]:
!apt-get install gnupg curl

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
curl is already the newest version (7.81.0-1ubuntu1.15).
gnupg is already the newest version (2.2.27-3ubuntu2.1).
gnupg set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [4]:
!curl -fsSL https://pgp.mongodb.com/server-7.0.asc | \
   sudo gpg -o /usr/share/keyrings/mongodb-server-7.0.gpg \
   --dearmor

In [5]:
!echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-7.0.list

deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse


In [6]:
!apt-get update

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:4 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:7 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 Release [2,090 B]
Get:8 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 Release.gpg [866 B]
Get:9 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0/multiverse amd64 Packages [22.9 kB]
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,617 kB]
Hit:12 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:13 https://repo.

In [7]:
!apt-get install -y mongodb-org

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  mongodb-database-tools mongodb-mongosh mongodb-org-database mongodb-org-database-tools-extra
  mongodb-org-mongos mongodb-org-server mongodb-org-shell mongodb-org-tools
The following NEW packages will be installed:
  mongodb-database-tools mongodb-mongosh mongodb-org mongodb-org-database
  mongodb-org-database-tools-extra mongodb-org-mongos mongodb-org-server mongodb-org-shell
  mongodb-org-tools
0 upgraded, 9 newly installed, 0 to remove and 24 not upgraded.
Need to get 162 MB of archives.
After this operation, 530 MB of additional disk space will be used.
Get:1 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0/multiverse amd64 mongodb-database-tools amd64 100.9.4 [51.9 MB]
Get:2 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0/multiverse amd64 mongodb-mongosh amd64 2.1.1 [47.9 MB]
Get:3 https://repo.mongodb.org/apt/ub

In [11]:
!mkdir /data
!mkdir /data/db

In [12]:
import subprocess
subprocess.Popen(["mongod"])

<Popen: returncode: None args: ['mongod']>

In [326]:
from pymongo import MongoClient
client = MongoClient()
client.list_database_names() # ['admin', 'local']

['admin', 'cloud_purchase_db', 'config', 'local']

## create the db

name our database

In [327]:
db = client['cloud_purchase_db']

## Set up Collections

drop the collections in case they already exist so we don't duplicate data

In [328]:
db.drop_collection('customers')
db.drop_collection('products')
#db.drop_collection('orders')

{'nIndexesWas': 1, 'ns': 'cloud_purchase_db.products', 'ok': 1.0}

do we have any collections?

## list collections

In [329]:
db.list_collection_names()

[]

In [330]:
# Create Faker instance
fake = Faker()

# Create the data

## create customer data

create from 1 to 5 orders (randomly)
pick a random product id from 1 to 10
units between 10 and 100
year from 1 year ago to today

In [331]:
# Generate customer orders
def gen_orders():

  orders = []
  for i in range(1,random.randint(1, 5)):

      productid = random.randint(1, 10)
      units = random.randint(10, 100)
      if random.random() < 0.1:
          units *= 10
      purchase_date = fake.date_between(start_date='-1y', end_date='today')
      purchase_date = datetime.datetime.combine(purchase_date, datetime.datetime.min.time())

      order = {
          'productid': productid,
          'units': units,
          'purchase_date': purchase_date
      }

      orders.append(order)

  return orders

In [332]:
gen_orders()

[{'productid': 5,
  'units': 22,
  'purchase_date': datetime.datetime(2023, 6, 8, 0, 0)}]

generate 100 customers

In [333]:
# Generate customers
customers = []
for i in range(100):
    customer = {
        'customerid': i+1,
        'name': fake.name(),
        'email': fake.email(),
        'phone': [fake.phone_number(),fake.phone_number(),fake.phone_number()],
        'orders' : gen_orders() #this is where we generate orders
    }
    customers.append(customer)

In [334]:
len(customers)

100

In [335]:
customers[0:3]

[{'customerid': 1,
  'name': 'Michael Garcia',
  'email': 'plove@example.net',
  'phone': ['233-642-9599', '(777)557-9252x39540', '954.766.9840x6835'],
  'orders': [{'productid': 2,
    'units': 68,
    'purchase_date': datetime.datetime(2023, 1, 16, 0, 0)},
   {'productid': 9,
    'units': 41,
    'purchase_date': datetime.datetime(2023, 5, 2, 0, 0)},
   {'productid': 2,
    'units': 96,
    'purchase_date': datetime.datetime(2023, 9, 9, 0, 0)},
   {'productid': 2,
    'units': 34,
    'purchase_date': datetime.datetime(2023, 10, 13, 0, 0)}]},
 {'customerid': 2,
  'name': 'Karen Bridges',
  'email': 'andrewmcconnell@example.com',
  'phone': ['001-457-474-1212x1260',
   '+1-663-483-6068x68501',
   '+1-313-550-7481x5221'],
  'orders': [{'productid': 7,
    'units': 53,
    'purchase_date': datetime.datetime(2023, 8, 24, 0, 0)},
   {'productid': 9,
    'units': 95,
    'purchase_date': datetime.datetime(2023, 1, 17, 0, 0)},
   {'productid': 1,
    'units': 56,
    'purchase_date': dateti

In [336]:
# Insert customers into MongoDB
db.customers.insert_many(customers)

InsertManyResult([ObjectId('65a2015d4de10d2f8cecb673'), ObjectId('65a2015d4de10d2f8cecb674'), ObjectId('65a2015d4de10d2f8cecb675'), ObjectId('65a2015d4de10d2f8cecb676'), ObjectId('65a2015d4de10d2f8cecb677'), ObjectId('65a2015d4de10d2f8cecb678'), ObjectId('65a2015d4de10d2f8cecb679'), ObjectId('65a2015d4de10d2f8cecb67a'), ObjectId('65a2015d4de10d2f8cecb67b'), ObjectId('65a2015d4de10d2f8cecb67c'), ObjectId('65a2015d4de10d2f8cecb67d'), ObjectId('65a2015d4de10d2f8cecb67e'), ObjectId('65a2015d4de10d2f8cecb67f'), ObjectId('65a2015d4de10d2f8cecb680'), ObjectId('65a2015d4de10d2f8cecb681'), ObjectId('65a2015d4de10d2f8cecb682'), ObjectId('65a2015d4de10d2f8cecb683'), ObjectId('65a2015d4de10d2f8cecb684'), ObjectId('65a2015d4de10d2f8cecb685'), ObjectId('65a2015d4de10d2f8cecb686'), ObjectId('65a2015d4de10d2f8cecb687'), ObjectId('65a2015d4de10d2f8cecb688'), ObjectId('65a2015d4de10d2f8cecb689'), ObjectId('65a2015d4de10d2f8cecb68a'), ObjectId('65a2015d4de10d2f8cecb68b'), ObjectId('65a2015d4de10d2f8cecb6

## create products data

In [337]:
# Generate products
products = []
for i in range(10):
    product = {
        'productid': i+1,
        'category': random.choice(['Electronics', 'Clothing', 'Books', 'Home']),
        'price': random.randint(1, 100)
    }
    products.append(product)

In [338]:
products[0:3]

[{'productid': 1, 'category': 'Electronics', 'price': 66},
 {'productid': 2, 'category': 'Clothing', 'price': 4},
 {'productid': 3, 'category': 'Electronics', 'price': 78}]

put the 10 products into the database

In [339]:
# Insert products into MongoDB
db.products.insert_many(products)

InsertManyResult([ObjectId('65a2015d4de10d2f8cecb6d7'), ObjectId('65a2015d4de10d2f8cecb6d8'), ObjectId('65a2015d4de10d2f8cecb6d9'), ObjectId('65a2015d4de10d2f8cecb6da'), ObjectId('65a2015d4de10d2f8cecb6db'), ObjectId('65a2015d4de10d2f8cecb6dc'), ObjectId('65a2015d4de10d2f8cecb6dd'), ObjectId('65a2015d4de10d2f8cecb6de'), ObjectId('65a2015d4de10d2f8cecb6df'), ObjectId('65a2015d4de10d2f8cecb6e0')], acknowledged=True)

In [340]:
db.list_collection_names() # list collections

['products', 'customers']

loop through collections and count the number of documents

In [341]:
for collection_name in db.list_collection_names():
  collection = db.get_collection(collection_name)
  print(f'{collection_name}:{collection.count_documents({})}')

products:10
customers:100


# Querying



## Customers

Find all customers.

In [342]:
for document in db.customers.find({}):
  print(document)

{'_id': ObjectId('65a2015d4de10d2f8cecb673'), 'customerid': 1, 'name': 'Michael Garcia', 'email': 'plove@example.net', 'phone': ['233-642-9599', '(777)557-9252x39540', '954.766.9840x6835'], 'orders': [{'productid': 2, 'units': 68, 'purchase_date': datetime.datetime(2023, 1, 16, 0, 0)}, {'productid': 9, 'units': 41, 'purchase_date': datetime.datetime(2023, 5, 2, 0, 0)}, {'productid': 2, 'units': 96, 'purchase_date': datetime.datetime(2023, 9, 9, 0, 0)}, {'productid': 2, 'units': 34, 'purchase_date': datetime.datetime(2023, 10, 13, 0, 0)}]}
{'_id': ObjectId('65a2015d4de10d2f8cecb674'), 'customerid': 2, 'name': 'Karen Bridges', 'email': 'andrewmcconnell@example.com', 'phone': ['001-457-474-1212x1260', '+1-663-483-6068x68501', '+1-313-550-7481x5221'], 'orders': [{'productid': 7, 'units': 53, 'purchase_date': datetime.datetime(2023, 8, 24, 0, 0)}, {'productid': 9, 'units': 95, 'purchase_date': datetime.datetime(2023, 1, 17, 0, 0)}, {'productid': 1, 'units': 56, 'purchase_date': datetime.dat

find customerid 76

In [343]:
import pprint

In [344]:
for document in db.customers.find({'customerid':76}):
  print(pprint.pprint(document))

{'_id': ObjectId('65a2015d4de10d2f8cecb6be'),
 'customerid': 76,
 'email': 'patrick44@example.net',
 'name': 'Heather Wallace',
 'orders': [{'productid': 1,
             'purchase_date': datetime.datetime(2023, 11, 14, 0, 0),
             'units': 890},
            {'productid': 3,
             'purchase_date': datetime.datetime(2023, 12, 14, 0, 0),
             'units': 96},
            {'productid': 5,
             'purchase_date': datetime.datetime(2023, 5, 27, 0, 0),
             'units': 610}],
 'phone': ['+1-949-234-9451x0498', '2204570836', '001-856-669-6052x032']}
None


## Products

find all products

In [345]:
for document in db.products.find({}):
  print(document)

{'_id': ObjectId('65a2015d4de10d2f8cecb6d7'), 'productid': 1, 'category': 'Electronics', 'price': 66}
{'_id': ObjectId('65a2015d4de10d2f8cecb6d8'), 'productid': 2, 'category': 'Clothing', 'price': 4}
{'_id': ObjectId('65a2015d4de10d2f8cecb6d9'), 'productid': 3, 'category': 'Electronics', 'price': 78}
{'_id': ObjectId('65a2015d4de10d2f8cecb6da'), 'productid': 4, 'category': 'Books', 'price': 16}
{'_id': ObjectId('65a2015d4de10d2f8cecb6db'), 'productid': 5, 'category': 'Electronics', 'price': 11}
{'_id': ObjectId('65a2015d4de10d2f8cecb6dc'), 'productid': 6, 'category': 'Books', 'price': 16}
{'_id': ObjectId('65a2015d4de10d2f8cecb6dd'), 'productid': 7, 'category': 'Clothing', 'price': 46}
{'_id': ObjectId('65a2015d4de10d2f8cecb6de'), 'productid': 8, 'category': 'Clothing', 'price': 79}
{'_id': ObjectId('65a2015d4de10d2f8cecb6df'), 'productid': 9, 'category': 'Electronics', 'price': 38}
{'_id': ObjectId('65a2015d4de10d2f8cecb6e0'), 'productid': 10, 'category': 'Home', 'price': 50}


find all products with prices less than 40

In [346]:
for document in db.products.find({'price': {'$lt':40}}):
  print(document)

{'_id': ObjectId('65a2015d4de10d2f8cecb6d8'), 'productid': 2, 'category': 'Clothing', 'price': 4}
{'_id': ObjectId('65a2015d4de10d2f8cecb6da'), 'productid': 4, 'category': 'Books', 'price': 16}
{'_id': ObjectId('65a2015d4de10d2f8cecb6db'), 'productid': 5, 'category': 'Electronics', 'price': 11}
{'_id': ObjectId('65a2015d4de10d2f8cecb6dc'), 'productid': 6, 'category': 'Books', 'price': 16}
{'_id': ObjectId('65a2015d4de10d2f8cecb6df'), 'productid': 9, 'category': 'Electronics', 'price': 38}


find all products with category clothing

In [347]:
for document in db.products.find({'category': 'Clothing'}):
  print(document)

{'_id': ObjectId('65a2015d4de10d2f8cecb6d8'), 'productid': 2, 'category': 'Clothing', 'price': 4}
{'_id': ObjectId('65a2015d4de10d2f8cecb6dd'), 'productid': 7, 'category': 'Clothing', 'price': 46}
{'_id': ObjectId('65a2015d4de10d2f8cecb6de'), 'productid': 8, 'category': 'Clothing', 'price': 79}


find all products with price less than 40 AND category Clothing

In [348]:
for document in db.products.find({'price': {'$lt':40},'category': 'Clothing'}):
  print(document)

{'_id': ObjectId('65a2015d4de10d2f8cecb6d8'), 'productid': 2, 'category': 'Clothing', 'price': 4}


## Orders

In [349]:
# Calculate the total number of orders
pipeline = [
    {
        '$unwind': '$orders'
    },
    {
        '$group': {
            '_id': None,
            'total_orders': {'$sum': 1}
        }
    },
    {
        '$project': {
            '_id': 0,
            'total_orders': 1
        }
    }
]

result = db.customers.aggregate(pipeline)

# Extract the total number of orders
total_orders = next(result)['total_orders']

# Print the total number of orders
print(f"Total Orders: {total_orders}")

Total Orders: 205


In [350]:
filter_criteria = {
    'orders.units': {'$lt': 45000}
}

# Calculate the total number of orders with units less than 12000
pipeline = [
    {
      '$unwind': '$orders'
    },
    {
      '$match': filter_criteria
    },
    {
        '$group': {
            '_id': None,
            'total_orders': {'$sum': 1}
        }
    },
    {
        '$project': {
            '_id': 0,
            'total_orders': 1
        }
    }
]

result = db.customers.aggregate(pipeline)

# Extract the total number of orders
total_orders = next(result)['total_orders']

# Print the total number of orders
print(f"Total Orders: {total_orders}")

Total Orders: 205


# Extraction


## Dump mongodb data to json file.

create a JSON file with the filtered data.

In [351]:
from bson.json_util import dumps
import json

open a file. create a cursor that gets passed to dumps which takes binary json data and converts it serialized json, deseralizes it to python object and then serializes it back to a file.  

## dump orders

## dump customers

(and orders as they are nested)

In [352]:
with open('customers.json', 'w') as file:
  cursor = db.customers.find({})
  file.write(dumps(cursor))

## dump products

In [353]:
with open('products.json', 'w') as file:
  cursor = db.products.find({})
  file.write(dumps(cursor))

In [354]:
!ls -lh *.json

-rw-r--r-- 1 root root  38K Jan 13 03:19 customers.json
-rw-r--r-- 1 root root 1002 Jan 13 03:19 products.json


# Reading JSON file data into Python

## JSON into Pandas DataFrame

In [355]:
import pandas as pd

In [356]:
customers_df = pd.read_json('customers.json')
customers_df.head(2)

Unnamed: 0,_id,customerid,name,email,phone,orders
0,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...","[{'productid': 2, 'units': 68, 'purchase_date'..."
1,{'$oid': '65a2015d4de10d2f8cecb674'},2,Karen Bridges,andrewmcconnell@example.com,"[001-457-474-1212x1260, +1-663-483-6068x68501,...","[{'productid': 7, 'units': 53, 'purchase_date'..."


In [357]:
products_df = pd.read_json('products.json')
products_df.head(2)

Unnamed: 0,_id,productid,category,price
0,{'$oid': '65a2015d4de10d2f8cecb6d7'},1,Electronics,66
1,{'$oid': '65a2015d4de10d2f8cecb6d8'},2,Clothing,4


### Merge DataFrames

In [358]:
customers_df.head(4)

Unnamed: 0,_id,customerid,name,email,phone,orders
0,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...","[{'productid': 2, 'units': 68, 'purchase_date'..."
1,{'$oid': '65a2015d4de10d2f8cecb674'},2,Karen Bridges,andrewmcconnell@example.com,"[001-457-474-1212x1260, +1-663-483-6068x68501,...","[{'productid': 7, 'units': 53, 'purchase_date'..."
2,{'$oid': '65a2015d4de10d2f8cecb675'},3,April Acevedo,elizabethbarrera@example.net,"[(535)545-4553, 001-404-429-5749x1377, 525-822...","[{'productid': 8, 'units': 970, 'purchase_date..."
3,{'$oid': '65a2015d4de10d2f8cecb676'},4,Taylor Watts,jmiller@example.com,"[5299700959, 560.262.6289, (987)910-1259x32587]","[{'productid': 1, 'units': 81, 'purchase_date'..."


In [359]:
orders_df = customers_df.explode('orders')

In [360]:
df_orders_expanded = pd.concat([orders_df.drop(['orders'], axis=1), orders_df['orders'].apply(pd.Series)], axis=1)
df_orders_expanded.head(3)


Unnamed: 0,_id,customerid,name,email,phone,productid,units,purchase_date,0
0,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...",2.0,68.0,{'$date': '2023-01-16T00:00:00Z'},
0,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...",9.0,41.0,{'$date': '2023-05-02T00:00:00Z'},
0,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...",2.0,96.0,{'$date': '2023-09-09T00:00:00Z'},


In [361]:
customer_product_orders = df_orders_expanded.merge(products_df, on='productid', how='left')
customer_product_orders.head(3)

Unnamed: 0,_id_x,customerid,name,email,phone,productid,units,purchase_date,0,_id_y,category,price
0,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...",2.0,68.0,{'$date': '2023-01-16T00:00:00Z'},,{'$oid': '65a2015d4de10d2f8cecb6d8'},Clothing,4.0
1,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...",9.0,41.0,{'$date': '2023-05-02T00:00:00Z'},,{'$oid': '65a2015d4de10d2f8cecb6df'},Electronics,38.0
2,{'$oid': '65a2015d4de10d2f8cecb673'},1,Michael Garcia,plove@example.net,"[233-642-9599, (777)557-9252x39540, 954.766.98...",2.0,96.0,{'$date': '2023-09-09T00:00:00Z'},,{'$oid': '65a2015d4de10d2f8cecb6d8'},Clothing,4.0


In [362]:
customer_product_orders['total_sales'] = customer_product_orders['price'] * customer_product_orders['units']

In [363]:
customer_product_orders.groupby(by='category').agg({'total_sales': 'sum'}).sort_values(by='total_sales', ascending=False)

Unnamed: 0_level_0,total_sales
category,Unnamed: 1_level_1
Electronics,414809.0
Clothing,302975.0
Home,112350.0
Books,38528.0
