<a href="https://colab.research.google.com/github/matthewpecsok/data_engineering/blob/main/tutorials/de_nosql_databases_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial will introduce you to  MongoDB a document database. MongoDB deals in data that is effictively a dictionary in Python or JSON data if it's a file. This document-oriented database is very different from the relational database designs we've seen thus far. We'll learn how to populate the database with data, how to query the database for documents we'd like to find, how to extract data for data engineering purposes.

In [1]:
!pip install faker
!pip install pymongo

Collecting faker
  Downloading Faker-22.2.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-22.2.0
Collecting pymongo
  Downloading pymongo-4.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (677 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m677.1/677.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.4.2 pymongo-4.6.1


In [2]:
from faker import Faker
from pymongo import MongoClient
import random
import datetime

# install MongoDB

In [3]:
!apt-get install gnupg curl

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
curl is already the newest version (7.81.0-1ubuntu1.15).
gnupg is already the newest version (2.2.27-3ubuntu2.1).
gnupg set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [4]:
!curl -fsSL https://pgp.mongodb.com/server-7.0.asc | \
   sudo gpg -o /usr/share/keyrings/mongodb-server-7.0.gpg \
   --dearmor

In [5]:
!echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-7.0.list

deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse


In [6]:
!apt-get update

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:4 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:7 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 Release [2,090 B]
Get:8 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 Release.gpg [866 B]
Get:9 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0/multiverse amd64 Packages [22.9 kB]
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,617 kB]
Hit:12 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:13 https://repo.

In [7]:
!apt-get install -y mongodb-org

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  mongodb-database-tools mongodb-mongosh mongodb-org-database mongodb-org-database-tools-extra
  mongodb-org-mongos mongodb-org-server mongodb-org-shell mongodb-org-tools
The following NEW packages will be installed:
  mongodb-database-tools mongodb-mongosh mongodb-org mongodb-org-database
  mongodb-org-database-tools-extra mongodb-org-mongos mongodb-org-server mongodb-org-shell
  mongodb-org-tools
0 upgraded, 9 newly installed, 0 to remove and 24 not upgraded.
Need to get 162 MB of archives.
After this operation, 530 MB of additional disk space will be used.
Get:1 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0/multiverse amd64 mongodb-database-tools amd64 100.9.4 [51.9 MB]
Get:2 https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0/multiverse amd64 mongodb-mongosh amd64 2.1.1 [47.9 MB]
Get:3 https://repo.mongodb.org/apt/ub

In [11]:
!mkdir /data
!mkdir /data/db

In [12]:
import subprocess
subprocess.Popen(["mongod"])

<Popen: returncode: None args: ['mongod']>

In [13]:
from pymongo import MongoClient
client = MongoClient()
client.list_database_names() # ['admin', 'local']

['admin', 'config', 'local']

## create the db

name our database

In [14]:
# Access a specific database (replace 'your_database' with the desired database name)
db = client['cloud_purchase_db']

## Set up Collections

drop the collections in case they already exist so we don't duplicate data

In [15]:
db.drop_collection('customers')
db.drop_collection('products')
db.drop_collection('orders')

{'ok': 1.0}

do we have any collections?

## list collections

In [16]:
db.list_collection_names()

[]

In [17]:
# Create Faker instance
fake = Faker()

# Create the data

## create customer data

In [18]:
# Generate customer orders
def gen_orders():

  orders = []
  for i in range(1,random.randint(1, 5)):

      productid = random.randint(1, 10)
      units = random.randint(10000, 1000000)
      if random.random() < 0.1:
          units *= 1000
      purchase_date = fake.date_between(start_date='-1y', end_date='today')
      purchase_date = datetime.datetime.combine(purchase_date, datetime.datetime.min.time())

      order = {
          'productid': productid,
          'units': units,
          'purchase_date': purchase_date
      }

      orders.append(order)

  return orders

In [19]:
# Generate customers
customers = []
for i in range(100):
    customer = {
        'customerid': i+1,
        'name': fake.name(),
        'email': fake.email(),
        'phone': [fake.phone_number(),fake.phone_number(),fake.phone_number()],
        'orders' : gen_orders()
    }
    customers.append(customer)

In [20]:
customers[0:3]

[{'customerid': 1,
  'name': 'Rhonda Fox',
  'email': 'megangarcia@example.net',
  'phone': ['898-529-5735x4517', '432-289-0539x82060', '225.976.0238x3825'],
  'orders': [{'productid': 4,
    'units': 409452,
    'purchase_date': datetime.datetime(2023, 12, 19, 0, 0)},
   {'productid': 10,
    'units': 11528,
    'purchase_date': datetime.datetime(2023, 10, 22, 0, 0)}]},
 {'customerid': 2,
  'name': 'Jill Moore',
  'email': 'tammy52@example.org',
  'phone': ['309.224.5458', '918-741-9946x793', '4567089928'],
  'orders': [{'productid': 3,
    'units': 521791000,
    'purchase_date': datetime.datetime(2023, 8, 11, 0, 0)}]},
 {'customerid': 3,
  'name': 'Margaret Mccoy',
  'email': 'kathy84@example.net',
  'phone': ['(926)614-6532x7747', '(597)509-0033', '863.615.4098x3859'],
  'orders': [{'productid': 9,
    'units': 377528,
    'purchase_date': datetime.datetime(2023, 11, 30, 0, 0)},
   {'productid': 6,
    'units': 440201,
    'purchase_date': datetime.datetime(2023, 5, 29, 0, 0)}]}]

In [21]:
# Insert customers into MongoDB
db.customers.insert_many(customers)

InsertManyResult([ObjectId('65a1ce664de10d2f8cecb3dc'), ObjectId('65a1ce664de10d2f8cecb3dd'), ObjectId('65a1ce664de10d2f8cecb3de'), ObjectId('65a1ce664de10d2f8cecb3df'), ObjectId('65a1ce664de10d2f8cecb3e0'), ObjectId('65a1ce664de10d2f8cecb3e1'), ObjectId('65a1ce664de10d2f8cecb3e2'), ObjectId('65a1ce664de10d2f8cecb3e3'), ObjectId('65a1ce664de10d2f8cecb3e4'), ObjectId('65a1ce664de10d2f8cecb3e5'), ObjectId('65a1ce664de10d2f8cecb3e6'), ObjectId('65a1ce664de10d2f8cecb3e7'), ObjectId('65a1ce664de10d2f8cecb3e8'), ObjectId('65a1ce664de10d2f8cecb3e9'), ObjectId('65a1ce664de10d2f8cecb3ea'), ObjectId('65a1ce664de10d2f8cecb3eb'), ObjectId('65a1ce664de10d2f8cecb3ec'), ObjectId('65a1ce664de10d2f8cecb3ed'), ObjectId('65a1ce664de10d2f8cecb3ee'), ObjectId('65a1ce664de10d2f8cecb3ef'), ObjectId('65a1ce664de10d2f8cecb3f0'), ObjectId('65a1ce664de10d2f8cecb3f1'), ObjectId('65a1ce664de10d2f8cecb3f2'), ObjectId('65a1ce664de10d2f8cecb3f3'), ObjectId('65a1ce664de10d2f8cecb3f4'), ObjectId('65a1ce664de10d2f8cecb3

## create products data

In [22]:
# Generate products
products = []
for i in range(10):
    product = {
        'productid': i+1,
        'category': random.choice(['Electronics', 'Clothing', 'Books', 'Home']),
        'price': random.randint(1, 100)
    }
    products.append(product)

In [23]:
# Insert products into MongoDB
db.products.insert_many(products)

InsertManyResult([ObjectId('65a1ce6a4de10d2f8cecb440'), ObjectId('65a1ce6a4de10d2f8cecb441'), ObjectId('65a1ce6a4de10d2f8cecb442'), ObjectId('65a1ce6a4de10d2f8cecb443'), ObjectId('65a1ce6a4de10d2f8cecb444'), ObjectId('65a1ce6a4de10d2f8cecb445'), ObjectId('65a1ce6a4de10d2f8cecb446'), ObjectId('65a1ce6a4de10d2f8cecb447'), ObjectId('65a1ce6a4de10d2f8cecb448'), ObjectId('65a1ce6a4de10d2f8cecb449')], acknowledged=True)

In [24]:
db.list_collection_names() # list collections

['products', 'customers']

loop through collections and count the number of documents

In [25]:
for collection_name in db.list_collection_names():
  collection = db.get_collection(collection_name)
  print(f'{collection_name}:{collection.count_documents({})}')

products:10
customers:100


# Querying



## Customers

Find all customers.

In [26]:
for document in db.customers.find({}):
  print(document)

{'_id': ObjectId('65a1ce664de10d2f8cecb3dc'), 'customerid': 1, 'name': 'Rhonda Fox', 'email': 'megangarcia@example.net', 'phone': ['898-529-5735x4517', '432-289-0539x82060', '225.976.0238x3825'], 'orders': [{'productid': 4, 'units': 409452, 'purchase_date': datetime.datetime(2023, 12, 19, 0, 0)}, {'productid': 10, 'units': 11528, 'purchase_date': datetime.datetime(2023, 10, 22, 0, 0)}]}
{'_id': ObjectId('65a1ce664de10d2f8cecb3dd'), 'customerid': 2, 'name': 'Jill Moore', 'email': 'tammy52@example.org', 'phone': ['309.224.5458', '918-741-9946x793', '4567089928'], 'orders': [{'productid': 3, 'units': 521791000, 'purchase_date': datetime.datetime(2023, 8, 11, 0, 0)}]}
{'_id': ObjectId('65a1ce664de10d2f8cecb3de'), 'customerid': 3, 'name': 'Margaret Mccoy', 'email': 'kathy84@example.net', 'phone': ['(926)614-6532x7747', '(597)509-0033', '863.615.4098x3859'], 'orders': [{'productid': 9, 'units': 377528, 'purchase_date': datetime.datetime(2023, 11, 30, 0, 0)}, {'productid': 6, 'units': 440201,

find customerid 76

In [29]:
import pprint

In [30]:
for document in db.customers.find({'customerid':76}):
  print(pprint.pprint(document))

{'_id': ObjectId('65a1ccab1449f39f493c53ae'),
 'customerid': 76,
 'email': 'wayne36@example.net',
 'name': 'Marcus Gomez',
 'orders': [{'productid': 7,
             'purchase_date': datetime.datetime(2023, 12, 27, 0, 0),
             'units': 151589},
            {'productid': 8,
             'purchase_date': datetime.datetime(2023, 4, 20, 0, 0),
             'units': 752215}],
 'phone': ['001-833-547-7166x7413', '(298)987-2856x14677', '734.689.9357']}
None


## Products

find all products

In [31]:
for document in db.products.find({}):
  print(document)

{'_id': ObjectId('65a1ccb11449f39f493c53c7'), 'productid': 1, 'category': 'Electronics', 'price': 69}
{'_id': ObjectId('65a1ccb11449f39f493c53c8'), 'productid': 2, 'category': 'Clothing', 'price': 39}
{'_id': ObjectId('65a1ccb11449f39f493c53c9'), 'productid': 3, 'category': 'Home', 'price': 2}
{'_id': ObjectId('65a1ccb11449f39f493c53ca'), 'productid': 4, 'category': 'Clothing', 'price': 81}
{'_id': ObjectId('65a1ccb11449f39f493c53cb'), 'productid': 5, 'category': 'Clothing', 'price': 58}
{'_id': ObjectId('65a1ccb11449f39f493c53cc'), 'productid': 6, 'category': 'Electronics', 'price': 12}
{'_id': ObjectId('65a1ccb11449f39f493c53cd'), 'productid': 7, 'category': 'Books', 'price': 15}
{'_id': ObjectId('65a1ccb11449f39f493c53ce'), 'productid': 8, 'category': 'Clothing', 'price': 92}
{'_id': ObjectId('65a1ccb11449f39f493c53cf'), 'productid': 9, 'category': 'Books', 'price': 80}
{'_id': ObjectId('65a1ccb11449f39f493c53d0'), 'productid': 10, 'category': 'Clothing', 'price': 15}


find all products with prices less than 40

In [None]:
for document in db.products.find({'price': {'$lt':40}}):
  print(document)

{'_id': ObjectId('646fad92f2e8345bc9a7dea6'), 'productid': 5, 'category': 'Clothing', 'price': 1}


find all products with category clothing

In [None]:
for document in db.products.find({'category': 'Clothing'}):
  print(document)

{'_id': ObjectId('646fad92f2e8345bc9a7dea2'), 'productid': 1, 'category': 'Clothing', 'price': 51}
{'_id': ObjectId('646fad92f2e8345bc9a7dea3'), 'productid': 2, 'category': 'Clothing', 'price': 90}
{'_id': ObjectId('646fad92f2e8345bc9a7dea6'), 'productid': 5, 'category': 'Clothing', 'price': 1}
{'_id': ObjectId('646fad92f2e8345bc9a7dea8'), 'productid': 7, 'category': 'Clothing', 'price': 65}
{'_id': ObjectId('646fad92f2e8345bc9a7deab'), 'productid': 10, 'category': 'Clothing', 'price': 57}


find all products with price less than 40 AND category Clothing

In [34]:
for document in db.products.find({'price': {'$lt':40},'category': 'Clothing'}):
  print(document)

{'_id': ObjectId('65a1ccb11449f39f493c53c8'), 'productid': 2, 'category': 'Clothing', 'price': 39}
{'_id': ObjectId('65a1ccb11449f39f493c53d0'), 'productid': 10, 'category': 'Clothing', 'price': 15}


## Orders

In [None]:
# Calculate the total number of orders
pipeline = [
    {
        '$unwind': '$orders'
    },
    {
        '$group': {
            '_id': None,
            'total_orders': {'$sum': 1}
        }
    },
    {
        '$project': {
            '_id': 0,
            'total_orders': 1
        }
    }
]

result = db.customers.aggregate(pipeline)

# Extract the total number of orders
total_orders = next(result)['total_orders']

# Print the total number of orders
print(f"Total Orders: {total_orders}")

Total Orders: 182


In [None]:
filter_criteria = {
    'orders.units': {'$lt': 45000}
}

# Calculate the total number of orders with units less than 12000
pipeline = [
    {
      '$unwind': '$orders'
    },
    {
      '$match': filter_criteria
    },
    {
        '$group': {
            '_id': None,
            'total_orders': {'$sum': 1}
        }
    },
    {
        '$project': {
            '_id': 0,
            'total_orders': 1
        }
    }
]

result = db.customers.aggregate(pipeline)

# Extract the total number of orders
total_orders = next(result)['total_orders']

# Print the total number of orders
print(f"Total Orders: {total_orders}")

Total Orders: 7


# Extraction


## Dump mongodb data to json file.

create a JSON file with the filtered data.

In [None]:
from bson.json_util import dumps
import json

open a file. create a cursor that gets passed to dumps which takes binary json data and converts it serialized json, deseralizes it to python object and then serializes it back to a file.  

## dump orders

## dump customers

(and orders as they are nested)

In [None]:
with open('customers.json', 'w') as file:
  cursor = db.customers.find({})
  file.write(dumps(cursor))

## dump products

In [None]:
with open('products.json', 'w') as file:
  cursor = db.products.find({})
  file.write(dumps(cursor))

In [None]:
!ls -lh *.json

-rw-r--r-- 1 root root  36K May 25 18:48 customers.json
-rw-r--r-- 1 root root 1002 May 25 18:48 products.json


In [None]:
!cat orders_gt_2023_5_15.json

cat: orders_gt_2023_5_15.json: No such file or directory


# Reading JSON file data into Python

## JSON into Pandas DataFrame

In [None]:
import pandas as pd

In [None]:
customers_df = pd.read_json('customers.json')
customers_df.head(2)

Unnamed: 0,_id,customerid,name,email,phone,orders
0,{'$oid': '646fad92f2e8345bc9a7de3e'},1,Samantha Morris,bradleythomas@example.org,"[+1-069-970-8404x5473, 001-151-571-8094, 001-9...",[]
1,{'$oid': '646fad92f2e8345bc9a7de3f'},2,Steven Boyle,bowersjeffrey@example.org,"[484-855-8172, (993)169-9728x2920, 001-928-880...","[{'productid': 2, 'units': 481319, 'purchase_d..."


In [None]:
products_df = pd.read_json('products.json')
products_df.head(2)

Unnamed: 0,_id,productid,category,price
0,{'$oid': '646fad92f2e8345bc9a7dea2'},1,Clothing,51
1,{'$oid': '646fad92f2e8345bc9a7dea3'},2,Clothing,90


### Merge DataFrames

In [None]:
customers_df.head(4)

Unnamed: 0,_id,customerid,name,email,phone,orders
0,{'$oid': '646fad92f2e8345bc9a7de3e'},1,Samantha Morris,bradleythomas@example.org,"[+1-069-970-8404x5473, 001-151-571-8094, 001-9...",[]
1,{'$oid': '646fad92f2e8345bc9a7de3f'},2,Steven Boyle,bowersjeffrey@example.org,"[484-855-8172, (993)169-9728x2920, 001-928-880...","[{'productid': 2, 'units': 481319, 'purchase_d..."
2,{'$oid': '646fad92f2e8345bc9a7de40'},3,Francisco Davis,marcussmith@example.org,"[533-742-3054, 707.447.4667, +1-520-673-8349]","[{'productid': 9, 'units': 988209, 'purchase_d..."
3,{'$oid': '646fad92f2e8345bc9a7de41'},4,Mrs. April Maynard,dianagonzalez@example.net,"[(314)763-7166x9472, 001-861-083-9634x0712, 40...","[{'productid': 3, 'units': 304654, 'purchase_d..."


In [None]:
customers_df.explode('orders').head(4)

Unnamed: 0,_id,customerid,name,email,phone,orders
0,{'$oid': '646fad92f2e8345bc9a7de3e'},1,Samantha Morris,bradleythomas@example.org,"[+1-069-970-8404x5473, 001-151-571-8094, 001-9...",
1,{'$oid': '646fad92f2e8345bc9a7de3f'},2,Steven Boyle,bowersjeffrey@example.org,"[484-855-8172, (993)169-9728x2920, 001-928-880...","{'productid': 2, 'units': 481319, 'purchase_da..."
1,{'$oid': '646fad92f2e8345bc9a7de3f'},2,Steven Boyle,bowersjeffrey@example.org,"[484-855-8172, (993)169-9728x2920, 001-928-880...","{'productid': 5, 'units': 302015, 'purchase_da..."
2,{'$oid': '646fad92f2e8345bc9a7de40'},3,Francisco Davis,marcussmith@example.org,"[533-742-3054, 707.447.4667, +1-520-673-8349]","{'productid': 9, 'units': 988209, 'purchase_da..."


In [None]:
pd.json_normalize(customers_df.explode('orders')['orders'])

Unnamed: 0,productid,units,purchase_date.$date
0,,,
1,2.0,481319.0,2023-01-26T00:00:00Z
2,5.0,302015.0,2023-01-27T00:00:00Z
3,9.0,988209.0,2022-06-13T00:00:00Z
4,8.0,863970.0,2023-03-15T00:00:00Z
...,...,...,...
202,7.0,986202.0,2022-10-01T00:00:00Z
203,9.0,716146000.0,2023-01-29T00:00:00Z
204,3.0,642181.0,2023-05-16T00:00:00Z
205,8.0,167524000.0,2022-06-17T00:00:00Z


In [None]:
cust_ord_prods_df = customers_df.merge(products_df, left_on='productid', right_on='productid')
cust_ord_prods_df.head(3)

KeyError: ignored

### Drop columns

In [None]:
cust_ord_prods_df.drop('_id_x',axis=1,inplace=True)
cust_ord_prods_df.drop('_id_y',axis=1,inplace=True)
cust_ord_prods_df.drop('_id',axis=1,inplace=True)

In [None]:
cust_ord_prods_df.head(3)

In [None]:
#df['date_field'] =
#pd.to_datetime(customers_orders_df['purchase_date']).dt.date

In [None]:
#customers_orders_df['purchase_date'].apply(pd.Series).merge(customers_orders_df, left_index=True, right_index=True).drop('purchase_date', axis=1)

## JSON into list of python dictionary

### read customers JSON file

In [None]:
with open('customers.json') as json_file:
    customers = json.load(json_file)

customers[0:2] # show the first 2 records (like head with n=2)

#### explore list of dictionaries

In [None]:
type(customers)

In [None]:
len(customers)

In [None]:
type(customers[1])

In [None]:
customers[1]

In [None]:
customers[1]['name']

In [None]:
customers[1].get('name')

### read orders JSON file

In [None]:
with open('orders_gt_2023_5_15.json') as json_file:
    orders = json.load(json_file)

orders[0:2] # show the first 2 records (like head with n=2)

### read products JSON file

In [None]:
with open('products.json') as json_file:
    products = json.load(json_file)

products[0:2] # show the first 2 records (like head with n=2)