<a href="https://colab.research.google.com/github/matthewpecsok/data_engineering/blob/main/tutorials/de_nosql_databases_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial will introduce you to  MongoDB a document database. MongoDB deals in data that is effictively a dictionary in Python or JSON data if it's a file. This document-oriented database is very different from the relational database designs we've seen thus far. We'll learn how to populate the database with data, how to query the database for documents we'd like to find, how to extract data for data engineering purposes. 

In [1]:
!pip install faker
!pip install pymongo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faker
  Downloading Faker-18.9.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-18.9.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymongo
  Downloading pymongo-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully i

In [2]:
from faker import Faker
from pymongo import MongoClient
import random
import datetime

# install MongoDB

In [3]:
!apt install mongodb >log
!service mongodb start

from pymongo import MongoClient
client = MongoClient()
client.list_database_names() # ['admin', 'local']



 * Starting database mongodb
   ...done.


['admin', 'config', 'local']

## create the db

name our database

In [4]:
# Access a specific database (replace 'your_database' with the desired database name)
db = client['cloud_purchase_db']

## Set up Collections

drop the collections in case they already exist so we don't duplicate data

In [5]:
db.drop_collection('customers')
db.drop_collection('products')
db.drop_collection('orders')

{'ok': 0.0,
 'errmsg': 'ns not found',
 'code': 26,
 'codeName': 'NamespaceNotFound'}

do we have any collections?

## list collections

In [6]:
db.list_collection_names()

[]

In [7]:
# Create Faker instance
fake = Faker()

# Create the data

## create customer data

In [8]:
# Generate customers
customers = []
for i in range(100):
    customer = {
        'customerid': i+1,
        'name': fake.name(),
        'email': fake.email(),
        'phone': [fake.phone_number(),fake.phone_number(),fake.phone_number()]
    }
    customers.append(customer)

In [9]:
# Insert customers into MongoDB
db.customers.insert_many(customers)

<pymongo.results.InsertManyResult at 0x7f678d157610>

## create products data

In [10]:
# Generate products
products = []
for i in range(10):
    product = {
        'productid': i+1,
        'category': random.choice(['Electronics', 'Clothing', 'Books', 'Home']),
        'price': random.uniform(1, 100)
    }
    products.append(product)

In [11]:
# Insert products into MongoDB
db.products.insert_many(products)

<pymongo.results.InsertManyResult at 0x7f678d156830>

## create orders data

In [12]:
datetime.datetime.now()

datetime.datetime(2023, 5, 25, 13, 39, 50, 597345)

In [13]:
purchase_date = fake.date_between(start_date='-1y', end_date='today')
purchase_date = datetime.datetime.combine(purchase_date, datetime.datetime.min.time())

purchase_date

datetime.datetime(2022, 6, 18, 0, 0)

In [14]:
# Generate customer orders
orders = []
for i in range(1000):
    customerid = random.randint(1, 100)
    productid = random.randint(1, 10)
    units = random.randint(10000, 1000000)
    if random.random() < 0.1:
        units *= 1000
    purchase_date = fake.date_between(start_date='-1y', end_date='today')
    purchase_date = datetime.datetime.combine(purchase_date, datetime.datetime.min.time())

    order = {
        'customerid': customerid,
        'productid': productid,
        'units': units,
        'purchase_date': purchase_date
    }
    orders.append(order)

In [15]:
# Insert orders into MongoDB
db.orders.insert_many(orders)

<pymongo.results.InsertManyResult at 0x7f676300faf0>

In [16]:
db.list_collection_names() # list collections

['products', 'customers', 'orders']

loop through collections and count the number of documents

In [17]:
for collection_name in db.list_collection_names():
  collection = db.get_collection(collection_name)
  print(f'{collection_name}:{collection.count_documents({})}')

products:10
customers:100
orders:1000


# Querying



## Customers

Find all customers.

In [18]:
for document in db.customers.find({}):
  print(document)

{'_id': ObjectId('646f6526089726df60df1403'), 'customerid': 1, 'name': 'Natalie Beasley', 'email': 'omartin@example.com', 'phone': ['783.279.1929x07149', '(340)372-9532x11823', '537-863-1571']}
{'_id': ObjectId('646f6526089726df60df1404'), 'customerid': 2, 'name': 'Cindy Patrick', 'email': 'ozamora@example.com', 'phone': ['007.249.9850', '(194)710-1490x36575', '599.989.5554x2489']}
{'_id': ObjectId('646f6526089726df60df1405'), 'customerid': 3, 'name': 'Brandon Curtis', 'email': 'leenicole@example.com', 'phone': ['(842)768-7943', '942.711.7732', '(301)239-3598']}
{'_id': ObjectId('646f6526089726df60df1406'), 'customerid': 4, 'name': 'Johnny Montgomery', 'email': 'stephenfritz@example.net', 'phone': ['(103)879-1555x811', '(966)811-2675', '001-499-721-5731']}
{'_id': ObjectId('646f6526089726df60df1407'), 'customerid': 5, 'name': 'Daniel Matthews', 'email': 'abigailrodriguez@example.com', 'phone': ['+1-099-091-6723x519', '(994)313-3095', '036-763-1777']}
{'_id': ObjectId('646f6526089726df6

find customerid 76

In [19]:
for document in db.customers.find({'customerid':76}):
  print(document)

{'_id': ObjectId('646f6526089726df60df144e'), 'customerid': 76, 'name': 'Andrew Fisher', 'email': 'arodriguez@example.com', 'phone': ['672-680-9945x7226', '862.917.2635x039', '122.596.1367x7799']}


## Products

find all products

In [20]:
for document in db.products.find({}):
  print(document)

{'_id': ObjectId('646f6526089726df60df1467'), 'productid': 1, 'category': 'Electronics', 'price': 37.80238180289961}
{'_id': ObjectId('646f6526089726df60df1468'), 'productid': 2, 'category': 'Home', 'price': 96.90152545965559}
{'_id': ObjectId('646f6526089726df60df1469'), 'productid': 3, 'category': 'Books', 'price': 72.06382176676661}
{'_id': ObjectId('646f6526089726df60df146a'), 'productid': 4, 'category': 'Electronics', 'price': 3.2756652489953764}
{'_id': ObjectId('646f6526089726df60df146b'), 'productid': 5, 'category': 'Home', 'price': 71.5924500629026}
{'_id': ObjectId('646f6526089726df60df146c'), 'productid': 6, 'category': 'Electronics', 'price': 12.403988892084563}
{'_id': ObjectId('646f6526089726df60df146d'), 'productid': 7, 'category': 'Clothing', 'price': 84.57855710974763}
{'_id': ObjectId('646f6526089726df60df146e'), 'productid': 8, 'category': 'Home', 'price': 67.66780391614036}
{'_id': ObjectId('646f6526089726df60df146f'), 'productid': 9, 'category': 'Electronics', 'pri

find all products with prices less than 40

In [21]:
for document in db.products.find({'price': {'$lt':40}}):
  print(document)

{'_id': ObjectId('646f6526089726df60df1467'), 'productid': 1, 'category': 'Electronics', 'price': 37.80238180289961}
{'_id': ObjectId('646f6526089726df60df146a'), 'productid': 4, 'category': 'Electronics', 'price': 3.2756652489953764}
{'_id': ObjectId('646f6526089726df60df146c'), 'productid': 6, 'category': 'Electronics', 'price': 12.403988892084563}


find all products with category clothing

In [22]:
for document in db.products.find({'category': 'Clothing'}):
  print(document)

{'_id': ObjectId('646f6526089726df60df146d'), 'productid': 7, 'category': 'Clothing', 'price': 84.57855710974763}


find all products with price less than 40 AND category Clothing

In [23]:
for document in db.products.find({'price': {'$lt':40},'category': 'Clothing'}):
  print(document)

## Orders

In [24]:
for document in db.orders.find({'units': {'$lt': 12000}}):
  print(document)

query orders with dates greater than 5-15-2023

In [25]:
for document in db.orders.find({'purchase_date': '2022-07-04T00:00:00.000000Z'}):
  print(document)

In [26]:
for document in db.orders.find({'purchase_date': {'$gt':datetime.datetime(2021, 5, 15)}}):
  print(document)

{'_id': ObjectId('646f6526089726df60df1471'), 'customerid': 51, 'productid': 4, 'units': 815718, 'purchase_date': datetime.datetime(2022, 10, 13, 0, 0)}
{'_id': ObjectId('646f6526089726df60df1472'), 'customerid': 73, 'productid': 3, 'units': 98529, 'purchase_date': datetime.datetime(2023, 4, 24, 0, 0)}
{'_id': ObjectId('646f6526089726df60df1473'), 'customerid': 27, 'productid': 9, 'units': 956629, 'purchase_date': datetime.datetime(2023, 1, 15, 0, 0)}
{'_id': ObjectId('646f6526089726df60df1474'), 'customerid': 6, 'productid': 3, 'units': 684770, 'purchase_date': datetime.datetime(2023, 3, 4, 0, 0)}
{'_id': ObjectId('646f6526089726df60df1475'), 'customerid': 92, 'productid': 3, 'units': 962627, 'purchase_date': datetime.datetime(2023, 1, 19, 0, 0)}
{'_id': ObjectId('646f6526089726df60df1476'), 'customerid': 19, 'productid': 3, 'units': 822063, 'purchase_date': datetime.datetime(2022, 11, 14, 0, 0)}
{'_id': ObjectId('646f6526089726df60df1477'), 'customerid': 40, 'productid': 2, 'units': 

# Extraction 


## Dump mongodb data to json file. 

create a JSON file with the filtered data. 

In [27]:
from bson.json_util import dumps
import json

open a file. create a cursor that gets passed to dumps which takes binary json data and converts it serialized json, deseralizes it to python object and then serializes it back to a file.  

## dump orders

In [28]:
cursor_matt = db.orders.find({'purchase_date': {'$gt':datetime.datetime(2023, 5, 15, 0, 0, 0)}})
#for doc in cursor_matt:
#  doc['purchase_date'] = doc['purchase_date'].isoformat()
#  print(doc)

#cursor_matt

In [29]:
#from bson import json_util

# Convert dates to JSON-compatible format
#for document in cursor_matt:
#    document['purchase_date'] = document['purchase_date'].isoformat()

# Export to JSON
#json_data = json.dumps(cursor_matt, default=json_util.default)

# Print the JSON data
#print(json_data)

In [30]:
cursor_matt

<pymongo.cursor.Cursor at 0x7f6762d991e0>

In [324]:
with open('orders_gt_2023_5_15.json', 'w') as file:
  #cursor = db.orders.find({'purchase_date': {'$gt':datetime.datetime(2023, 5, 15, 0, 0, 0)}})
  json.dump(json.loads(dumps(cursor_matt)), file)

In [31]:
with open('orders_gt_2023_5_15.json', 'w') as file:
  cursor = db.orders.find({'purchase_date': {'$gt':datetime.datetime(2023, 5, 15, 0, 0, 0)}})
  file.write(dumps(cursor))

## dump customers

In [32]:
with open('customers.json', 'w') as file:
  cursor = db.customers.find({})
  file.write(dumps(cursor))

## dump products

In [33]:
with open('products.json', 'w') as file:
  cursor = db.products.find({})
  file.write(dumps(cursor))

In [34]:
!ls -lh *.json

-rw-r--r-- 1 root root  20K May 25 13:40 customers.json
-rw-r--r-- 1 root root 2.3K May 25 13:40 orders_gt_2023_5_15.json
-rw-r--r-- 1 root root 1.2K May 25 13:40 products.json


In [35]:
!cat orders_gt_2023_5_15.json

[{"_id": {"$oid": "646f6526089726df60df14f0"}, "customerid": 2, "productid": 2, "units": 517079, "purchase_date": {"$date": "2023-05-21T00:00:00Z"}}, {"_id": {"$oid": "646f6526089726df60df1528"}, "customerid": 93, "productid": 10, "units": 413320000, "purchase_date": {"$date": "2023-05-20T00:00:00Z"}}, {"_id": {"$oid": "646f6526089726df60df1555"}, "customerid": 48, "productid": 1, "units": 797846000, "purchase_date": {"$date": "2023-05-22T00:00:00Z"}}, {"_id": {"$oid": "646f6526089726df60df15dd"}, "customerid": 31, "productid": 4, "units": 309532, "purchase_date": {"$date": "2023-05-16T00:00:00Z"}}, {"_id": {"$oid": "646f6526089726df60df1612"}, "customerid": 11, "productid": 9, "units": 952671, "purchase_date": {"$date": "2023-05-17T00:00:00Z"}}, {"_id": {"$oid": "646f6526089726df60df16c4"}, "customerid": 71, "productid": 6, "units": 914078, "purchase_date": {"$date": "2023-05-24T00:00:00Z"}}, {"_id": {"$oid": "646f6526089726df60df174d"}, "customerid": 25, "productid": 10, "units": 505

# Reading JSON file data into Python

## JSON into Pandas DataFrame

In [36]:
import pandas as pd

In [37]:
customers_df = pd.read_json('customers.json')
customers_df.head(2)

Unnamed: 0,_id,customerid,name,email,phone
0,{'$oid': '646f6526089726df60df1403'},1,Natalie Beasley,omartin@example.com,"[783.279.1929x07149, (340)372-9532x11823, 537-..."
1,{'$oid': '646f6526089726df60df1404'},2,Cindy Patrick,ozamora@example.com,"[007.249.9850, (194)710-1490x36575, 599.989.55..."


In [38]:
orders_df = pd.read_json('orders_gt_2023_5_15.json')
orders_df.head(2)

Unnamed: 0,_id,customerid,productid,units,purchase_date
0,{'$oid': '646f6526089726df60df14f0'},2,2,517079,{'$date': '2023-05-21T00:00:00Z'}
1,{'$oid': '646f6526089726df60df1528'},93,10,413320000,{'$date': '2023-05-20T00:00:00Z'}


In [39]:
products_df = pd.read_json('products.json')
products_df.head(2)

Unnamed: 0,_id,productid,category,price
0,{'$oid': '646f6526089726df60df1467'},1,Electronics,37.802382
1,{'$oid': '646f6526089726df60df1468'},2,Home,96.901525


### Merge DataFrames

In [40]:
customers_orders_df = customers_df.merge(orders_df, left_on='customerid', right_on='customerid')
customers_orders_df.head(3)

Unnamed: 0,_id_x,customerid,name,email,phone,_id_y,productid,units,purchase_date
0,{'$oid': '646f6526089726df60df1404'},2,Cindy Patrick,ozamora@example.com,"[007.249.9850, (194)710-1490x36575, 599.989.55...",{'$oid': '646f6526089726df60df14f0'},2,517079,{'$date': '2023-05-21T00:00:00Z'}
1,{'$oid': '646f6526089726df60df140d'},11,Tony Lee,esmith@example.com,"[307-592-1744x4537, +1-517-528-3479x61188, 001...",{'$oid': '646f6526089726df60df1612'},9,952671,{'$date': '2023-05-17T00:00:00Z'}
2,{'$oid': '646f6526089726df60df1413'},17,Kirk Gonzalez,scottnorton@example.org,"[001-068-310-5238x6924, 001-020-922-1919x442, ...",{'$oid': '646f6526089726df60df17d3'},5,613510,{'$date': '2023-05-16T00:00:00Z'}


In [42]:
cust_ord_prods_df = customers_orders_df.merge(products_df, left_on='productid', right_on='productid')
cust_ord_prods_df.head(3)

Unnamed: 0,_id_x,customerid,name,email,phone,_id_y,productid,units,purchase_date,_id,category,price
0,{'$oid': '646f6526089726df60df1404'},2,Cindy Patrick,ozamora@example.com,"[007.249.9850, (194)710-1490x36575, 599.989.55...",{'$oid': '646f6526089726df60df14f0'},2,517079,{'$date': '2023-05-21T00:00:00Z'},{'$oid': '646f6526089726df60df1468'},Home,96.901525
1,{'$oid': '646f6526089726df60df1430'},46,Jordan Harris,cnelson@example.net,"[(412)509-5835, 0903143657, 859-600-5484]",{'$oid': '646f6526089726df60df17ba'},2,137831,{'$date': '2023-05-19T00:00:00Z'},{'$oid': '646f6526089726df60df1468'},Home,96.901525
2,{'$oid': '646f6526089726df60df140d'},11,Tony Lee,esmith@example.com,"[307-592-1744x4537, +1-517-528-3479x61188, 001...",{'$oid': '646f6526089726df60df1612'},9,952671,{'$date': '2023-05-17T00:00:00Z'},{'$oid': '646f6526089726df60df146f'},Electronics,85.798046


### Drop columns

In [44]:
cust_ord_prods_df.drop('_id_x',axis=1,inplace=True)
cust_ord_prods_df.drop('_id_y',axis=1,inplace=True)
cust_ord_prods_df.drop('_id',axis=1,inplace=True)

In [46]:
cust_ord_prods_df.head(3)

Unnamed: 0,customerid,name,email,phone,productid,units,purchase_date,category,price
0,2,Cindy Patrick,ozamora@example.com,"[007.249.9850, (194)710-1490x36575, 599.989.55...",2,517079,{'$date': '2023-05-21T00:00:00Z'},Home,96.901525
1,46,Jordan Harris,cnelson@example.net,"[(412)509-5835, 0903143657, 859-600-5484]",2,137831,{'$date': '2023-05-19T00:00:00Z'},Home,96.901525
2,11,Tony Lee,esmith@example.com,"[307-592-1744x4537, +1-517-528-3479x61188, 001...",9,952671,{'$date': '2023-05-17T00:00:00Z'},Electronics,85.798046


In [47]:
#df['date_field'] = 
#pd.to_datetime(customers_orders_df['purchase_date']).dt.date

In [262]:
#customers_orders_df['purchase_date'].apply(pd.Series).merge(customers_orders_df, left_index=True, right_index=True).drop('purchase_date', axis=1)

Unnamed: 0,$date,customerid,name,email,phone,productid,units
0,2023-05-19T00:00:00Z,7,David Fleming,rmcdonald@example.com,"[591.044.1754x11239, 317-737-1716x8760, +1-660...",10,710713
1,2023-05-23T00:00:00Z,11,Christina Gaines,briansmith@example.net,"[587.067.5507x07839, 001-213-749-3632, 643-642...",5,160962
2,2023-05-17T00:00:00Z,15,Sean Mitchell,wcontreras@example.com,"[1594613264, 538.585.2750, +1-476-708-9676]",8,27876
3,2023-05-23T00:00:00Z,19,Anthony Weaver,kimstewart@example.com,"[+1-132-693-8718x344, 382-449-9696x699, 618-60...",7,500620
4,2023-05-23T00:00:00Z,30,Jasmine Greene,hking@example.org,"[3511505656, 849.442.6133x0470, 885-547-3838x2...",4,323709
5,2023-05-16T00:00:00Z,31,Claudia Williamson,katherinebradley@example.com,"[(883)567-8493, 001-055-354-0369, +1-548-402-6...",8,228962
6,2023-05-24T00:00:00Z,43,Sheila Lang,markskirk@example.org,"[001-855-781-5048x763, 974-096-3408x29823, +1-...",4,635703
7,2023-05-20T00:00:00Z,44,Jordan Crane,adennis@example.org,"[724-890-2841, (242)289-4738, 866-474-4911]",5,850070
8,2023-05-20T00:00:00Z,53,John Mcclain,dawn16@example.net,"[+1-655-195-9973x65639, 959.802.0940, +1-213-1...",5,189310
9,2023-05-21T00:00:00Z,54,Lydia Daniels,mooreelizabeth@example.org,"[+1-234-630-8007x8015, +1-495-297-8626x89201, ...",6,577200


## JSON into list of python dictionary

### read customers JSON file

In [51]:
with open('customers.json') as json_file:
    customers = json.load(json_file)

customers[0:2] # show the first 2 records (like head with n=2)

[{'_id': {'$oid': '646f6526089726df60df1403'},
  'customerid': 1,
  'name': 'Natalie Beasley',
  'email': 'omartin@example.com',
  'phone': ['783.279.1929x07149', '(340)372-9532x11823', '537-863-1571']},
 {'_id': {'$oid': '646f6526089726df60df1404'},
  'customerid': 2,
  'name': 'Cindy Patrick',
  'email': 'ozamora@example.com',
  'phone': ['007.249.9850', '(194)710-1490x36575', '599.989.5554x2489']}]

#### explore list of dictionaries

In [68]:
type(customers)

list

In [71]:
len(customers)

100

In [70]:
type(customers[1])

dict

In [62]:
customers[1]

{'_id': {'$oid': '646f6526089726df60df1404'},
 'customerid': 2,
 'name': 'Cindy Patrick',
 'email': 'ozamora@example.com',
 'phone': ['007.249.9850', '(194)710-1490x36575', '599.989.5554x2489']}

In [63]:
customers[1]['name']

'Cindy Patrick'

In [65]:
customers[1].get('name')

'Cindy Patrick'

### read orders JSON file

In [52]:
with open('orders_gt_2023_5_15.json') as json_file:
    orders = json.load(json_file)

orders[0:2] # show the first 2 records (like head with n=2)

[{'_id': {'$oid': '646f6526089726df60df14f0'},
  'customerid': 2,
  'productid': 2,
  'units': 517079,
  'purchase_date': {'$date': '2023-05-21T00:00:00Z'}},
 {'_id': {'$oid': '646f6526089726df60df1528'},
  'customerid': 93,
  'productid': 10,
  'units': 413320000,
  'purchase_date': {'$date': '2023-05-20T00:00:00Z'}}]

### read products JSON file

In [53]:
with open('products.json') as json_file:
    products = json.load(json_file)

products[0:2] # show the first 2 records (like head with n=2)

[{'_id': {'$oid': '646f6526089726df60df1467'},
  'productid': 1,
  'category': 'Electronics',
  'price': 37.80238180289961},
 {'_id': {'$oid': '646f6526089726df60df1468'},
  'productid': 2,
  'category': 'Home',
  'price': 96.90152545965559}]