<a href="https://colab.research.google.com/github/matthewpecsok/data_engineering/blob/main/tutorials/de_nosql_databases_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial introduces MongoDB, a document database. MongoDB stores data as documents that are effectively Python dictionaries (or JSON objects, when the data lives in a file). This document-oriented design is very different from the relational database designs we've seen thus far. We'll learn how to populate the database with data, how to query it for the documents we'd like to find, and how to extract data for data engineering purposes.

In [None]:
# Install the Python dependencies for this notebook.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q faker pymongo

In [None]:
from faker import Faker
from pymongo import MongoClient
import random
import datetime
import re

# install MongoDB

In [None]:
# gnupg and curl are needed to download and register MongoDB's signing key.
# -y answers the confirmation prompt so the cell can run unattended.
!apt-get install -y gnupg curl

In [None]:
# Download the MongoDB 7.0 server signing key and store it, de-armored,
# in the keyring file that the apt source entry refers to via signed-by.
!curl -fsSL https://pgp.mongodb.com/server-7.0.asc | \
   sudo gpg -o /usr/share/keyrings/mongodb-server-7.0.gpg \
   --dearmor

In [None]:
# Register the MongoDB 7.0 apt repository (Ubuntu "jammy"), signed by the key stored above.
!echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-7.0.list

In [None]:
# Refresh the package index so apt sees the newly added MongoDB repository.
!apt-get update

In [None]:
# Install the MongoDB packages (-y: do not wait for interactive confirmation).
!apt-get install -y mongodb-org

In [None]:
# Create the data directory for mongod.
# -p creates parent directories as needed and does not fail if they already exist,
# so the cell can be re-run safely.
!mkdir -p /data/db

In [None]:
import subprocess
# Launch the mongod server as a child process; Popen returns immediately,
# so the server keeps running in the background while the notebook continues.
subprocess.Popen(["mongod"])

In [None]:
from pymongo import MongoClient
# No arguments: use pymongo's default connection settings
# (presumably the local mongod started earlier in this notebook — confirm if the host/port differ).
client = MongoClient()
# Listing databases is a quick connectivity check.
client.list_database_names() # e.g. ['admin', 'local']

## create the db

name our database

In [None]:
# Handle to the database used throughout the rest of the notebook.
db = client['cloud_purchase_db']

## Set up Collections

drop the collections in case they already exist so we don't duplicate data

In [None]:
# Start from a clean slate so re-running the notebook does not duplicate documents.
db.drop_collection('customers')
db.drop_collection('products')
# There is no separate 'orders' collection to drop: in this notebook orders are
# embedded inside each customer document.

do we have any collections?

## list collections

In [None]:
# Show the names of the collections currently in the database.
db.list_collection_names()

In [None]:
# Seed both random sources so the generated customers, orders and products
# are identical on every Restart & Run All.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Create Faker instance
fake = Faker()

# Create the data

## create customer data

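Each customer gets a randomly generated list of orders:

- create from 1 to 5 orders (randomly)
- pick a random product id from 1 to 10
- units between 10 and 100 (about 10% of orders are multiplied by 10 to simulate a bulk purchase)
- purchase date from 1 year ago to today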

In [None]:
# Generate customer orders
def gen_orders():
    """Generate a random list of 1 to 5 orders for a single customer.

    Each order is a dict with:
      * 'productid'     - random int from 1 to 10 (inclusive)
      * 'units'         - random int from 10 to 100 (inclusive); about 10% of
                          orders are multiplied by 10 to simulate a bulk purchase
      * 'purchase_date' - datetime at midnight on a random date between one
                          year ago and today

    Relies on the module-level Faker instance `fake`.

    Returns:
        list[dict]: the generated orders (never empty).
    """
    orders = []

    # randint() is inclusive on both ends, so this loop runs 1-5 times.
    # (range(1, n) would run n - 1 times, i.e. 0-4 orders.)
    for _ in range(random.randint(1, 5)):
        productid = random.randint(1, 10)

        units = random.randint(10, 100)
        if random.random() < 0.1:
            # occasional bulk order
            units *= 10

        # Faker returns a date; widen it to a datetime at 00:00
        # (presumably because the documents are stored in MongoDB, which
        # works with datetimes rather than bare dates — confirm).
        purchase_date = fake.date_between(start_date='-1y', end_date='today')
        purchase_date = datetime.datetime.combine(purchase_date, datetime.datetime.min.time())

        orders.append({
            'productid': productid,
            'units': units,
            'purchase_date': purchase_date
        })

    return orders

In [None]:
# Sample call: show what one customer's generated order list looks like.
gen_orders()

generate 100 customers

In [None]:
# Build 100 customer documents; each one carries its own nested list of orders.
customers = [
    {
        'customerid': customer_number,
        'name': fake.name(),
        'email': fake.email(),
        'phone': [fake.phone_number() for _ in range(3)],  # three numbers per customer
        'orders': gen_orders(),  # embedded order documents
    }
    for customer_number in range(1, 101)
]

In [None]:
# Sanity check: number of customer documents generated.
len(customers)

In [None]:
# Peek at the first three customer documents.
customers[0:3]

In [None]:
# Insert customers into MongoDB
# (one bulk call for the whole list rather than one insert per customer)
db.customers.insert_many(customers)

## create products data

In [None]:
# Build the 10-item product catalogue with a random category and price each.
product_categories = ['Electronics', 'Clothing', 'Books', 'Home']
products = [
    {
        'productid': product_number,
        'category': random.choice(product_categories),
        'price': random.randint(1, 100),  # inclusive on both ends
    }
    for product_number in range(1, 11)
]

In [None]:
# Peek at the first three product documents.
products[0:3]

put the 10 products into the database

In [None]:
# Insert products into MongoDB
# (one bulk call for the whole list rather than one insert per product)
db.products.insert_many(products)

In [None]:
# Both collections were inserted into above, so they should now be listed.
db.list_collection_names() # list collections

loop through collections and count the number of documents

In [None]:
# Report how many documents each collection holds (empty filter = count everything).
for name in db.list_collection_names():
    document_count = db[name].count_documents({})
    print(f'{name}:{document_count}')

# Querying



## Customers

Find all customers.

In [None]:
# An empty filter matches every document in the collection.
all_customers = db.customers.find({})
for customer_doc in all_customers:
    print(customer_doc)

find customerid 76

In [None]:
import pprint

In [None]:
# pprint() prints the document itself and returns None, so it must not be
# wrapped in print() — that would emit a stray "None" after every document.
for document in db.customers.find({'customerid':76}):
  pprint.pprint(document)

## Products

find all products

In [None]:
# An empty filter matches every document in the collection.
all_products = db.products.find({})
for product_doc in all_products:
    print(product_doc)

find all products with prices less than 40

In [None]:
# $lt is the "less than" comparison operator.
price_under_40 = {'price': {'$lt': 40}}
for product_doc in db.products.find(price_under_40):
    print(product_doc)

find all products with category clothing

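The `find` method returns a cursor that lazily loads the result set in batches as we iterate over it. Displaying the cursor on its own (next cell) therefore shows only the cursor object, not the documents; we have to loop over it to see them.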

In [None]:
# Not iterated here: the cell output is the object returned by find(), not the matching documents.
db.products.find({'category': 'Clothing'})

In [None]:
# A plain value in the filter is an exact-match condition on that field.
clothing_only = {'category': 'Clothing'}
for product_doc in db.products.find(clothing_only):
    print(product_doc)

not equal query

In [None]:
# $ne is the "not equal" operator: every category except Clothing.
not_clothing = {"category": {"$ne": "Clothing"}}
for product_doc in db.products.find(not_clothing):
    print(product_doc)

the equivalent of SQL's query `like '%string%'`

In [None]:
# A compiled regex in the filter does a substring match; IGNORECASE makes it case-insensitive.
electron_pattern = re.compile("electron", re.IGNORECASE)
for product_doc in db.products.find({"category": electron_pattern}):
    print(product_doc)

find all products with price less than 40 AND category Clothing

In [None]:
# Several keys in one filter document are combined with AND.
cheap_clothing = {
    'price': {'$lt': 40},
    'category': 'Clothing',
}
for product_doc in db.products.find(cheap_clothing):
    print(product_doc)

## Orders

In [None]:
# Calculate the total number of orders.
# Orders are embedded in each customer document, so they are flattened first.
pipeline = [
    # one output document per element of each customer's 'orders' array
    {
        '$unwind': '$orders'
    },
    # a single group (_id None) that adds 1 for every order
    {
        '$group': {
            '_id': None,
            'total_orders': {'$sum': 1}
        }
    },
    # keep only the count in the result document
    {
        '$project': {
            '_id': 0,
            'total_orders': 1
        }
    }
]

result = db.customers.aggregate(pipeline)

# Extract the total number of orders.
# If no order reaches $group the cursor is empty; fall back to 0 instead of
# letting next() raise StopIteration.
total_orders = next(result, {'total_orders': 0})['total_orders']

# Print the total number of orders
print(f"Total Orders: {total_orders}")

In [None]:
# Only orders with fewer units than this threshold are counted.
# NOTE(review): gen_orders() never produces more than 1,000 units, so with the
# generated data this threshold matches every order; lower it to see an effect.
filter_criteria = {
    'orders.units': {'$lt': 45000}
}

# Calculate the total number of orders whose units satisfy filter_criteria
pipeline = [
    # one output document per element of each customer's 'orders' array
    {
      '$unwind': '$orders'
    },
    # applied after $unwind so the filter is evaluated per order, not per customer
    {
      '$match': filter_criteria
    },
    # a single group (_id None) that adds 1 for every remaining order
    {
        '$group': {
            '_id': None,
            'total_orders': {'$sum': 1}
        }
    },
    # keep only the count in the result document
    {
        '$project': {
            '_id': 0,
            'total_orders': 1
        }
    }
]

result = db.customers.aggregate(pipeline)

# Extract the total number of orders.
# If nothing matches the filter the cursor is empty; fall back to 0 instead of
# letting next() raise StopIteration.
total_orders = next(result, {'total_orders': 0})['total_orders']

# Print the total number of orders
print(f"Total Orders: {total_orders}")

# Extraction


## Dump mongodb data to json file.

Create a JSON file from the documents returned by a query (here `find({})`, i.e. every document in the collection).

In [None]:
from bson.json_util import dumps
import json

Open a file, create a cursor, and pass the cursor to `dumps`. `dumps` (from `bson.json_util`) iterates over the cursor and serializes the documents, including MongoDB-specific types such as `ObjectId` and dates, into a single JSON string, which is then written to the file.

## dump orders

## dump customers

(and orders as they are nested)

In [None]:
# Write every customer document (with its nested orders) to customers.json.
with open('customers.json', 'w') as out_file:
    customers_json = dumps(db.customers.find({}))
    out_file.write(customers_json)

## dump products

In [None]:
# Write every product document to products.json.
with open('products.json', 'w') as out_file:
    products_json = dumps(db.products.find({}))
    out_file.write(products_json)

In [None]:
# Confirm the JSON files were written and show their sizes.
!ls -lh *.json

# Reading JSON file data into Python

## JSON into Pandas DataFrame

In [None]:
import pandas as pd

In [None]:
# Load the dumped customers; each row's 'orders' value is still a nested list at this point.
customers_df = pd.read_json('customers.json')
customers_df.head(2)

In [None]:
# Load the dumped products into a DataFrame.
products_df = pd.read_json('products.json')
products_df.head(2)

### Merge DataFrames

In [None]:
# Before exploding: one row per customer, with all of that customer's orders in one cell.
customers_df.head(4)

In [None]:
# explode() gives each order its own row, but it emits NaN for customers whose
# 'orders' list is empty and repeats the original index for every order.
# Drop the order-less rows and renumber so each row is exactly one order.
orders_df = (
    customers_df
    .explode('orders')
    .dropna(subset=['orders'])
    .reset_index(drop=True)
)

In [None]:
# Turn each order dict into its own columns and put them next to the customer columns.
order_fields = orders_df['orders'].apply(pd.Series)
customer_fields = orders_df.drop(columns=['orders'])
df_orders_expanded = pd.concat([customer_fields, order_fields], axis=1)
df_orders_expanded.head(3)


In [None]:
# Left join: keep every order row and attach the matching product's columns.
customer_product_orders = pd.merge(
    df_orders_expanded,
    products_df,
    how='left',
    on='productid',
)
customer_product_orders.head(3)

In [None]:
# Revenue per order row = unit price x units ordered.
unit_price = customer_product_orders['price']
units_ordered = customer_product_orders['units']
customer_product_orders['total_sales'] = unit_price * units_ordered

In [None]:
# Total sales per product category, largest first.
sales_by_category = customer_product_orders.groupby('category')[['total_sales']].sum()
sales_by_category.sort_values('total_sales', ascending=False)

How to unzip a zip file.

In [None]:
# Download the zipped patient records; -O sets the local file name.
!wget -O patient_records.zip https://github.com/matthewpecsok/data_engineering/raw/main/data/patient_records.zip

In [None]:
# Check that the download is present in the working directory.
!ls -l

In [None]:
# -o overwrites existing files without prompting, so the cell can be re-run
# without stalling on a "replace file?" question.
!unzip -o patient_records.zip

In [None]:
# Download the medication database file; -O sets the local file name.
!wget -O medication_database.db https://github.com/matthewpecsok/data_engineering/raw/main/data/medication_database.db

In [None]:
import sqlite3
import pandas as pd

In [None]:
# Open the downloaded database file; the connection is reused by the queries below.
medication_con = sqlite3.connect('medication_database.db')

# sqlite_master is queried first to see which objects the database defines.
pd.read_sql_query('SELECT * FROM sqlite_master', medication_con)

In [None]:
# Load the full medications table into a DataFrame for display.
pd.read_sql_query('SELECT * FROM medications', medication_con)

In [None]:
import json

In [None]:
# Parse the first batch of patient records.
# The context manager closes the file handle as soon as parsing finishes
# (a bare open() inside json.load() would leave it open).
with open('patient_records_batch_1.json') as patients_file:
    patients_1_batch = json.load(patients_file)

In [None]:
# Number of top-level entries in the parsed batch.
len(patients_1_batch)

In [None]:
# Preview the first 20 entries (assumes the file's top level is a JSON array — confirm).
patients_1_batch[0:20]