# homework 7: `mongodb`

## `e-library`: parts 1 - 3

### Insert Data: part 1 & 2

In [179]:
import json
from bson.json_util import dumps
from pymongo import MongoClient

# initialize a client object
client = MongoClient("localhost", 27017)

# select db
db = client["library"] # or client.<name>

# select collection
ebooks = db["ebooks"]
ebooks.delete_many({})

# insert first documents
docs = [
    {
        "title": "The Elements of Statistical Learning: Data Mining, Inference, and Prediction",
        "primary_author": "Trevor Hastie",
        "secondary_author": ["Robert Tibshirani", "Jerome H. Friedman"],
        "published": 2001,
        "pages": 745,
        "publisher": "Springer",
        "topic": "statistics"
    },
    {
        "title": "Deep Learning",
        "primary_author": "Ian Goodfellow",
        "secondary_author": ["Yoshua Bengio", "Aaron Courville"],
        "published": 2016,
        "pages": 800,
        "publisher": "MIT Press",
        "topic": "deep learning"
    }
]

ebooks.insert_many(docs)
print(client.list_database_names())
print(db.list_collection_names())

['admin', 'config', 'congress', 'library', 'local', 'mongoTest']
['users', 'ebooks', 'checkouts']


In [180]:
docs = [
    {
        "title": "Zorrie",
        "primary_author": "Laird Hunt",
        "published": 2021,
        "pages": 400,
        "publisher": "Quercus",
        "topic": "fiction"
    },
    {
        "title": "Cicada",
        "primary_author": "Phoebe Giannisi",
        "published": 2022,
        "pages": 128,
        "publisher": "New Directions",
        "translator": "Brian Sneeden",
        "topic": "poetry"
    },
]

ebooks.insert_many(docs)

InsertManyResult([ObjectId('69228574944d64b91aacb3eb'), ObjectId('69228574944d64b91aacb3ec')], acknowledged=True)

In [181]:
users = db["users"]
users.delete_many({})

docs = [
    {
        "id": 1001,
        "name": "Pranav Gundrala",
        "phone": "123-456-7899",
        "address": "69 Brown St, Providence, RI 02912",
        "university": "Brown University"
    },
    {
        "id": 1003,
        "name": "John Doe",
        "phone": "246-810-1214",
        "address": "123 Main St, Providence, RI 02906",
        "university": "University of Rhode Island"
    },
    {
        "id": 1002,
        "name": "Jane Doe",
        "phone": "369-121-5182",
        "address": "456 Benefit St, Providence, RI 02906",
        "university": "Brown University"
    }
]

users.insert_many(docs)
print(db.list_collection_names())

['users', 'ebooks', 'checkouts']


In [182]:
checkouts = db["checkouts"]
checkouts.delete_many({})

docs = [
    {
        "date": "2025-11-22",
        "book": "The Elements of Statistical Learning: Data Mining, Inference, and Prediction",
        "user": 1003
    },
    {
        "date": "2025-09-16",
        "book": "The Elements of Statistical Learning: Data Mining, Inference, and Prediction",
        "user": 1002
    },
    {
        "date": "2025-09-25",
        "book": "Deep Learning",
        "user": 1002
    },
    {
        "date": "2025-11-21",
        "book": "Cicada",
        "user": 1001
    },
    {
        "date": "2025-10-13",
        "book": "Zorrie",
        "user": 1001
    },
    {
        "date": "2025-10-15",
        "book": "Deep Learning",
        "user": 1001
    },
]

checkouts.insert_many(docs)
print(db.list_collection_names())

['users', 'ebooks', 'checkouts']


### Queries: part 3

#### query 1

In [183]:
# 1. Which users have checked out 'Elements of Statistical Learning: Data Mining, Inference, and Prediction'?
query = {
    "book": "The Elements of Statistical Learning: Data Mining, Inference, and Prediction"
}
projection = {"_id" : 0, "user" : 1}

ids = checkouts.find(query, projection)
ids = [x['user'] for x in ids]

query = {
    "id": {"$in": ids}
}

result = users.find(query)
print("Users who have checked out 'Elements of Statistical Learning...'")
for x in result: print(dumps(x,indent=4))

Users who have checked out 'Elements of Statistical Learning...'
{
    "_id": {
        "$oid": "69228574944d64b91aacb3ee"
    },
    "id": 1003,
    "name": "John Doe",
    "phone": "246-810-1214",
    "address": "123 Main St, Providence, RI 02906",
    "university": "University of Rhode Island"
}
{
    "_id": {
        "$oid": "69228574944d64b91aacb3ef"
    },
    "id": 1002,
    "name": "Jane Doe",
    "phone": "369-121-5182",
    "address": "456 Benefit St, Providence, RI 02906",
    "university": "Brown University"
}


#### query 2

In [184]:
# 2. Which users from Brown University have checked out books on Deep Learning?
query = {
    "topic": "deep learning"
}
projection = {"_id": 0, "title": 1}
titles = ebooks.find(query, projection)
titles = [x['title'] for x in titles]

query = {
    "book": {"$in": titles}
}
projection = {"_id": 0, "user": 1}
ids = checkouts.find(query, projection)
ids = [x['user'] for x in ids]

query = {
    "id": {"$in": ids},
    "university": "Brown University"
}
result = users.find(query)
print("Users from 'Brown University' who have checked out books on 'deep learning':")
for x in result: print(dumps(x,indent=4))

Users from 'Brown University' who have checked out books on 'deep learning':
{
    "_id": {
        "$oid": "69228574944d64b91aacb3ed"
    },
    "id": 1001,
    "name": "Pranav Gundrala",
    "phone": "123-456-7899",
    "address": "69 Brown St, Providence, RI 02912",
    "university": "Brown University"
}
{
    "_id": {
        "$oid": "69228574944d64b91aacb3ef"
    },
    "id": 1002,
    "name": "Jane Doe",
    "phone": "369-121-5182",
    "address": "456 Benefit St, Providence, RI 02906",
    "university": "Brown University"
}


In [185]:
# 2. Using an $lookup function ... 
query = [
    # join book info to checkouts
    {
        "$lookup": {
            "from": "ebooks",
            "localField": "book",
            "foreignField": "title",
            "as": "book_info"
        }
    },
    # filter for 'deep learning'
    { "$match": { "book_info.topic": 'deep learning'}},
    # join user info to checkouts
    {
        "$lookup": {
            "from": "users",
            "localField": "user",
            "foreignField": "id",
            "as": "user_info"
        }
    },
    # filter for 'Brown University'
    { "$match": {"user_info.university": "Brown University"}},
    {
        "$project": {
            "_id": 0,
            "user_info": 1
        }
    }
]

result = checkouts.aggregate(query)
for x in result: print(dumps(x,indent=4))

{
    "user_info": [
        {
            "_id": {
                "$oid": "69228574944d64b91aacb3ef"
            },
            "id": 1002,
            "name": "Jane Doe",
            "phone": "369-121-5182",
            "address": "456 Benefit St, Providence, RI 02906",
            "university": "Brown University"
        }
    ]
}
{
    "user_info": [
        {
            "_id": {
                "$oid": "69228574944d64b91aacb3ed"
            },
            "id": 1001,
            "name": "Pranav Gundrala",
            "phone": "123-456-7899",
            "address": "69 Brown St, Providence, RI 02912",
            "university": "Brown University"
        }
    ]
}


#### query 3

In [186]:
# 3. How many times is the book 'Deep Learning' been checked out?
query = {
    "book": "Deep Learning"
}
result = checkouts.count_documents(query)
print(f"'Deep Learning' has been checked out {result} times.")

'Deep Learning' has been checked out 2 times.


## library of congress dataset: part 4

### Insert Data

In [187]:
# initialize a client object
client = MongoClient("localhost", 27017)
# select db
db = client["congress"]
# select collection
books = db["books"]
books.delete_many({})

DeleteResult({'n': 1000, 'ok': 1.0}, acknowledged=True)

In [188]:
# get documents and add
with open('./data/mongodb_sample.json') as f:
    docs = json.load(f)
books.insert_many(docs)

InsertManyResult([ObjectId('69228575944d64b91aacb3f7'), ObjectId('69228575944d64b91aacb3f8'), ObjectId('69228575944d64b91aacb3f9'), ObjectId('69228575944d64b91aacb3fa'), ObjectId('69228575944d64b91aacb3fb'), ObjectId('69228575944d64b91aacb3fc'), ObjectId('69228575944d64b91aacb3fd'), ObjectId('69228575944d64b91aacb3fe'), ObjectId('69228575944d64b91aacb3ff'), ObjectId('69228575944d64b91aacb400'), ObjectId('69228575944d64b91aacb401'), ObjectId('69228575944d64b91aacb402'), ObjectId('69228575944d64b91aacb403'), ObjectId('69228575944d64b91aacb404'), ObjectId('69228575944d64b91aacb405'), ObjectId('69228575944d64b91aacb406'), ObjectId('69228575944d64b91aacb407'), ObjectId('69228575944d64b91aacb408'), ObjectId('69228575944d64b91aacb409'), ObjectId('69228575944d64b91aacb40a'), ObjectId('69228575944d64b91aacb40b'), ObjectId('69228575944d64b91aacb40c'), ObjectId('69228575944d64b91aacb40d'), ObjectId('69228575944d64b91aacb40e'), ObjectId('69228575944d64b91aacb40f'), ObjectId('69228575944d64b91aacb4

### Queries

#### query 1

In [189]:
# debug #
# look at first book
# head = books.find_one({})
# print(dumps(head,indent=4))

In [200]:
# debug #
# import numpy as np
# test = books.find({}, {"_id": 0, "date": 1})
# test = np.array([int(x['date']) for x in test])
# print(test[test < 1800]) 
# expected answer is one book from 1780

In [191]:
# 1. What books available at the LoC were written before 1800?
query = {
    "date": {"$lt": "1800"}
}
projection = {"_id": 0, "item": {"title": 1}, "date": 1}
result = books.find(query, projection)
for x in result: print(dumps(x, indent=4))

{
    "date": "1780",
    "item": {
        "title": "Historie der waereld,"
    }
}


#### query 2

In [192]:
# 2. How many books are written in English?
query = {
    "language": "english"
}
result = books.count_documents(query)
print(f"{result} books are written in english")

918 books are written in english


#### query 3

In [193]:
# debug #
# for x in (books.find({}, {"_id":0, "item.contributors":1})): print(dumps(x,indent=4))
# there are books with "item" = {} and "contributors" missing

In [194]:
# 3. What books have more than 1 contributor? 
query = [
    # check that the item and contributors fields exist
    {
        "$match": {
            "item": {"$exists": True},
            "item.contributors": {"$exists": True, "$ne": []}
            }
    },
    # calculate the length
    {
        "$addFields": {
            "len": {"$size": "$item.contributors" }
            }
    },
    # check if the len > 1
    {
        "$match": {"len": {"$gt": 1}}
    },
    {
        "$project": {"_id":0, "title":1, "item.contributors":1}
    }
]

# print first 3 results
result = list(books.aggregate(query))
for x in result[:3]: print(dumps(x, indent=4))

{
    "item": {
        "contributors": [
            "Hyslop, James H. (James Hervey), 1854-",
            "Woodrow Wilson Collection (Library of Congress)"
        ]
    },
    "title": "Democracy; a study of government,"
}
{
    "item": {
        "contributors": [
            "Strong, Josiah, 1847-1916. [from old catalog]",
            "Congregational home missionary society. [from old catalog]"
        ]
    },
    "title": "Our country: its possible future and its present crisis."
}
{
    "item": {
        "contributors": [
            "Butler, James Davie, 1815-1905.",
            "Joseph Meredith Toner Collection (Library of Congress)"
        ]
    },
    "title": "Portraits of Columbus : A monograph"
}


#### query 4

In [197]:
# debug #
# books do exist with multiple languages
# query = [
#     {"$match": {"language": {"$exists": True, "$ne": []}}},
#     {"$addFields": {"len": {"$size": "$language" }}},
#     {"$match": {"len": {"$gt": 1}}},
#     {"$project": {"_id":0, "language":1, "title":1}}
# ]
# result = list(books.aggregate(query))
# for x in result[:1]: print(dumps(x, indent=4))

In [196]:
# 4. How many books per language does the data have? (Hint: use a pipeline)
pipeline = [
    # unwind for books with multiple languages
    { "$unwind": "$language" },
    # group by language and count docs
    { "$group": {"_id": "$language",
                 "count": {"$count": {} }
                 } },
    # sort descending
    { "$sort": {"count": -1}}
]
# print
for x in books.aggregate(pipeline): print(dumps(x,indent=4))

{
    "_id": "english",
    "count": 918
}
{
    "_id": "german",
    "count": 17
}
{
    "_id": "french",
    "count": 14
}
{
    "_id": "spanish",
    "count": 13
}
{
    "_id": "russian",
    "count": 9
}
{
    "_id": "latin",
    "count": 7
}
{
    "_id": "italian",
    "count": 6
}
{
    "_id": "flemish",
    "count": 4
}
{
    "_id": "dutch",
    "count": 4
}
{
    "_id": "czech",
    "count": 2
}
{
    "_id": "portuguese",
    "count": 2
}
{
    "_id": "polish",
    "count": 2
}
{
    "_id": "danish",
    "count": 2
}
{
    "_id": "englat",
    "count": 2
}
{
    "_id": "engspa",
    "count": 1
}
{
    "_id": "latgrc",
    "count": 1
}
{
    "_id": "latgre",
    "count": 1
}
{
    "_id": "multiple languages",
    "count": 1
}
{
    "_id": "swedish",
    "count": 1
}
{
    "_id": "engund",
    "count": 1
}
{
    "_id": "lithuanian",
    "count": 1
}
{
    "_id": "sanskrit",
    "count": 1
}
{
    "_id": "engger",
    "count": 1
}
{
    "_id": "lateng",
    "count": 1
}
