# MongoDB

In [1]:
from pymongo import MongoClient
import requests

## Part I: Basics
- example doc
- count *
- basic filtering

In [2]:
# client is a dictionary of databases
# database is a dictionary of collections
# databases are attributes of a client
# collections are attributes of databases

In [3]:
client = MongoClient() # client is a dictionary of databases
# client.drop_database('nobel') # drop db. If it already exists, we keep inserting documents into the same db
db = client["nobel"] # create local db on the fly 

In [4]:
collections = ["prizes", "laureates"]

In [5]:
for collection in collections:
    print(f"{collection.strip('s')}")
    response = requests.get(f"http://api.nobelprize.org/v1/{collection.strip('s')}.json") # ping the api
    documents = response.json()[collection] # convert data to json
    db[collection].insert_many(documents) # create colletions on the fly

prize
laureate


In [6]:
db.prizes.find_one() # see an example "record"

{'_id': ObjectId('5eb5d046b426c1f9fd9f82e3'),
 'year': '2019',
 'category': 'chemistry',
 'laureates': [{'id': '976',
   'firstname': 'John',
   'surname': 'Goodenough',
   'motivation': '"for the development of lithium-ion batteries"',
   'share': '3'},
  {'id': '977',
   'firstname': 'M. Stanley',
   'surname': 'Whittingham',
   'motivation': '"for the development of lithium-ion batteries"',
   'share': '3'},
  {'id': '978',
   'firstname': 'Akira',
   'surname': 'Yoshino',
   'motivation': '"for the development of lithium-ion batteries"',
   'share': '3'}]}

In [7]:
db.laureates.count_documents({}) #count number of documents. Need to pass empty filter `{}` to count all

1898

In [8]:
# Cout documents
db.prizes.count_documents({})

1298

In [9]:
# Unique values
db.laureates.distinct("gender")

['female', 'male', 'org']

In [10]:
# count with a filter applied
db.laureates.count_documents({'gender': 'female'})

110

In [11]:
db.laureates.count_documents({'gender': 'male'})

1739

In [12]:
db.laureates.count_documents({'gender': 'org'})

49

## Dot notation: reach into substructure

#### Use a filter document (criteria) to find a document for a laureate with at least two elements in its "prizes" (note zero-based indexing).

In [13]:
# Filter for laureates with at least two prizes
criteria = {"prizes.1": {"$exists": True}} # a second item exists the list of laureate's prizes

# Find one laureate with at least two prizes
doc = db.laureates.find_one(criteria)

# Print the document
print(doc)

doc

{'_id': ObjectId('5eb5d04cb426c1f9fd9f856e'), 'id': '6', 'firstname': 'Marie', 'surname': 'Curie', 'born': '1867-11-07', 'died': '1934-07-04', 'bornCountry': 'Russian Empire (now Poland)', 'bornCountryCode': 'PL', 'bornCity': 'Warsaw', 'diedCountry': 'France', 'diedCountryCode': 'FR', 'diedCity': 'Sallanches', 'gender': 'female', 'prizes': [{'year': '1903', 'category': 'physics', 'share': '4', 'motivation': '"in recognition of the extraordinary services they have rendered by their joint researches on the radiation phenomena discovered by Professor Henri Becquerel"', 'affiliations': [[]]}, {'year': '1911', 'category': 'chemistry', 'share': '1', 'motivation': '"in recognition of her services to the advancement of chemistry by the discovery of the elements radium and polonium, by the isolation of radium and the study of the nature and compounds of this remarkable element"', 'affiliations': [{'name': 'Sorbonne University', 'city': 'Paris', 'country': 'France'}]}]}


{'_id': ObjectId('5eb5d04cb426c1f9fd9f856e'),
 'id': '6',
 'firstname': 'Marie',
 'surname': 'Curie',
 'born': '1867-11-07',
 'died': '1934-07-04',
 'bornCountry': 'Russian Empire (now Poland)',
 'bornCountryCode': 'PL',
 'bornCity': 'Warsaw',
 'diedCountry': 'France',
 'diedCountryCode': 'FR',
 'diedCity': 'Sallanches',
 'gender': 'female',
 'prizes': [{'year': '1903',
   'category': 'physics',
   'share': '4',
   'motivation': '"in recognition of the extraordinary services they have rendered by their joint researches on the radiation phenomena discovered by Professor Henri Becquerel"',
   'affiliations': [[]]},
  {'year': '1911',
   'category': 'chemistry',
   'share': '1',
   'motivation': '"in recognition of her services to the advancement of chemistry by the discovery of the elements radium and polonium, by the isolation of radium and the study of the nature and compounds of this remarkable element"',
   'affiliations': [{'name': 'Sorbonne University',
     'city': 'Paris',
     '

In [14]:
db.laureates.count_documents(criteria)

12

## Part II: Distinct Values

### Pre-filtering distinct values

Prizes can be shared among multiple people. Only the literature prize category has no prizes shared by three or more laureates.

In [15]:
# Save a filter for prize documents with three or more laureates
criteria = {"laureates.2": {"$exists": True}}

# Save the set of distinct prize categories in documents satisfying the criteria
triple_play_categories = set(db.prizes.distinct("category", criteria))
print(triple_play_categories)
# Confirm literature as the only category not satisfying the criteria.
assert set(db.prizes.distinct("category")) - triple_play_categories == {"literature"}
# print(triple_play_categories)

{'peace', 'physics', 'medicine', 'chemistry', 'economics'}


### Array fields and operators

In [16]:
db.prizes.distinct("category")

['chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics']

In [17]:
db.laureates.count_documents({})

1898

In [18]:
db.laureates.count_documents({"prizes.category": "physics"})

427

In [19]:
db.laureates.count_documents({"prizes.category": {"$ne": "physics"}})

1471

In [20]:
# number of laureates documents whose prizes are  in physics, chemistry, or medicine
db.laureates.count_documents({
"prizes.category": {
"$in": ["physics", "chemistry", "medicine"]}})

1234

In [21]:
# number laureates documents whose prizes are NOT in physics, chemistry, or medicine
db.laureates.count_documents({
"prizes.category": {
"$nin": ["physics", "chemistry", "medicine"]}})

664

### $elemMatch

In [22]:
db.laureates.count_documents({
"prizes": {
"category": "physics", "share": "1"}})

0

In [23]:
db.laureates.count_documents({
"prizes.category": "physics", "prizes.share": "1"})

96

In [24]:
db.laureates.count_documents({
"prizes": {"$elemMatch":
{"category": "physics", "share": "1"}}})

94

In [25]:
db.laureates.count_documents({
"prizes": {"$elemMatch": {
"category": "physics",
"share": "1",
"year": {"$lt": "1945"},}}})

58

### Filtering with Regular Expressions

In [26]:
# Finding a substring with $regex
db.laureates.distinct("bornCountry",
{"bornCountry": {"$regex": "Poland"}})

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

In [27]:
from bson.regex import Regex
db.laureates.distinct("bornCountry",
{"bornCountry": Regex("^Poland")})

['Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)']

In [28]:
db.laureates.distinct(
"bornCountry",
{"bornCountry": Regex("^Poland \(now")})

['Poland (now Belarus)', 'Poland (now Lithuania)', 'Poland (now Ukraine)']

In [29]:
# Ends wih 'now Poland'
db.laureates.distinct(
"bornCountry",
{"bornCountry": Regex("now Poland\)$")})

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

## Part III: The Business
- Projections
- Sorting
- Indexes
- Limits, Skips, and Sorts

### Projections

In [30]:
docs = db.laureates.find(
        filter={},
        projection={"prizes.affiliations": 1,
                    "_id": 0})
type(docs)

pymongo.cursor.Cursor

In [31]:
# convert to list and slice
list(docs)[:3]

[{'prizes': [{'affiliations': [{'name': 'Munich University',
      'city': 'Munich',
      'country': 'Germany'}]}]},
 {'prizes': [{'affiliations': [{'name': 'Leiden University',
      'city': 'Leiden',
      'country': 'the Netherlands'}]}]},
 {'prizes': [{'affiliations': [{'name': 'Amsterdam University',
      'city': 'Amsterdam',
      'country': 'the Netherlands'}]}]}]

### Sorting

#### Sorting post-query with Python

In [32]:
docs = list(db.prizes.find({"category": "physics"}, ["year"]))
print([doc["year"] for doc in docs][:5])

['2019', '2018', '2017', '2016', '2015']


In [33]:
from operator import itemgetter
docs = sorted(docs, key=itemgetter("year"))
print([doc["year"] for doc in docs][:5])

['1901', '1901', '1902', '1902', '1903']


In [34]:
docs = sorted(docs, key=itemgetter("year"), reverse=True)
print([doc["year"] for doc in docs][:5])

['2020', '2019', '2019', '2018', '2018']


#### Sorting in-query with MongoDB

In [35]:
cursor = db.prizes.find({"category": "physics"}, ["year"],
                        sort=[("year", 1)])
print([doc["year"] for doc in cursor][:5])

['1901', '1901', '1902', '1902', '1903']


In [36]:
cursor = db.prizes.find({"category": "physics"}, ["year"],
                        sort=[("year", -1)])
print([doc["year"] for doc in cursor][:5])

['2020', '2019', '2019', '2018', '2018']


#### Primary and secondary sorting

In [37]:
for doc in db.prizes.find(
    {"year": {"$gt": "1966", "$lt": "1970"}},
    ["category", "year"],
    sort=[("year", 1), ("category", -1)]):
    
    print("{year} {category}".format(**doc))

1967 physics
1967 physics
1967 peace
1967 peace
1967 medicine
1967 medicine
1967 literature
1967 literature
1967 chemistry
1967 chemistry
1968 physics
1968 physics
1968 peace
1968 peace
1968 medicine
1968 medicine
1968 literature
1968 literature
1968 chemistry
1968 chemistry
1969 physics
1969 physics
1969 peace
1969 peace
1969 medicine
1969 medicine
1969 literature
1969 literature
1969 economics
1969 economics
1969 chemistry
1969 chemistry


### When to use indexes?
- Queries with high specicity
- Large documents
- Large collections

### Skips and paging through results

In [38]:
for doc in db.prizes.find({"laureates.share": "3"}, limit=3):
    print("{year} {category}".format(**doc))

2019 chemistry
2019 economics
2019 medicine


In [40]:
for doc in db.prizes.find({"laureates.share": "3"}, skip=3, limit=3):
    print("{year} {category}".format(**doc))

2017 chemistry
2017 medicine
2016 chemistry


In [41]:
for doc in db.prizes.find({"laureates.share": "3"}, skip=6, limit=3):
    print("{year} {category}".format(**doc))

2015 chemistry
2014 chemistry
2014 physics


### Using cursor methods for {sort, skip, limit}

In [42]:
for doc in db.prizes.find({"laureates.share": "3"}).limit(10):
    print("{year} {category}".format(**doc))

2019 chemistry
2019 economics
2019 medicine
2017 chemistry
2017 medicine
2016 chemistry
2015 chemistry
2014 chemistry
2014 physics
2013 chemistry


In [43]:
for doc in (db.prizes.find({"laureates.share": "3"}).skip(3).limit(3)):
    print("{year} {category}".format(**doc))

2017 chemistry
2017 medicine
2016 chemistry


In [44]:
for doc in (db.prizes.find({"laureates.share": "3"})
            .sort([("year", 1)])
            .skip(3)
            .limit(3)):
    print("{year} {category}".format(**doc))

1945 medicine
1950 medicine
1950 medicine


## Part IV:  Aggregation Pipelines
Let the Server Do It For You

In [45]:
cursor = db.laureates.find(
    filter={"bornCountry": "USA"},
    projection={"prizes.year": 1},
    limit=3
    )
for doc in cursor:
    print(doc["prizes"])

[{'year': '1923'}]
[{'year': '1927'}]
[{'year': '1936'}]


In [46]:
cursor = db.laureates.aggregate([
    {"$match": {"bornCountry": "USA"}},
    {"$project": {"prizes.year": 1}},
    {"$limit": 3}
    ])
for doc in cursor:
    print(doc["prizes"])

[{'year': '1923'}]
[{'year': '1927'}]
[{'year': '1936'}]


In [47]:
list(db.laureates.aggregate([
{"$match": {"bornCountry": "USA"}},
{"$count": "n_USA-born-laureates"}
]))


[{'n_USA-born-laureates': 551}]

In [48]:
db.laureates.count_documents({"bornCountry": "USA"})

551

In [49]:
cursor = db.laureates.aggregate([
        {"$project": {"solo_winner": {"$in": ["1","$prizes.share"]}}}
        ])

In [50]:
cursor.next()

{'_id': ObjectId('5eb5d04cb426c1f9fd9f8569'), 'solo_winner': True}

In [51]:
list(db.laureates.aggregate([
{"$project": {"n_prizes": {"$size": ["$prizes"]}}},
{"$group": {"_id": None,
"n_prizes_total": {"$sum": "$n_prizes"}}}
]))

[{'_id': None, 'n_prizes_total': 1912}]

In [52]:
{ "$ifNull": [ "$myFieldArray", [] ] }

{'$ifNull': ['$myFieldArray', []]}

### Sizing and Summing

In [53]:
list(db.prizes.aggregate([
{"$project": {"n_laureates": {"$size": {"$ifNull": ["$laureates", []]}},
"year": 1, "category": 1, "_id": 0}}
]))[:5]

[{'year': '2019', 'category': 'chemistry', 'n_laureates': 3},
 {'year': '2019', 'category': 'economics', 'n_laureates': 3},
 {'year': '2019', 'category': 'literature', 'n_laureates': 1},
 {'year': '2019', 'category': 'peace', 'n_laureates': 1},
 {'year': '2019', 'category': 'physics', 'n_laureates': 3}]

In [54]:
result = \
list(db.prizes.aggregate([
    {"$project": {"n_laureates": {"$size": {"$ifNull": ["$laureates", []]}},
                  "category": 1}},
    {"$group": {"_id": "$category", "n_laureates":
    {"$sum": "$n_laureates"}}},
    {"$sort": {"n_laureates": -1}},
]))


In [55]:
result

[{'_id': 'medicine', 'n_laureates': 441},
 {'_id': 'physics', 'n_laureates': 429},
 {'_id': 'chemistry', 'n_laureates': 370},
 {'_id': 'peace', 'n_laureates': 269},
 {'_id': 'literature', 'n_laureates': 233},
 {'_id': 'economics', 'n_laureates': 170}]

In [56]:
import pandas as pd

In [57]:
pd.DataFrame(result)

Unnamed: 0,_id,n_laureates
0,medicine,441
1,physics,429
2,chemistry,370
3,peace,269
4,literature,233
5,economics,170


### Zoom into Array Fields with  `$unwind`

### How many prizes were awarded to immigrants?
How many prizes were awarded to people who had no affiliation in their country of birth at the time of the award?

In [58]:
pipeline = [
    # Limit results to people; project needed fields; unwind prizes
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
  
    # Count prizes with no country-of-birth affiliation
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]

In [59]:
print(list(db.laureates.aggregate(pipeline)))

[{'awardedElsewhere': 935}]


In [60]:
db.laureates.find_one()

{'_id': ObjectId('5eb5d04cb426c1f9fd9f8569'),
 'id': '1',
 'firstname': 'Wilhelm Conrad',
 'surname': 'Röntgen',
 'born': '1845-03-27',
 'died': '1923-02-10',
 'bornCountry': 'Prussia (now Germany)',
 'bornCountryCode': 'DE',
 'bornCity': 'Lennep (now Remscheid)',
 'diedCountry': 'Germany',
 'diedCountryCode': 'DE',
 'diedCity': 'Munich',
 'gender': 'male',
 'prizes': [{'year': '1901',
   'category': 'physics',
   'share': '1',
   'motivation': '"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him"',
   'affiliations': [{'name': 'Munich University',
     'city': 'Munich',
     'country': 'Germany'}]}]}

# The End.