# Quiz 1

In [3]:
"""
Use an aggregation query to answer the following question. 

What is the most common city name in our cities collection?

Your first attempt probably identified None as the most frequently occurring
city name. What that actually means is that there are a number of cities
without a name field at all. It's strange that such documents would exist in
this collection and, depending on your situation, might actually warrant
further cleaning. 

To solve this problem the right way, we should really ignore cities that don't
have a name specified. As a hint ask yourself what pipeline operator allows us
to simply filter input? How do we test for the existence of a field?

Please modify only the 'make_pipeline' function so that it creates and returns
an aggregation pipeline that can be passed to the MongoDB aggregate function.
As in our examples in this lesson, the aggregation pipeline should be a list of
one or more dictionary objects. Please review the lesson examples if you are
unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you
want to run this code locally on your machine, you have to install MongoDB, 
download and insert the dataset. For instructions related to MongoDB setup and
datasets please see Course Materials.

Please note that the dataset you are using here is a different version of the
cities collection provided in the course materials. If you attempt some of the
same queries that we look at in the problem set, your results may be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

In [11]:
# For debugging
def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]

db = get_db('examples')

In [66]:
aggregate(db, [
    {"$match": {"country": {"$eq": "India"}}},
    {"$match": {"name": {"$exists": True, "$ne": None}}},
    {"$group": {"_id": "$name", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 5}
])

[{u'_id': u'Patan', u'count': 5},
 {u'_id': u'Shahpura', u'count': 4},
 {u'_id': u'Gangapur', u'count': 3},
 {u'_id': u'Deori', u'count': 3},
 {u'_id': u'Bilaspur', u'count': 3}]

In [67]:
def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [
        {"$match": {"country": {"$eq": "India"}}},
        {"$match": {"name": {"$exists": True, "$ne": None}}},
        {"$group": {"_id": "$name", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 1}
    ]
    return pipeline

In [68]:
def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]


if __name__ == '__main__':
    # The following statements will be used to test your code by the grader.
    # Any modifications to the code past this point will not be reflected by
    # the Test Run.
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    pprint.pprint(result[0])
    assert len(result) == 1
    #assert result[0] == {'_id': 'Shahpur', 'count': 6}


{u'_id': u'Patan', u'count': 5}


# Quiz 2

In [24]:
"""
Use an aggregation query to answer the following question. 

Which Region in India has the largest number of cities with longitude between
75 and 80?

Please modify only the 'make_pipeline' function so that it creates and returns
an aggregation pipeline that can be passed to the MongoDB aggregate function.
As in our examples in this lesson, the aggregation pipeline should be a list of
one or more dictionary objects. Please review the lesson examples if you are
unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you
want to run this code locally on your machine, you have to install MongoDB,
download and insert the dataset. For instructions related to MongoDB setup and
datasets please see Course Materials.

Please note that the dataset you are using here is a different version of the
cities collection provided in the course materials. If you attempt some of the
same queries that we look at in the problem set, your results may be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

In [28]:
# For debugging
def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]

db = get_db('examples')

In [29]:
aggregate(db, [
    {"$match": {"lon": {""}}}
    {"$limit": 1}
])

[{u'_id': ObjectId('5ca79c02873d8102a816e2ca'),
  u'country': u'India',
  u'elevation': 1855.0,
  u'isPartOf': [u'Jammu and Kashmir', u'Udhampur district'],
  u'lat': 75.28,
  u'lon': 33.08,
  u'name': u'Kud',
  u'population': 1140,
  u'timeZone': u'Indian Standard Time'}]

In [83]:
aggregate(db, [
    {"$match": {"country": {"$eq": "India"}}},
    {"$match": {"isPartOf": {"$exists": True, "$ne": None}}},
    {"$match": {"$and": [{"lon":{"$gte": 75.0}}, {"lon": {"$lte": 80.0}}]}},
    {"$unwind": "$isPartOf"},
    {"$group": {"_id": "$isPartOf", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 5}
])

[]

In [80]:
def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [
        {"$match": {"country": {"$eq": "India"}}},
        {"$match": {"isPartOf": {"$exists": True, "$ne": None}}},
        {"$match": {"$and": [{"lon":{"$gte": 75.0}}, {"lon": {"$lte": 80.0}}]}},
        {"$unwind": "$isPartOf"},
        {"$group": {"_id": "$isPartOf", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 1}
    ]
    return pipeline

In [81]:
def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]

if __name__ == '__main__':
    # The following statements will be used to test your code by the grader.
    # Any modifications to the code past this point will not be reflected by
    # the Test Run.
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    """import pprint
    pprint.pprint(result[0])
    assert len(result) == 1
    assert result[0]["_id"] == 'Tamil Nadu'
    assert result[0]["count"] == 424"""


In [82]:
result

[]

# Quiz 3

In [88]:
"""
Use an aggregation query to answer the following question. 

Extrapolating from an earlier exercise in this lesson, find the average
regional city population for all countries in the cities collection. What we
are asking here is that you first calculate the average city population for each
region in a country and then calculate the average of all the regional averages
for a country.
  As a hint, _id fields in group stages need not be single values. They can
also be compound keys (documents composed of multiple fields). You will use the
same aggregation operator in more than one stage in writing this aggregation
query. I encourage you to write it one stage at a time and test after writing
each stage.

Please modify only the 'make_pipeline' function so that it creates and returns
an aggregation  pipeline that can be passed to the MongoDB aggregate function.
As in our examples in this lesson, the aggregation pipeline should be a list of
one or more dictionary objects. Please review the lesson examples if you are
unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you
want to run this code locally on your machine, you have to install MongoDB,
download and insert the dataset. For instructions related to MongoDB setup and
datasets please see Course Materials.

Please note that the dataset you are using here is a different version of the
cities collection provided in the course materials. If you attempt some of the
same queries that we look at in the problem set, your results may be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

In [89]:
# For debugging
def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]
db = get_db('examples')

In [91]:
aggregate(db, [
    {"$limit": 1}
])

[{u'_id': ObjectId('5ca79c02873d8102a816e2ca'),
  u'country': u'India',
  u'elevation': 1855.0,
  u'isPartOf': [u'Jammu and Kashmir', u'Udhampur district'],
  u'lat': 75.28,
  u'lon': 33.08,
  u'name': u'Kud',
  u'population': 1140,
  u'timeZone': u'Indian Standard Time'}]

In [108]:
aggregate(db, [
    {"$match": {"population": {"$exists": True, "$ne": None},
                "country": {"$exists": True, "$ne": None}
               }},
    {"$unwind": "$isPartOf"},
    {"$group": {"_id": "$isPartOf", 
                "avg": {"$avg": "$population"}, 
                "country": {"$first": "$country"}}},
    {"$group": {"_id": "$country", "avg": {"$avg": "$avg"}}},
    {"$sort": {"avg": -1}}
    #{"$limit": 5}
])

[{u'_id': u'The_Democratic_Republic_Of_Congo', u'avg': 9046000.0},
 {u'_id': u'Turkey', u'avg': 1945457.0},
 {u'_id': u'United Kingdom', u'avg': 1864957.8653846155},
 {u'_id': u'Saudi Arabia', u'avg': 1826254.75},
 {u'_id': u'Cuba', u'avg': 1701228.0},
 {u'_id': u'China', u'avg': 1294543.23975458},
 {u'_id': u'Kenya', u'avg': 1226727.4285714286},
 {u'_id': u'Bangladesh', u'avg': 1193317.0925925926},
 {u'_id': u'Ghana', u'avg': 1185798.5},
 {u'_id': u'Tunisia', u'avg': 1138517.5},
 {u'_id': u'Honduras', u'avg': 1126534.0},
 {u'_id': u'Zambia', u'avg': 1103957.4285714286},
 {u'_id': u'Jamaica', u'avg': 937700.0},
 {u'_id': u'Togo', u'avg': 837437.0},
 {u'_id': u'Kyrgyzstan', u'avg': 835800.0},
 {u'_id': u'Dominican Republic', u'avg': 820869.6666666666},
 {u'_id': u'Pakistan', u'avg': 780520.9078469883},
 {u'_id': u'Egypt', u'avg': 682002.8125},
 {u'_id': u'Nigeria', u'avg': 635294.9708333333},
 {u'_id': u'Malaysia', u'avg': 592408.6333333333},
 {u'_id': u'North Korea', u'avg': 592281.444

In [111]:
def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [
        {"$match": {"population": {"$exists": True, "$ne": None},
                    "country": {"$exists": True, "$ne": None}
                   }},
        {"$unwind": "$isPartOf"},
        {"$group": {"_id": "$isPartOf", 
                    "avgInRegion": {"$avg": "$population"}, 
                    "country": {"$first": "$country"}}},
        {"$group": {"_id": "$country", "avgRegionalPopulation": {"$avg": "$avgInRegion"}}},
        {"$sort": {"avgRegionalPopulation": -1}}
    ]
    return pipeline

In [112]:
def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]

if __name__ == '__main__':
    # The following statements will be used to test your code by the grader.
    # Any modifications to the code past this point will not be reflected by
    # the Test Run.
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    if len(result) < 150:
        pprint.pprint(result)
    else:
        pprint.pprint(result[:100])
    key_pop = 0
    for country in result:
        if country["_id"] == 'Lithuania':
            assert country["_id"] == 'Lithuania'
            assert abs(country["avgRegionalPopulation"] - 14750.784447977203) < 1e-10
            key_pop = country["avgRegionalPopulation"]
    assert {'_id': 'Lithuania', 'avgRegionalPopulation': key_pop} in result

[{u'_id': u'The_Democratic_Republic_Of_Congo',
  u'avgRegionalPopulation': 9046000.0},
 {u'_id': u'Turkey', u'avgRegionalPopulation': 1945457.0},
 {u'_id': u'United Kingdom', u'avgRegionalPopulation': 1864957.8653846155},
 {u'_id': u'Saudi Arabia', u'avgRegionalPopulation': 1826254.75},
 {u'_id': u'Cuba', u'avgRegionalPopulation': 1701228.0},
 {u'_id': u'China', u'avgRegionalPopulation': 1294543.23975458},
 {u'_id': u'Kenya', u'avgRegionalPopulation': 1226727.4285714286},
 {u'_id': u'Bangladesh', u'avgRegionalPopulation': 1193317.0925925926},
 {u'_id': u'Ghana', u'avgRegionalPopulation': 1185798.5},
 {u'_id': u'Tunisia', u'avgRegionalPopulation': 1138517.5},
 {u'_id': u'Honduras', u'avgRegionalPopulation': 1126534.0},
 {u'_id': u'Zambia', u'avgRegionalPopulation': 1103957.4285714286},
 {u'_id': u'Jamaica', u'avgRegionalPopulation': 937700.0},
 {u'_id': u'Togo', u'avgRegionalPopulation': 837437.0},
 {u'_id': u'Kyrgyzstan', u'avgRegionalPopulation': 835800.0},
 {u'_id': u'Dominican Repub