# Homework 5
## Balázs Menkó (O67UT7)
---

Data source: http://insideairbnb.com/get-the-data/

Operators: https://www.mongodb.com/docs/manual/reference/operator/query/

CRUD operators: https://www.mongodb.com/docs/manual/crud/

Aggregation
- https://www.mongodb.com/docs/manual/aggregation/
- https://www.mongodb.com/docs/manual/reference/operator/aggregation-pipeline/
- https://www.mongodb.com/docs/manual/reference/operator/aggregation/

---


### Dataset: [Cambridge](https://insideairbnb.com/cambridge/)
---

install `pymongo`
```python
!pip install pymongo
```

In [1]:
# packages
import pandas as pd
from pymongo import MongoClient 

# functions
def show_result(pipeline):
    ''' Show the result of a pipeline in pandas dataframe format. '''
    return pd.DataFrame(list(collection.aggregate(pipeline)))

#### Needed packages
```python
import csv
import gzip
import shutil
import urllib.request
from pathlib import Path
from collections import defaultdict
```

#### Check files in `workdir` folder
```python
workdir = Path('mongo_data')
if not workdir.exists():
    workdir.mkdir()
    
list(workdir.iterdir())
```


#### Download files
```python
url='https://data.insideairbnb.com/united-states/ma/cambridge/2024-06-29/data/'

if not (workdir / 'listings.csv').exists():
    urllib.request.urlretrieve(url+'listings.csv.gz', workdir / 'listings.csv.gz')
    with gzip.open(workdir / 'listings.csv.gz', 'rb') as f_in:
        with open(workdir / 'listings.csv', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

if not (workdir / 'calendar.csv').exists():
    urllib.request.urlretrieve(url+'calendar.csv.gz', workdir / 'calendar.csv.gz')
    with gzip.open(workdir / 'calendar.csv.gz', 'rb') as f_in:
        with open(workdir / 'calendars.csv', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

if not (workdir / 'reviews.csv').exists():
    urllib.request.urlretrieve(url+'reviews.csv.gz', workdir / 'reviews.csv.gz')
    with gzip.open(workdir / 'reviews.csv.gz', 'rb') as f_in:
        with open(workdir / 'reviews.csv', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
```

In [2]:
# connect to the database
user, pwd = "tanar", "ahviGh2v"
client = MongoClient(f"mongodb://{user}:{pwd}@mongo-edu.db-test/sandbox")
db = client['sandbox']
collection = db['o67ut7']

---
#### Create collection from `listings.csv`
```python
listings = []
with open(workdir / 'listings.csv') as f_in:
    dict_reader = csv.DictReader(f_in)
    for row in dict_reader:
        row['_id'] = row.pop('id')
        for k in list(row):
            if row[k] == '':
                del row[k]
        listings.append(row)

collection.insert_many(listings)
```
#### Create collection from `reviews.csv`
```python
reviews = defaultdict(list)
with open(workdir / 'reviews.csv') as f_in:
    dict_reader = csv.DictReader(f_in)
    for row in dict_reader:
        listing_id = row.pop('listing_id')
        for k in list(row):
            if row[k] == '':
                del row[k]
        reviews[listing_id].append(row)

for listing_id, rev in reviews.items():
    collection.update_one({'_id': listing_id}, {'$set': {'reviews': rev}})
```

#### Create collection from `calendars.csv`
```python
calendars = defaultdict(list)
with open(workdir / 'calendars.csv') as f_in:
    dict_reader = csv.DictReader(f_in)
    for row in dict_reader:
        listing_id = row.pop('listing_id')
        for k in list(row):
            if row[k] == '':
                del row[k]
        calendars[listing_id].append(row)

for listing_id, cal in calendars.items():
    collection.update_one({'_id': listing_id}, {'$set': {'calendars': cal}})
```



In [3]:
collection_df = pd.DataFrame(list(collection.find()))
collection_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1214 entries, 0 to 1213
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   _id                                           1214 non-null   object
 1   listing_url                                   1214 non-null   object
 2   scrape_id                                     1214 non-null   object
 3   last_scraped                                  1214 non-null   object
 4   source                                        1214 non-null   object
 5   name                                          1214 non-null   object
 6   description                                   1196 non-null   object
 7   neighborhood_overview                         724 non-null    object
 8   picture_url                                   1214 non-null   object
 9   host_id                                       1214 non-null   object
 10  

In [4]:
# example pipeline from lecture notes
pipeline = [
    { '$match': { 'review_scores_rating': {'$ne': ''} } },
    {
        '$group': {
            '_id': 'average_user_rating',
            'value': {
                '$sum': {
                    '$convert': { 'input': '$review_scores_rating', 'to': 'double' }
                }
            }
        }
    }
]

show_result(pipeline)

Unnamed: 0,_id,value
0,average_user_rating,4278.05


---
# Task 1. 
List the best and worst 10 listings by "review_scores_rating"


In [5]:
# best 10
pipeline = [
    { '$match': { 'review_scores_rating': {'$ne': ''} } },  # filter for non-null ratings
    { '$sort': { 'review_scores_rating': -1 } },  # sort by rating in descending order
    { '$limit': 10 },  # get the top 10 listings
    { '$project': { '_id': 1, 'name': 1, 'review_scores_rating': 1 } }  
]

show_result(pipeline)

Unnamed: 0,_id,name,review_scores_rating
0,6374780,"Big 2 bedroom, steps to Davis Sq",5.0
1,7415807,Cambridge Single Family -- Families and Pets!,5.0
2,10066362,Clean+Furnished 1BR | Work Desk | Kendall Square,5.0
3,5016234,"Bright, Beautiful 2 Bed 2 Bath near Kendall/MIT",5.0
4,7886699,New Modern Studio-Walk to Harvard,5.0
5,7508244,PrivateRoom 10min MIT/Harvard !!!,5.0
6,14521992,Harvard Square 1br,5.0
7,13281865,"3rd fl studio, excellent light, 1 stop to Harvard",5.0
8,8064641,Spacious+ Clean 1BR | Work Desk | Kendall Square,5.0
9,7490808,Cozy Room 10min from MIT/Harvard,5.0


In [6]:
# worst 10
pipeline = [
    { '$match': { 'review_scores_rating': {'$ne': ''} } },
    { '$sort': { 'review_scores_rating': 1 } }, # sort by rating in ascending order
    { '$limit': 10 },
    { '$project': { '_id': 1, 'name': 1, 'review_scores_rating': 1 } }  
]

show_result(pipeline)

Unnamed: 0,_id,name
0,9109000,Comfy Bedroom in Cambridge
1,16013016,One Bedroom Short Walk to Harvard (Rm 12)
2,22629543,One bedroom at Harvard Square to share
3,1080789,Entire Fabulous home: Available occasionally!
4,4824130,Cambridge Modern Living
5,19847550,*New Construction Luxury Kendall Square Townho...
6,26978504,"Your own private bed, bath, and office/bonus r..."
7,22842459,Luxurious and Spacious 5 bedroom in Mid-Cambridge
8,20504268,Beautiful 2 Bed 2 Private Bath near MIT/Harvard
9,4894617,"Large, cozy 1BD in awesome location"


---
# Task 2.
Same for hosts (group by "host_id" and average)


In [7]:
# best 10
pipeline = [
    { '$match': { 'review_scores_rating': { '$ne': None } } },  # filter for valid ratings
    { '$group': {
        '_id': '$host_id',  # group by host_id
        'average_rating': {
            '$avg': { '$convert': { 'input': '$review_scores_rating', 'to': 'double' } } # calculate avg rating
            } 
        }
    },
    { '$sort': { 'average_rating': -1 } },  # sort by average rating in descending order
    { '$limit': 10 },  # Limit to top 10 hosts
    { '$project': { 'host_id': 1, 'average_rating': 1 } }  # project host_id and average rating
]

show_result(pipeline)

Unnamed: 0,_id,average_rating
0,468539024,5.0
1,561879186,5.0
2,428894262,5.0
3,497306866,5.0
4,20857768,5.0
5,97154914,5.0
6,384300855,5.0
7,102771443,5.0
8,2722041,5.0
9,61719177,5.0


In [8]:
# worst 10
pipeline = [
    { '$match': { 'review_scores_rating': { '$ne': None } } },  # filter for valid ratings
    { '$group': {
        '_id': '$host_id',  # group by host_id
        'average_rating': {
            '$avg': { '$convert': { 'input': '$review_scores_rating', 'to': 'double' } } # calculate avg rating
            } 
        }
    },
    { '$sort': { 'average_rating': 1 } },  # sort by average rating in ascending order
    { '$limit': 10 },  # Limit to top 10 hosts
    { '$project': { 'host_id': 1, 'average_rating': 1 } }  # project host_id and average rating
]

show_result(pipeline)

Unnamed: 0,_id,average_rating
0,122377513,1.0
1,50541668,1.0
2,70833557,1.0
3,2356643,3.5
4,67827905,4.0
5,21976411,4.0
6,107434423,4.2575
7,15154687,4.272105
8,9928124,4.29
9,44939469,4.293333


---
# Task 3. 
Identify the top 10 reviewers by the number of reviews produced (Hint: `$first`)

mongodb.com/**[\$unwind](https://www.mongodb.com/docs/manual/reference/operator/aggregation/unwind/)**

In [9]:
pipeline = [
    # "$unwind" stage flattens the reviews array so that each review becomes its own document
    { "$unwind": "$reviews" },
    # group by reviewer_id and reviewer_name, and count the reviews
    { "$group": {
        "_id": {
            "reviewer_id": "$reviews.reviewer_id", 
            "reviewer_name": "$reviews.reviewer_name" 
        },
        "review_count": {"$sum": 1}
        }
    },
    { "$sort": {"review_count": -1} },
    { "$limit": 10 },
    { "$project": { "reviewer_name": "$_id.reviewer_name", "review_count": 1 } }
]

df = show_result(pipeline)
df[['reviewer_name', 'review_count']]

Unnamed: 0,reviewer_name,review_count
0,Theresa,33
1,Danette,30
2,Betsy,18
3,Aman,17
4,Billy,16
5,Christopher,16
6,Clorice,15
7,Alex,15
8,Claire,15
9,Ken,14


---
# Task 4. 
What are the most popular host names in the region?


In [10]:
pipeline = [
    # group by host name and count the occurrences
    { "$group": {
            "_id": "$host_name", 
            "host_count": {"$sum": 1}
        }
    },
    { "$sort": {"host_count": -1} },
    { "$limit": 10 },
    { "$project": { "_id": 0, "host_name": "$_id", "host_count": 1 } }
]

df = show_result(pipeline)
# change the order of columns
df[['host_name', 'host_count']]

Unnamed: 0,host_name,host_count
0,Blueground,171
1,Steve,62
2,Liya,53
3,Yudong,29
4,Sofia,29
5,Hong,25
6,RoomPicks,24
7,Sophia,23
8,Riverside,23
9,Thatch,20


---
# Task 5.
Count the reviews that mention the word "weather" in the review? (Hint:`$regex`)


In [11]:
pipeline = [
    # unwind the reviews array to create one document per review
    { "$unwind": "$reviews" },
    # match reviews that contain the word "weather" (case-insensitive)
    { "$match": {
        "reviews.comments": {
            "$regex": "weather",  # search for the word "weather"
            "$options": "i"       # case-insensitive
            }
        }
    },
    { "$count": "weather_count" } 
]

show_result(pipeline)

Unnamed: 0,weather_count
0,345


---
# Task 6. 
When was the least amount of apartments available for rent?

In [12]:
pipeline = [
    { "$unwind": "$calendars" },
    { "$match": { "calendars.available": "t" } },
    # group by date and count how many apartments are available for each date
    { "$group": {
            "_id": "$calendars.date",
            "available_count": {"$sum": 1}
        }
    },
    {"$sort": {"available_count": 1}},
    {"$limit": 1}
]

show_result(pipeline)

Unnamed: 0,_id,available_count
0,2024-06-29,124


---
# Task 7. 
Verify "availability_365" by counting the number of available days in the calendars for all listings.

In [13]:
pipeline = [
    { '$project': {
        '_id': 1,
        'availability_365': { '$toInt': '$availability_365' },  # convert to integer
        'calculated_availability': {
            # calculate the size of filtered days where 'available' is 't' (true)
            '$size': {
                '$filter': {
                    'input': '$calendars', 'as': 'calendar_day',
                    # if the date is available: 't'
                    'cond': { '$eq': ['$$calendar_day.available', 't'] }
                    }
                }
            }
        }
    },
    { '$addFields': {
        'availability_match': { '$eq': ['$availability_365', '$calculated_availability'] } } 
    }
]

df = show_result(pipeline)
df

Unnamed: 0,_id,availability_365,calculated_availability,availability_match
0,8521,65,65,True
1,11169,350,350,True
2,19581,259,259,True
3,27498,283,283,True
4,79762,264,264,True
...,...,...,...,...
1209,1178564043662571328,33,33,True
1210,1179323162218520405,214,214,True
1211,1179880552871879807,260,260,True
1212,1187484571097707740,268,268,True


In [14]:
# where the availability_match is False
df[df.availability_match == False]

Unnamed: 0,_id,availability_365,calculated_availability,availability_match
131,9790965,0,241,False
365,34944649,0,106,False


---
# Task 8.
Assuming all available listings were purchased each day, what was the revenue for each listing?

#### Main problems:
- The price is in string format eg. `'$225.0'`
- Some price has comma in it eg. `'$1,500.0'`

In [15]:
pipeline = [
    # filter out available days and project necessary fields
    { '$project': {
        '_id': 1, 
        'available_days': {
            # filter the 'calendars' array for entries where 'available' is 't'
            '$filter': {
                'input': '$calendars', 'as': 'calendar_day', 
                # Condition: where 'available' is true ('t')
                'cond': { '$eq': ['$$calendar_day.available', 't'] }  
                }
            }
        }
    },
    # Convert the price field to a number and calculate revenue per available day
    {'$project': {
        '_id': 1,  
        'total_revenue_in_USD': {
            # sum of all available days' prices
            '$sum': {
                # convert each price (string with dollar sign and commas) to a number
                '$map': {
                    'input': '$available_days', 'as': 'day', 
                    'in': {
                        # remove commas and the dollar sign, then convert to a number
                        '$toDouble': {
                            '$replaceAll': {
                                'input': { '$substr': ['$$day.price', 1, -1] },  # remove the dollar sign
                                'find': ',', 'replacement': ''  # remove the commas
                                }
                            }
                        }
                    }
                }
            }
        }
    },
]

show_result(pipeline)

Unnamed: 0,_id,total_revenue_in_USD
0,8521,14625.0
1,11169,43750.0
2,19581,58275.0
3,27498,46695.0
4,79762,79200.0
...,...,...
1209,1178564043662571328,5940.0
1210,1179323162218520405,256800.0
1211,1179880552871879807,24700.0
1212,1187484571097707740,21440.0
