In [6]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('/Users/Nfaith21/ECS 116/')
import util_2 as util

<span style=color:blue>Getting mongodb connection set up</span>

In [7]:
# MongoClient is imported here, next to its first use, rather than in the top import cell.
from pymongo import MongoClient

# Called with no arguments; the two commented lines below spell out the
# equivalent explicit host/port and URI forms.
client = MongoClient()
# could have written client = MongoClient("localhost", 27017)
#                 or client = MongoClient("mongodb://localhost:27017/")

<span style=color:blue>Getting access to airbnb database</span>

In [8]:
# I have (or will have) a database "airbnb"
# `db` is the handle the later cells use to drop, fill and query collections.
db = client.airbnb


# List what the server currently holds, as a sanity check on the connection.
print('The list of all databases currently in the MongoDB client is:')
print(client.list_database_names())

print('\nThe list of all collections in the airbnb database is:')
print(db.list_collection_names())
# Note: calendar may not show up yet; it is created only when a first document is inserted into it

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['newlistings', 'listings', 'listings_with_reviews_m', 'listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'reviews']


### Datatypes Conversion and Clean up

In [9]:
# Input files: f is the listings export, f2 the reviews export.
# NOTE(review): both are absolute paths on one machine; the notebook will not run elsewhere without editing them.
f = '/Users/Nfaith21/Library/CloudStorage/GoogleDrive-nishafaith21@gmail.com/My Drive/UC Davis/Spring Quarter 2024/ECS 116/listings_202406060150.csv'
f2 = '/Users/Nfaith21/ECS 116/reviews.csv'

# Read the identifier columns (and the review date) as text rather than letting
# pandas infer a numeric type for them.
dtype = {"id":str, "host_id":str}
dtype2 = {"date": str, "listing_id":str, "id":str, "reviewer_id":str}

# keep_default_na=False leaves blank fields as empty strings instead of NaN.
df = pd.read_csv(f, dtype=dtype, keep_default_na=False)
df2 = pd.read_csv(f2, dtype=dtype2, keep_default_na=False)

# Convert reviews_per_month to numbers; errors='coerce' turns anything
# non-numeric (such as blank fields) into NaN.
df['reviews_per_month'] = pd.to_numeric(df['reviews_per_month'], errors='coerce')

In [31]:
# Show the column types and a few rows of each frame.
# df holds the listings, df2 the reviews; each message names the frame it describes.
print('The datatypes for the fields of df are:')
print(df.dtypes)

print('\nThe first few rows of df are:')
print(df.head())

print('The datatypes for the fields of df2 are:')
print(df2.dtypes)

print('\nThe first few rows of df2 are:')
print(df2.head())


The datatypes for the fields of df are:
id                                 object
name                               object
host_id                            object
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                              object
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
license                            object
dtype: object

The first few rows of df are:
                   id                                               name  \
0  977395984065981849      Home in Brooklyn · 1 bedroom · 1 bed · 1 bath   
1  7299

### Convert to Datetime and Remove NaTs

In [32]:
# also converts NaT to None, because MongoDB does not recognize NaT
def convert_date_str_to_datetime(dt):
    """Convert a ``'YYYY-MM-DD'`` style string to a naive ``datetime`` at midnight.

    Missing values are mapped to ``None`` so the result can be stored in
    MongoDB, which has no notion of pandas' ``NaT``.

    Parameters
    ----------
    dt : str, None, NaN or NaT
        Date text whose characters 0-3, 5-6 and 8-9 hold the year, month and
        day.  Anything after the tenth character (e.g. a time part) is ignored.

    Returns
    -------
    datetime.datetime or None
        Midnight of the given day, or ``None`` for ``None``, NaN, NaT and ``''``.

    Raises
    ------
    ValueError
        If the sliced fields are not integers or do not form a valid date.
    """
    # pd.isnull is True for None, NaN and NaT alike, so one test replaces the
    # separate `is None` and `dt != dt` checks (the latter was unreachable).
    # '' is how a blank CSV field arrives when read with keep_default_na=False.
    if pd.isnull(dt) or dt == '':
        return None

    # Build the value directly.  The previous timestamp()/fromtimestamp() round
    # trip interpreted the naive datetime in the machine's local time zone and
    # converted it straight back, which adds nothing and can shift or reject
    # dates the platform cannot represent.
    return datetime(int(dt[0:4]), int(dt[5:7]), int(dt[8:10]))


print(convert_date_str_to_datetime('2024-05-23'))

2024-05-23 00:00:00


In [33]:
# Parse the review date strings one value at a time with convert_date_str_to_datetime.
# NOTE(review): the printed frame shows date-only values, which suggests pandas
# re-infers a datetime64 column here; if so, any None returned by the helper
# would be stored as NaT again. Confirm before relying on None downstream.
df2['date'] = df2['date'].apply(convert_date_str_to_datetime)

print(df2.head())

  listing_id     id       date reviewer_id reviewer_name  \
0       2595  17857 2009-11-21       50679          Jean   
1       2595  19176 2009-12-05       53267          Cate   
2       2595  19760 2009-12-10       38960         Anita   
3       2595  34320 2010-04-09       71130       Kai-Uwe   
4       2595  46312 2010-05-25      117113        Alicia   

                                            comments  
0  Notre séjour de trois nuits.\r<br/>Nous avons ...  
1                                  Great experience.  
2  I've stayed with my friend at the Midtown Cast...  
3  We've been staying here for about 9 nights, en...  
4  We had a wonderful stay at Jennifer's charming...  


In [34]:
# need to strip the leading '$' from the price value, and remove commas
# converting price which is string to numeric
#
# astype(str) makes this cell safe to re-run: after the first run the column is
# already float, and calling .replace on each float (as a per-value lambda would)
# raises AttributeError.  The vectorised .str methods also avoid a Python-level
# call per row.  regex=False treats '$' as a literal character, not an anchor.
df['price'] = pd.to_numeric(
    df['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

print(type(df.loc[0,'price']))
print(df.head())

<class 'numpy.float64'>
                   id                                               name  \
0  977395984065981849      Home in Brooklyn · 1 bedroom · 1 bed · 1 bath   
1  729947657876634696  Rental unit in The Bronx · 1 bedroom · 1 bed ·...   
2  648033676238017128  Rental unit in Bronx · ★4.89 · 1 bedroom · 1 b...   
3  623137142536549768  Rental unit in Brooklyn · 1 bedroom · 1 bed · ...   
4  871990853610302281  Rental unit in Queens · ★New · 1 bedroom · Hal...   

     host_id        host_name neighbourhood_group   neighbourhood   latitude  \
0   95344065            Derek            Brooklyn  Sheepshead Bay  40.591790   
1     566660           Markus               Bronx        Longwood  40.827374   
2  421601513  J Carlos Retals               Bronx     Kingsbridge  40.863940   
3  106442885              Ava            Brooklyn   East New York  40.660730   
4  484563208           Tricia              Queens    Howard Beach  40.663586   

   longitude        room_type  price  

In [35]:
# Parse the last_review strings into a datetime column; missing values are
# handled later, when the records are built.
# NOTE(review): the saved output shows a warning pointing at this line, presumably
# because pandas could not infer a single date format. Passing an explicit
# format= would likely silence it; confirm the CSV's date format first.
df['last_review'] = pd.to_datetime(df['last_review'])

  df['last_review'] = pd.to_datetime(df['last_review'])


### Load Data into Dictionaries

In [36]:
# One plain dict per listing row; this list is what insert_many() later sends to MongoDB.
dict_full = df.to_dict('records')

def convert_date_to_datetime(dt):
    """Convert a date-like value to a plain ``datetime`` at midnight.

    Parameters
    ----------
    dt : object with ``year``, ``month`` and ``day`` attributes, or a null value
        Typically a ``pandas.Timestamp`` taken from a datetime column; may
        also be ``None``, NaN or ``NaT``.

    Returns
    -------
    datetime.datetime or None
        Midnight of the same calendar day (any time-of-day is dropped), or
        ``None`` when ``dt`` is null.
    """
    # pd.isnull is True for None, NaN and NaT (not a time), so the extra
    # NaTType comparison, which reached into a private pandas module and
    # could never be hit, is not needed.
    if pd.isnull(dt):
        return None

    # Construct the result directly.  Going through timestamp() and
    # fromtimestamp() interpreted the value in the local time zone and
    # converted it straight back, which adds nothing and can shift or reject
    # dates the platform cannot represent.
    return datetime(dt.year, dt.month, dt.day)

# Replace each record's last_review with a plain datetime, or None when missing,
# before the records are inserted into MongoDB.
# One pass is enough: convert_date_to_datetime already returns None for every
# null value, so a second loop that re-checks pd.isnull() would change nothing.
for record in dict_full:
    record['last_review'] = convert_date_to_datetime(record['last_review'])

In [37]:
# Spot-check the first listing record after the last_review conversion.
pprint.pp(dict_full[0])

{'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': datetime.datetime(2024, 1, 3, 0, 0),
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': ''}


In [38]:
# Build one dict per review row and time the conversion.
# NOTE(review): util.time_diff comes from the local util_2 module, which is not
# shown here; the message below treats its result as seconds. Confirm.
time1 = datetime.now()
dict_full_2 = df2.to_dict('records')
time2 = datetime.now()
print(f'Time to perform this operation was {util.time_diff(time1,time2)} seconds.')

Time to perform this operation was 4.219962 seconds.


### Load Data into MongoDB

In [39]:
# The following empties out newlistings and reviews; useful if making a fresh start
db.newlistings.drop()
db.reviews.drop()

# Number of listing and review records about to be inserted.
print(len(dict_full))
print(len(dict_full_2))

# Time both bulk inserts together.
time1 = datetime.now()
result = db.newlistings.insert_many(dict_full)
results2=db.reviews.insert_many(dict_full_2)
time2 = datetime.now()
print(f'\nTime to perform this operation was {util.time_diff(time1,time2)} seconds.')
# NOTE(review): this comment used to read "between about 2 and 4 minutes";
# the saved run below reports about 9.4 seconds.

# The collection counts should match the two lengths printed above.
print(f'\nNumber of docs in db.newlistings is {db.newlistings.count_documents({})}')
print(f'\nNumber of docs in db.reviews is {db.reviews.count_documents({})}')


print()
# Read back the last five inserted listings by _id to confirm they were stored.
outdocs = []
for o in result.inserted_ids[-5:]:
    outdocs.append(db.newlistings.find_one({ '_id': o}))
pprint.pp(outdocs)

# Same read-back check for the last five inserted reviews.
outdocs2 = []
for o in results2.inserted_ids[-5:]:
    outdocs2.append(db.reviews.find_one({ '_id': o}))
pprint.pp(outdocs2)

39202
986810

Time to perform this operation was 9.42273 seconds.

Number of docs in db.newlistings is 39202

Number of docs in db.reviews is 986810

[{'_id': ObjectId('6663d3abc6744f44122705cf'),
  'id': '795691344180160853',
  'name': 'Hotel in New York · ★4.33 · 1 bedroom · 1 bed · 1 private bath',
  'host_id': '484277630',
  'host_name': 'Bugra Han',
  'neighbourhood_group': 'Manhattan',
  'neighbourhood': 'Midtown',
  'latitude': 40.755077,
  'longitude': -73.98106,
  'room_type': 'Private room',
  'price': nan,
  'minimum_nights': 4,
  'number_of_reviews': 5,
  'last_review': datetime.datetime(2023, 12, 18, 0, 0),
  'reviews_per_month': 1.2,
  'calculated_host_listings_count': 33,
  'availability_365': 0,
  'number_of_reviews_ltm': 5,
  'license': 'Exempt'},
 {'_id': ObjectId('6663d3abc6744f44122705d0'),
  'id': '942191196511011206',
  'name': 'Home in Brooklyn · 5 bedrooms · 5 beds · 2 baths',
  'host_id': '517145594',
  'host_name': 'Ben',
  'neighbourhood_group': 'Brooklyn',
 

### Create Index

In [40]:
# Index reviews on listing_id, the field the $lookup stage matches against
# when listings are joined to their reviews.
db.reviews.create_index('listing_id')

'listing_id_1'

### Creating listings_with_reviews_m Collection

In [41]:
# Start from an empty target collection so the cell can be re-run.
db.listings_with_reviews_m.drop()

# Join each listing to its reviews and write the result to a new collection.
pipeline = [
    # Attach, as an array field named "reviews", every reviews document whose
    # listing_id equals this listing's id.
    {"$lookup":
       {
         "from": "reviews",
         "localField": "id",
         "foreignField": "listing_id",
         "as": "reviews"
       }}, 
    
    # Write the joined documents to listings_with_reviews_m.
    { "$out": "listings_with_reviews_m" }
    
]

# Time the aggregate() call only.
time1 = datetime.now()
test1 = db.newlistings.aggregate(pipeline)
time2 = datetime.now()
diff = util.time_diff(time1, time2)

print('\nTime it took was:', format(diff, '.4f'), '.')

# The target collection should now appear in the list.
print(db.list_collection_names())

print("test1:")

# NOTE(review): with $out as the final stage the results go to the collection,
# so the returned cursor is presumably empty; only its type is shown here.
# print(len(list(test1)))

print(type(test1))




Time it took was: 4.7962 .
['newlistings', 'listings', 'listings_with_reviews_m', 'listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'reviews']
test1:
<class 'pymongo.command_cursor.CommandCursor'>


In [50]:
# Sanity check: count the documents the pipeline wrote to the joined collection.
count = db.listings_with_reviews_m.count_documents({})
print(count)

39202


In [51]:
# Fetch one joined document to inspect its shape; `doc` is also the sample
# passed to convert_lwc_to_json in the JSON section.
doc = db.listings_with_reviews_m.find_one()
pprint.pp(doc)

{'_id': ObjectId('6663d3abc6744f4412266cb2'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': datetime.datetime(2024, 1, 3, 0, 0),
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': ObjectId('6663d3aec6744f44123602e2'),
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'date': datetime.datetime(2024, 1, 3, 0, 0),
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
                          'situation. The place

### Export to JSON File

In [52]:
def convert_lwc_to_json(doc):
    """Return a JSON-serialisable copy of one listings-with-reviews document.

    ``_id`` values are turned into strings, ``last_review`` and each review's
    ``date`` are formatted as ``YYYY-MM-DD`` text, and every value that
    ``pd.isna`` reports as missing becomes ``None``.  The input document is
    only read, never modified.

    Parameters
    ----------
    doc : dict
        A listing document carrying the listing fields named below plus a
        ``reviews`` list of review documents.

    Returns
    -------
    dict
        A new dict with the same keys, safe to pass to ``json.dump``.
    """
    def _or_none(value):
        # Any missing marker collapses to None.
        return None if pd.isna(value) else value

    def _day_text(value):
        # Dates are written as 'YYYY-MM-DD' text, or None when missing.
        return None if pd.isna(value) else value.strftime('%Y-%m-%d')

    # Field order below fixes the key order of the output document.
    fields_before_last_review = (
        'id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
        'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
        'minimum_nights', 'number_of_reviews',
    )
    fields_after_last_review = (
        'reviews_per_month', 'calculated_host_listings_count',
        'availability_365', 'number_of_reviews_ltm', 'license',
    )
    review_fields = ('listing_id', 'id', 'reviewer_id', 'reviewer_name', 'comments')

    converted = {'_id': str(doc['_id'])}
    for field in fields_before_last_review:
        converted[field] = _or_none(doc[field])
    converted['last_review'] = _day_text(doc['last_review'])
    for field in fields_after_last_review:
        converted[field] = _or_none(doc[field])

    converted['reviews'] = [
        {
            '_id': str(review['_id']),
            'date': _day_text(review['date']),
            **{field: _or_none(review[field]) for field in review_fields},
        }
        for review in doc['reviews']
    ]
    return converted

# Try the converter on the sample document fetched with find_one().
pprint.pp(convert_lwc_to_json(doc))

{'_id': '6663d3abc6744f4412266cb2',
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': '2024-01-03',
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': '6663d3aec6744f44123602e2',
              'date': '2024-01-03',
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
                          'situation. The place is very dirty, especially the '
                          'common

In [53]:
# Total number of joined documents, for comparison with the subset size below.
print(db.listings_with_reviews_m.count_documents({}))

# Select the listings whose id string starts with '1000'
# (the trailing '.*$' in the pattern does not narrow the match any further).
cursor = db.listings_with_reviews_m.find({'id' : {'$regex' : '^1000.*$'}})
    
# Materialise the cursor so the documents can be counted and then converted.
l = list(cursor)
print(len(l))

39202
43


In [54]:
# Convert every selected document to its JSON-safe form.
# A comprehension keeps the loop variable local, so the notebook-level `doc`
# fetched earlier with find_one() is not overwritten by the last element of `l`.
output = [convert_lwc_to_json(listing_doc) for listing_doc in l]

print(len(output))

43


In [55]:
# Write a JSON-serialisable object to a file inside a given directory.
# Also putting this function into my util.py
def write_dict_to_dir_json(dict, dir, filename):
    """Serialise ``dict`` as JSON into the file ``filename`` inside ``dir``.

    Parameters
    ----------
    dict : object
        Any value ``json.dump`` can serialise (a dict, or a list of dicts).
        The name shadows the built-in but is kept for existing callers.
    dir : str or os.PathLike
        Existing directory to write into.  The name shadows the built-in but
        is kept for existing callers.
    filename : str
        Name of the file to create or overwrite inside ``dir``.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing.
    TypeError
        If ``dict`` contains a value ``json.dump`` cannot serialise.
    """
    # os.path.join accepts path-like objects as well as strings and avoids a
    # doubled separator when `dir` already ends with one.
    target = os.path.join(dir, filename)
    # Fix the encoding so the file does not depend on the platform default.
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(dict, fp)

# Write the converted subset to a JSON file.
# NOTE(review): `dir` shadows the built-in of the same name for the rest of the
# notebook, and the path is specific to one machine.
dir = '/Users/Nfaith21/ECS 116'
filename = 'listings_with_reviews_m_subset_1000.json'
write_dict_to_dir_json(output, dir, filename)

