In [1]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np
import math

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('/Users/Nfaith21/ECS 116/')
# import util as util
import util_2 as util

In [2]:
from pymongo import MongoClient

client = MongoClient()

In [3]:
# I have (or will have) a database "airbnb"
db = client.airbnb


print('The list of all databases currently in the MongoDB client is:')
print(client.list_database_names())

print('\nThe list of all collections in the airbnb database is:')
print(db.list_collection_names())

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['newlistings', 'listings', 'listings_with_reviews_m', 'listings_with_reviews_and_cal', 'calendar', 'listings_with_calendar', 'reviews']


### Query 5 pos

In [6]:
superlative_words = [
    'astounding',
    'amazing',
    'awesome',
    'excellent',
    'exceptional',
    'extraordinary',
    'fantastic',
    'great',
    'magnificent',
    'splendid',
    'wonderful'
]

In [7]:
pos_regex = f"(?i)[\S\s]*{'|'.join(superlative_words)}[\S\s]*"

In [8]:
pos_condition = {
    'reviews.comments':
        {
            '$regex': pos_regex
        }
}

In [9]:
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(pos_condition)
time2 = datetime.now()
print(f"The time taken for the selection was {util.time_diff(time1,time2)} seconds.")
time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f"\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.")

The time taken for the selection was 0.000399 seconds.

The time taken to create the list was 85.399036 seconds.


In [10]:
print(len(l))

25196


### Query 5 neg

In [11]:
super_negative_words = [
    'aweful',
    'horrible',
    'terrible'
]

In [12]:
neg_regex = f"(?i)[\S\s]*{'|'.join(super_negative_words)}[\S\s]*"

In [13]:
neg_condition = {
    'reviews.comments':
        {
            '$regex': neg_regex
        }
}

In [14]:
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(neg_condition)
time2 = datetime.now()
print(f"The time taken for the selection was {util.time_diff(time1,time2)} seconds.")
time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f"\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.")

The time taken for the selection was 0.003196 seconds.

The time taken to create the list was 799.393482 seconds.


In [15]:
print(len(l))

1672


### Query 6 pos

In [16]:
pos_6_condition = {
    'reviews.comments':
        {
            '$regex': pos_regex
        },
    'dates_list':
        {
            '$elemMatch':
                {
                    'date':
                        {
                            '$gte': datetime(2025,2,1)
                        }
                }
        },
    'average_price':
        {
            '$lte': 200
        } 
}

In [17]:
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(pos_6_condition)
time2 = datetime.now()
print(f"The time taken for the selection was {util.time_diff(time1,time2)} seconds.")
time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f"\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.")

The time taken for the selection was 0.002243 seconds.

The time taken to create the list was 80.185633 seconds.


In [18]:
print(len(l))

19228


### Query 6 neg

In [19]:
neg_6_condition = {
    'reviews.comments':
        {
            '$regex': neg_regex
        },
    'dates_list':
        {
            '$elemMatch':
                {
                    'date':
                        {
                            '$gte': datetime(2025,2,1)
                        }
                }
        },
    'average_price':
        {
            '$lte': 200
        } 
}

In [20]:
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(neg_6_condition)
time2 = datetime.now()
print(f"The time taken for the selection was {util.time_diff(time1,time2)} seconds.")
time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f"\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.")

The time taken for the selection was 0.005397 seconds.

The time taken to create the list was 590.868299 seconds.


In [21]:
print(len(l))

1281


### Set up index

In [22]:
time1 = datetime.now()
index_name = db.listings_with_reviews_and_cal.create_index([('reviews.comments', 'text')])
time2 = datetime.now()
print(f"The time taken to create the index was {util.time_diff(time1,time2)} seconds.")

The time taken to create the index was 30.892075 seconds.


### Query 7 pos

In [23]:
pos_condition_ind = {
    '$text':
        {
            '$search': (' ').join(superlative_words)
        }
}

In [24]:
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(pos_condition_ind)
time2 = datetime.now()
print(f"The time taken for the selection was {util.time_diff(time1,time2)} seconds.")
time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f"\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.")

The time taken for the selection was 0.004541 seconds.

The time taken to create the list was 98.252213 seconds.


In [25]:
print(len(l))

25197


### Query 7 neg

In [26]:
neg_condition_ind = {
    '$text':
        {
            '$search': (' ').join(super_negative_words)
        }
}

In [27]:
time1 = datetime.now()
result = db.listings_with_reviews_and_cal.find(neg_condition_ind)
time2 = datetime.now()
print(f"The time taken for the selection was {util.time_diff(time1,time2)} seconds.")
time3 = datetime.now()
l = list(result)
time4 = datetime.now()
print(f"\nThe time taken to create the list was {util.time_diff(time3,time4)} seconds.")

The time taken for the selection was 0.003816 seconds.

The time taken to create the list was 71.607567 seconds.


In [28]:
print(len(l))

1930
