In [1]:
#run the below pip if necessary
#!pip install pymongo
import os
from datetime import datetime
from getpass import getpass
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
import requests
from pymongo import *

# MongoDB setup
# The password must not live in the notebook: take it from the MONGO_PASSWORD
# environment variable, or prompt for it interactively when that is unset/empty.
PASSWORD = os.environ.get("MONGO_PASSWORD") or getpass("MongoDB password: ")
# Percent-encode the password so reserved characters such as '@', ':' or '/'
# cannot break the structure of the connection URI.
mongoUrl = f"mongodb+srv://mgrove:{quote_plus(PASSWORD)}@mg-nypd-response-times.nvvuu.mongodb.net/?retryWrites=true&w=majority&appName=mg-nypd-response-times"
client = MongoClient(mongoUrl)
# Handles used by every later cell.
db = client["nypd_data_v2"]
collection = db["nypd_collection"]

In [2]:
# Fetch one document from the collection
document = collection.find_one()

# A falsy result means no document came back.
print("Document found" if document else "No document found.")

def print_schema(doc):
    """Print every top-level field of ``doc`` with the type name of its value.

    A falsy ``doc`` (``None`` or an empty mapping) only produces the
    "Document is empty." notice. Nothing is returned.
    """
    if doc:
        print("Schema of the document:")
        for field_name, field_value in doc.items():
            type_name = type(field_value).__name__
            print(f"{field_name}: {type_name}")
    else:
        print("Document is empty.")

# Show each field name and the Python type of its value for the document fetched above
print_schema(document)

Document found
Schema of the document:
_id: ObjectId
cad_evnt_id: str
create_date: str
incident_date: str
incident_time: str
nypd_pct_cd: str
boro_nm: str
patrl_boro_nm: str
geo_cd_x: str
geo_cd_y: str
radio_code: str
typ_desc: str
cip_jobs: str
add_ts: str
disp_ts: str
closng_ts: str
latitude: str
longitude: str
arrivd_ts: float
incident_year: int
incident_month: int


In [3]:
def sample_monthly_data(collection, sampleSize = 100):
    """Draw a random sample of documents from every (year, month) bucket.

    The collection is first grouped by ``incident_year`` / ``incident_month``
    to discover which months exist. Then, month by month in chronological
    order, a ``$sample`` stage of size ``sampleSize`` is run against the
    documents of that month.

    Args:
        collection: Object exposing ``aggregate(pipeline)`` that yields
            documents (e.g. a pymongo collection).
        sampleSize: Value passed as the ``size`` of the ``$sample`` stage for
            each month.

    Returns:
        pandas.DataFrame with one row per sampled document, months
        concatenated in ascending (year, month) order. Empty when the
        collection yields no months.
    """
    # Enumerate the distinct (year, month) buckets, oldest first.
    months_pipeline = [
        {
            "$group": {
                "_id": {
                    "year": "$incident_year",
                    "month": "$incident_month",
                },
                "count": {"$sum": 1},
            }
        },
        {"$sort": {"_id.year": 1, "_id.month": 1}},
    ]

    distinct_months = list(collection.aggregate(months_pipeline))
    print(distinct_months)
    sampled_data = []

    # Sample rows from each month
    for month_info in distinct_months:
        year = month_info["_id"]["year"]
        month = month_info["_id"]["month"]
        print(f"getting {month}/{year}")
        # Plain query-form equality replaces the former $expr/$and/$eq wrapper:
        # it selects the same month and is the idiomatic form for simple
        # field-equals-value filters.
        sample_pipeline = [
            {"$match": {"incident_year": year, "incident_month": month}},
            {"$sample": {"size": sampleSize}},
        ]

        # Consume the cursor directly; no intermediate list is needed.
        sampled_data.extend(collection.aggregate(sample_pipeline))

    return pd.DataFrame(sampled_data)

# Draw SAMPLE_SIZE documents per month, then preview the first rows
SAMPLE_SIZE = 1000
df_sampled = sample_monthly_data(collection, sampleSize=SAMPLE_SIZE)
print(df_sampled.head(n=5))

[{'_id': {'year': 2022, 'month': 7}, 'count': 65072}, {'_id': {'year': 2022, 'month': 8}, 'count': 613921}, {'_id': {'year': 2022, 'month': 9}, 'count': 598342}, {'_id': {'year': 2022, 'month': 10}, 'count': 658305}, {'_id': {'year': 2022, 'month': 11}, 'count': 640037}, {'_id': {'year': 2022, 'month': 12}, 'count': 614202}, {'_id': {'year': 2023, 'month': 1}, 'count': 646307}, {'_id': {'year': 2023, 'month': 2}, 'count': 580166}, {'_id': {'year': 2023, 'month': 3}, 'count': 637324}, {'_id': {'year': 2023, 'month': 4}, 'count': 576113}, {'_id': {'year': 2023, 'month': 5}, 'count': 620441}, {'_id': {'year': 2023, 'month': 6}, 'count': 577135}, {'_id': {'year': 2023, 'month': 7}, 'count': 575313}, {'_id': {'year': 2023, 'month': 8}, 'count': 559154}, {'_id': {'year': 2023, 'month': 9}, 'count': 552160}, {'_id': {'year': 2023, 'month': 10}, 'count': 598906}, {'_id': {'year': 2023, 'month': 11}, 'count': 560745}, {'_id': {'year': 2023, 'month': 12}, 'count': 566365}, {'_id': {'year': 2024,

In [4]:
# DataFrame.count() yields per-column non-null counts, not a row total, so the
# row count is taken from len() to match the label.
print("rows sampled: ", len(df_sampled))
# Per-column non-null counts, kept because they show which fields are sparsely populated.
print(df_sampled.count())

rows sampled:  _id               24000
cad_evnt_id       24000
create_date       24000
incident_date     24000
incident_time     24000
nypd_pct_cd       24000
boro_nm           24000
patrl_boro_nm     24000
geo_cd_x          24000
geo_cd_y          24000
radio_code        24000
typ_desc          24000
cip_jobs          24000
add_ts            24000
disp_ts           24000
closng_ts         24000
latitude          24000
longitude         24000
location          18000
incident_year     24000
incident_month    24000
arrivd_ts         14254
dtype: int64


In [5]:
# Persist the sample; to_csv does not create missing directories, so make sure
# the target folder exists first.
sampled_csv_path = Path("data") / "sampled_data.csv"
sampled_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_sampled.to_csv(sampled_csv_path, index=False)