In [1]:
import requests
import csv
import time
import json
from datetime import datetime, timedelta
import os
from pytz import UTC  # Import UTC timezone object

# Function to get the last date in the existing CSV file
def get_last_date_in_csv(file_path):
    """Return the timestamp of the last parseable data row in a CSV file.

    The sixth column (index 5) of each row is read as an ISO-8601 timestamp;
    a trailing ``Z`` is rewritten to ``+00:00`` so ``datetime.fromisoformat``
    accepts it on every supported Python version.

    Rows that cannot supply a timestamp are skipped instead of raising:
    the header row, rows with fewer than six columns (e.g. a row truncated
    by an interrupted write) and rows whose sixth column is not a valid
    ISO-8601 string.

    Args:
        file_path: Path of the CSV file to inspect.

    Returns:
        The ``datetime`` parsed from the last row that has a valid timestamp,
        or ``None`` when the file does not exist or holds no such row.
    """
    try:
        with open(file_path, 'r', newline='', encoding='utf-8') as f:
            last_date = None
            for row in csv.reader(f):
                if len(row) <= 5:
                    # Too short to carry the timestamp column.
                    continue
                try:
                    last_date = datetime.fromisoformat(row[5].replace("Z", "+00:00"))
                except ValueError:
                    # Header text or a malformed value: keep the previous result.
                    continue
            return last_date
    except FileNotFoundError:
        return None

# Download Boston 311 service requests for 2022 into a CSV file.
#
# The date range is walked with a moving cursor (`start_date`): each request
# asks for one window starting at the cursor, the returned records are
# appended to the CSV, and the cursor jumps to the timestamp of the last
# record received. Re-running the cell resumes from the last row on disk.

# Initialize CSV file
csv_file_path = 'boston_311_data_2022.csv'
csv_columns = ['service_request_id', 'status', 'service_name', 'service_code', 'description', 'requested_datetime', 'updated_datetime', 'address', 'lat', 'long', 'token']
end_date = datetime(2023, 1, 1, tzinfo=UTC)
delta = timedelta(days=1)  # Width of each query window: one day
minutedelta = timedelta(minutes=1)  # Fallback step when a page makes no progress
request_timeout = 30  # Seconds to wait for the server before treating the call as failed
max_consecutive_failures = 5  # Stop instead of retrying the same window forever


def _parse_requested_datetime(value):
    """Parse an ISO-8601 string into a timezone-aware datetime, or return None."""
    if not value:
        return None
    try:
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except (AttributeError, ValueError):
        return None
    if parsed.tzinfo is None:
        # NOTE(review): assumes timestamps without an offset are UTC - confirm against the API.
        parsed = parsed.replace(tzinfo=UTC)
    return parsed


# The header is needed only when the file does not exist yet or is still empty.
needs_header = not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0

# Resume from the last row on disk; fall back to the start of the year when
# there is nothing usable to resume from.
try:
    resume_date = get_last_date_in_csv(csv_file_path)
except (ValueError, IndexError):
    print(f"Could not read a resume date from {csv_file_path}; starting from the beginning")
    resume_date = None
if resume_date is not None and resume_date.tzinfo is None:
    # NOTE(review): assumes timestamps without an offset are UTC - confirm against the API.
    resume_date = resume_date.replace(tzinfo=UTC)
start_date = resume_date or datetime(2022, 1, 1, tzinfo=UTC)

# Every query starts at the timestamp of the last record already received, so
# boundary records presumably come back again on the next page. Remember the
# ids already on disk so they are not written twice.
seen_ids = set()
if not needs_header:
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as existing_file:
        for existing_row in csv.reader(existing_file):
            if existing_row and existing_row[0]:
                seen_ids.add(existing_row[0])

consecutive_failures = 0

# The context manager closes the file even when the loop raises.
with open(csv_file_path, 'a', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write header only if the file is new
    if needs_header:
        csv_writer.writerow(csv_columns)

    while start_date < end_date:
        # Never query past the end of the requested range.
        window_end = min(start_date + delta, end_date)

        # The cursor may carry a non-UTC offset copied from the API response;
        # convert before appending the literal 'Z'.
        formatted_start_date = start_date.astimezone(UTC).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
        formatted_end_date = window_end.astimezone(UTC).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'

        url = f"https://311.boston.gov/open311/v2/requests.json?start_date={formatted_start_date}&end_date={formatted_end_date}"
        print(f"Fetching data for {formatted_start_date} to {formatted_end_date}")

        # `data` stays None on any failure, which triggers a retry of the same window.
        data = None
        try:
            response = requests.get(url, timeout=request_timeout)
            if response.status_code == 200:
                data = json.loads(response.text)
                if not isinstance(data, list):
                    # An error payload rather than a list of records.
                    data = None
                    print(f"Failed to fetch data for {formatted_start_date} to {formatted_end_date}")
            else:
                print(f"Failed to fetch data for {formatted_start_date} to {formatted_end_date}")
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for {formatted_start_date} to {formatted_end_date}")
        except requests.RequestException as exc:
            print(f"Failed to fetch data for {formatted_start_date} to {formatted_end_date}: {exc}")

        if data is None:
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                print(f"Giving up after {consecutive_failures} consecutive failures; re-run to resume")
                break
        else:
            consecutive_failures = 0
            print(f"Number of requests: {len(data)}")

            for record in data:
                request_id = record.get('service_request_id')
                if request_id is not None:
                    id_key = str(request_id)
                    if id_key in seen_ids:
                        continue  # Already written by an earlier page or run
                    seen_ids.add(id_key)
                csv_writer.writerow([record.get(column) for column in csv_columns])
            # Push the page to disk so an interrupted run can resume from it.
            csv_file.flush()

            if not data:
                # Nothing in this window: skip straight to its end.
                start_date = window_end
            else:
                last_date = _parse_requested_datetime(data[-1].get('requested_datetime'))
                if last_date is not None and last_date > start_date:
                    start_date = last_date
                else:
                    # No progress from this page; nudge the cursor to avoid looping forever.
                    start_date += minutedelta

        # Rate limiting: Sleep for 6 seconds to stay within 10 requests per minute limit
        time.sleep(6)

Fetching data for 2022-01-01T00:00:00Z to 2022-01-02T00:00:00Z
Number of requests: 50
Fetching data for 2022-01-01T03:32:00Z to 2022-01-02T03:32:00Z
Number of requests: 50
Fetching data for 2022-01-02T03:24:00Z to 2022-01-03T03:24:00Z
Number of requests: 50
Fetching data for 2022-01-02T12:16:00Z to 2022-01-03T12:16:00Z
Number of requests: 50
Fetching data for 2022-01-02T14:33:00Z to 2022-01-03T14:33:00Z
Number of requests: 50
Fetching data for 2022-01-02T15:50:00Z to 2022-01-03T15:50:00Z
Number of requests: 50
Fetching data for 2022-01-02T17:22:00Z to 2022-01-03T17:22:00Z
Number of requests: 50
Fetching data for 2022-01-02T18:28:00Z to 2022-01-03T18:28:00Z
Number of requests: 50
Fetching data for 2022-01-02T19:11:00Z to 2022-01-03T19:11:00Z
Number of requests: 50
Fetching data for 2022-01-02T20:33:00Z to 2022-01-03T20:33:00Z
Number of requests: 50
Fetching data for 2022-01-02T21:59:00Z to 2022-01-03T21:59:00Z
Number of requests: 50
Fetching data for 2022-01-02T23:56:00Z to 2022-01-03T2

In [3]:
# Count the lines in the CSV file.
# The file is opened in a `with` block so the handle is closed afterwards, and
# read as UTF-8 to match the encoding it was written with.
# Note: this counts physical lines, not CSV records - a quoted field that
# contains a line break spans more than one line.
with open('boston_311_data_2022.csv', encoding='utf-8') as csv_lines:
    num_lines = sum(1 for _ in csv_lines)

# Print the count
print("Number of lines in the CSV file:")
print(num_lines)

Number of lines in the CSV file:
19044
