In [2]:
import requests
import csv
import time
import json
from datetime import datetime, timedelta
import os
from pytz import UTC  # Import UTC timezone object

# Function to get the last date in the existing CSV file
def get_last_date_in_csv(file_path):
    try:
        with open(file_path, 'r', newline='', encoding='utf-8') as f:
            csv_reader = csv.reader(f)
            last_row = None
            for row in csv_reader:
                last_row = row
            if last_row:
                return datetime.fromisoformat(last_row[5].replace("Z", "+00:00"))
    except FileNotFoundError:
        return None

#function to get tomorrow's date in UTC
def get_tomorrows_date():
    tomorrow = datetime.now() + timedelta(days=1)
    return datetime(tomorrow.year, tomorrow.month, tomorrow.day, tzinfo=UTC)

# Initialize CSV file
csv_file_path = 'boston_311_data_predict.csv'
start_date = get_last_date_in_csv(csv_file_path) or datetime(2023, 8, 26, tzinfo=UTC)
end_date = get_tomorrows_date()
delta = timedelta(days=1)  # 12 hours
minutedelta = timedelta(minutes=1)  # 1 minute

csv_file = open(csv_file_path, 'a', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)

# Write header only if the file is new
if start_date == datetime(2023, 8, 26, tzinfo=UTC):
    csv_writer.writerow(['service_request_id', 'status', 'service_name', 'service_code', 'description', 'requested_datetime', 'updated_datetime', 'address', 'lat', 'long', 'token'])  # Header

while start_date <= end_date:
    formatted_start_date = start_date.strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
    formatted_end_date = (start_date + delta).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'

    url = f"https://311.boston.gov/open311/v2/requests.json?start_date={formatted_start_date}&end_date={formatted_end_date}"

    response = requests.get(url)
    print(f"Fetching data for {formatted_start_date} to {formatted_end_date}")
    #print number of requests in response
    print(f"Number of requests: {len(response.json())}")

    #print the wc -l of the file
    print(f"Number of lines in file: {os.system('wc -l boston_311_data_predict.csv')}")

    if response.status_code == 200:
        try:
            data = json.loads(response.text)
            for record in data:
                csv_writer.writerow([record.get('service_request_id'), record.get('status'), record.get('service_name'), record.get('service_code'), record.get('description'), record.get('requested_datetime'), record.get('updated_datetime'), record.get('address'), record.get('lat'), record.get('long'), record.get('token')])
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for {formatted_start_date} to {formatted_end_date}")
            continue  # Skip to the next iteration
    else:
        print(f"Failed to fetch data for {formatted_start_date} to {formatted_end_date}")

    # Move to next half-day
    #start_date += delta

    #set start_date to the last date in this response if it's greater than the current start_date else add delta. Use multiple lines
    if len(response.json()) > 0:
        last_date = datetime.fromisoformat(response.json()[-1].get('requested_datetime').replace("Z", "+00:00"))
        if last_date > start_date:
            start_date = last_date
        else:
            start_date += minutedelta


    # Rate limiting: Sleep for 6 seconds to stay within 10 requests per minute limit
    time.sleep(6)

# Close CSV file
csv_file.close()

Fetching data for 2023-08-26T00:00:00Z to 2023-08-27T00:00:00Z
Number of requests: 50
0 boston_311_data_predict.csv
Number of lines in file: 0
Fetching data for 2023-08-26T01:39:00Z to 2023-08-27T01:39:00Z
Number of requests: 50
41 boston_311_data_predict.csv
Number of lines in file: 0
Fetching data for 2023-08-26T06:15:00Z to 2023-08-27T06:15:00Z
Number of requests: 50
84 boston_311_data_predict.csv
Number of lines in file: 0
Fetching data for 2023-08-26T10:26:00Z to 2023-08-27T10:26:00Z
Number of requests: 50
157 boston_311_data_predict.csv
Number of lines in file: 0
Fetching data for 2023-08-26T11:24:00Z to 2023-08-27T11:24:00Z
Number of requests: 50
201 boston_311_data_predict.csv
Number of lines in file: 0
Fetching data for 2023-08-26T11:59:00Z to 2023-08-27T11:59:00Z
Number of requests: 50
250 boston_311_data_predict.csv
Number of lines in file: 0
Fetching data for 2023-08-26T12:25:00Z to 2023-08-27T12:25:00Z
Number of requests: 50
300 boston_311_data_predict.csv
Number of lines 

KeyboardInterrupt: 

In [None]:
import pandas as pd
import requests
import csv
import time
import json
import os

# Load your CSVs into pandas DataFrames
df1 = pd.read_csv("file_with_case_enquiry_id.csv")
df2 = pd.read_csv("file_with_service_request_id.csv")

# Identify missing service_request_ids
missing_ids = set(df1['case_enquiry_id']) - set(df2['service_request_id'])

# Initialize CSV file
csv_file_path = 'file_with_service_request_id.csv'
csv_file = open(csv_file_path, 'a', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)

# Rate limit delay
rate_limit_delay = 6  # 6 seconds to stay within 10 requests per minute

for service_request_id in missing_ids:
    url = f"https://311.boston.gov/open311/v2/requests.json?service_request_id={service_request_id}"
    response = requests.get(url)
    
    print(f"Fetching data for service_request_id {service_request_id}")

    if response.status_code == 200:
        try:
            data = json.loads(response.text)
            for record in data:
                csv_writer.writerow([
                    record.get('service_request_id'),
                    record.get('status'),
                    record.get('service_name'),
                    record.get('service_code'),
                    record.get('description'),
                    record.get('requested_datetime'),
                    record.get('updated_datetime'),
                    record.get('address'),
                    record.get('lat'),
                    record.get('long'),
                    record.get('token')
                ])
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for service_request_id {service_request_id}")
    else:
        print(f"Failed to fetch data for service_request_id {service_request_id}")

    # Rate limiting
    time.sleep(rate_limit_delay)

# Close CSV file
csv_file.close()


In [None]:
#count the lines in the csv file
num_lines = sum(1 for line in open('boston_311_data_predict.csv'))

#print the count
print("Number of lines in the CSV file:")
print(num_lines)

Number of lines in the CSV file:
5348


In [None]:
#load the csv file into a dataframe and count the unique service codes
import pandas as pd
df = pd.read_csv('boston_311_data_2022.csv')
print("Number of unique values in the first column:")



Number of unique values in the first column:
<bound method Series.unique of 0         101004113295
1         101004113630
2         101004113228
3         101004113229
4         101004113230
              ...     
305353    101004397135
305354    101004397479
305355    101004396963
305356    101004396409
305357    101004396280
Name: 101004113559, Length: 305358, dtype: int64>
