In [None]:
!pip install pandas
!pip install sodapy



In [None]:
import os
import time

import pandas as pd
from sodapy import Socrata

In [None]:
# Initialize the Socrata client for NYC Open Data.
# The app token is read from the SOCRATA_APP_TOKEN environment variable so that
# no credential is stored in the notebook. If the variable is unset, None is
# passed and requests are made without a token (stricter throttling applies).
client = Socrata("data.cityofnewyork.us", os.environ.get("SOCRATA_APP_TOKEN"), timeout=60)
dataset_id = "nc67-uf89"

# Query parameters shared by every page request:
#   select - columns to return
#   where  - row filter (tickets issued on or after 2022-01-01)
#   order  - explicit sort with :id as tie-breaker, so limit/offset paging is
#            stable and pages neither overlap nor skip rows
#   limit  - page size (rows per request)
params = {
    "select": "plate, state, issue_date, violation_time, violation, fine_amount, penalty_amount, interest_amount, reduction_amount, payment_amount, amount_due, county, violation_status",
    "where": "issue_date >= '2022-01-01T00:00:00'",
    "order": "issue_date, :id",
    "limit": 5000,
}

# Upper bound on the number of rows to request; pages are fetched at offsets
# 0, limit, 2*limit, ... below this value.
record_count = 111000

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tenacity import retry, stop_after_attempt, wait_fixed

In [None]:
@retry(stop=stop_after_attempt(5), wait=wait_fixed(5))
def fetch_chunk(offset):
    """Fetch one page of rows from the dataset, starting at ``offset``.

    The tenacity decorator re-runs the request when it raises: at most
    5 attempts in total, with a fixed 5-second wait between attempts.

    Args:
        offset: Zero-based row offset of the page to request.

    Returns:
        The value returned by ``client.get`` for this page; the code below
        iterates over it as a sequence of row records.
    """
    return client.get(dataset_id, offset=offset, **params)


# First pass: request every page and remember the offsets that still fail
# after fetch_chunk has used up its own retry attempts.
chunks_by_offset, failed_offsets = {}, []
for offset in range(0, record_count, params["limit"]):
    try:
        chunks_by_offset[offset] = fetch_chunk(offset)
    except Exception as exc:
        print(f"Chunk at offset {offset} failed ({exc!r}); queued for a second pass.")
        failed_offsets.append(offset)

# Second pass: give each failed page one more round of attempts. Pages that
# fail again are recorded and reported instead of being dropped silently.
unrecovered_offsets = []
for offset in failed_offsets:
    try:
        chunks_by_offset[offset] = fetch_chunk(offset)
    except Exception as exc:
        print(f"Chunk at offset {offset} failed again ({exc!r}); giving up on it.")
        unrecovered_offsets.append(offset)

# Flatten the pages in offset order so that pages recovered in the second pass
# land in their original position rather than at the end.
results = [row for offset in sorted(chunks_by_offset) for row in chunks_by_offset[offset]]

if unrecovered_offsets:
    print(
        f"WARNING: {len(unrecovered_offsets)} chunk(s) could not be fetched; "
        f"results are incomplete. Missing offsets: {unrecovered_offsets}"
    )

In [None]:
# Build a DataFrame from the fetched records (one row per record)
parking_df = pd.DataFrame.from_records(data=results)

In [None]:
# Preview the first five rows (head() returns 5 rows by default)
parking_df.head()

Unnamed: 0,plate,state,issue_date,violation_time,violation,fine_amount,penalty_amount,interest_amount,reduction_amount,payment_amount,amount_due,county,violation_status
0,AA96820,CT,2022-01-01T00:00:00.000,11:10A,NO STAND TAXI/FHV RELIEF STAND,115,0,0,0,115,0,NY,
1,GNV8795,NY,2022-01-01T00:00:00.000,11:42A,NO STAND TAXI/FHV RELIEF STAND,115,0,0,0,115,0,NY,
2,DFTE69,FL,2022-01-01T00:00:00.000,11:44A,NO STAND TAXI/FHV RELIEF STAND,115,0,0,0,115,0,NY,
3,T678645C,NY,2022-01-01T00:00:00.000,01:13A,PCKP DSCHRGE IN PRHBTD ZONE,115,0,0,115,0,0,NY,HEARING HELD-NOT GUILTY
4,67522MN,NY,2022-01-02T00:00:00.000,12:53P,NO STAND TAXI/FHV RELIEF STAND,115,10,0,10,115,0,NY,HEARING HELD-GUILTY REDUCTION


In [None]:
# Data cleaning

In [None]:
# Count the missing values in each column
parking_df.isna().sum()

Unnamed: 0,0
plate,0
state,0
issue_date,0
violation_time,0
violation,0
fine_amount,0
penalty_amount,0
interest_amount,0
reduction_amount,0
payment_amount,0


In [None]:
# Keep only rows with at most 5 missing values, then cap the result
# at the first 100,000 rows.
parking_df_clean = (
    parking_df
    .loc[lambda frame: frame.isna().sum(axis=1).le(5)]
    .head(100000)
)

In [None]:
# Show the cleaned frame as this cell's output
parking_df_clean

Unnamed: 0,plate,state,issue_date,violation_time,violation,fine_amount,penalty_amount,interest_amount,reduction_amount,payment_amount,amount_due,county,violation_status
0,AA96820,CT,2022-01-01T00:00:00.000,11:10A,NO STAND TAXI/FHV RELIEF STAND,115,0,0,0,115,0,NY,
1,GNV8795,NY,2022-01-01T00:00:00.000,11:42A,NO STAND TAXI/FHV RELIEF STAND,115,0,0,0,115,0,NY,
2,DFTE69,FL,2022-01-01T00:00:00.000,11:44A,NO STAND TAXI/FHV RELIEF STAND,115,0,0,0,115,0,NY,
3,T678645C,NY,2022-01-01T00:00:00.000,01:13A,PCKP DSCHRGE IN PRHBTD ZONE,115,0,0,115,0,0,NY,HEARING HELD-NOT GUILTY
4,67522MN,NY,2022-01-02T00:00:00.000,12:53P,NO STAND TAXI/FHV RELIEF STAND,115,10,0,10,115,0,NY,HEARING HELD-GUILTY REDUCTION
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,12212ME,NY,2023-03-10T00:00:00.000,11:39A,MOBILE BUS LANE VIOLATION,150,0,0,0,150,0,BK,HEARING HELD-GUILTY REDUCTION
99996,HSD9608,NY,2023-03-10T00:00:00.000,11:41A,MOBILE BUS LANE VIOLATION,50,0,0,0,50,0,,
99997,KSR1933,NY,2023-03-10T00:00:00.000,11:43A,MOBILE BUS LANE VIOLATION,50,0,0,0,50,0,,
99998,KTY3597,NY,2023-03-10T00:00:00.000,12:18P,MOBILE BUS LANE VIOLATION,100,0,0,0,100,0,,




In [None]:
# Write the cleaned data to a CSV file, leaving out the DataFrame index
parking_df_clean.to_csv(path_or_buf='parking_violations.csv', index=False)