In [1]:
import pandas as pd
import json
import random
from tqdm import tqdm
tqdm.pandas(desc="Extracting Neighborhoods")

In [10]:
community_areas_path = 'drive/MyDrive/CS418_DATA/community_areas.json'
crime_dataset_path = 'drive/MyDrive/CS418_DATA/crime_dataset.csv'
divvy_dataset_path = 'drive/MyDrive/CS418_DATA/divvy_dataset.csv'
event_dataset_path = 'drive/MyDrive/CS418_DATA/event_dataset.csv'
train_dataset_path = 'drive/MyDrive/CS418_DATA/cta_train_stations.csv'
clean_crime_dataset_path = 'drive/MyDrive/CS418_CLEAN_DATA/crime_dataset.csv'
clean_divvy_dataset_path = 'drive/MyDrive/CS418_CLEAN_DATA/divvy_dataset.csv'
clean_event_dataset_path = 'drive/MyDrive/CS418_CLEAN_DATA/event_dataset.csv'

In [3]:
def point_inside_polygon(x, y, poly):
    n = len(poly)
    inside = False
    p1x, p1y = poly[0]
    for i in range(n + 1):
        p2x, p2y = poly[i % n]
        if y > min(p1y, p2y):
            if y <= max(p1y, p2y):
                if x <= max(p1x, p2x):
                    if p1y != p2y:
                        xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
                    if p1x == p2x or x <= xinters:
                        inside = not inside
        p1x, p1y = p2x, p2y
    return inside

def get_neighborhood(lat, lon, neighborhoods):
    for name, coords in neighborhoods.items():
        if point_inside_polygon(lon, lat, coords):  # lon=x, lat=y
            return name
    return None


with open(community_areas_path) as f:
    community_areas = json.load(f)

# Crime Data

In [4]:
crime_data = pd.read_csv(crime_dataset_path)

  crime_data = pd.read_csv(crime_dataset_path)


In [5]:
crime_data['Date'].min(), crime_data['Date'].max()

('01/01/2001 01:00:00 AM', '12/31/2024 12:58:00 PM')

In [6]:
# Filter rows by their date to only contain crimes after 2012
crime_data['Date'] = pd.to_datetime(crime_data['Date'], format='%m/%d/%Y %I:%M:%S %p')
crime_data = crime_data[crime_data['Date'] >= '2014-01-01']

In [7]:
# drop entries whose location is NaN
crime_data = crime_data.dropna(subset=['Location', 'Latitude', 'Longitude'])
crime_data = crime_data.reset_index(drop=True)

In [8]:
crime_data.columns = crime_data.columns.str.lower()
crime_data = crime_data.map(lambda x: x.lower() if isinstance(x, str) else x)

In [9]:
# drop uninteresting cols and sample
cols_2_drop = ['id', 'case number', 'iucr', 'ward', 'updated on']
crime_data = crime_data.drop(columns=cols_2_drop)

In [None]:
crime_data.to_csv(clean_crime_dataset_path)

# Divvy Data

In [13]:
divvy_data = pd.read_csv(divvy_dataset_path)

In [14]:
# convert start and stop time to standard AM, PM format
divvy_data['starttime'] = pd.to_datetime(divvy_data['starttime'])
divvy_data['starttime'] = divvy_data['starttime'].dt.strftime('%m/%d/%Y %I:%M:%S %p')

divvy_data['stoptime'] = pd.to_datetime(divvy_data['stoptime'])
divvy_data['stoptime'] = divvy_data['stoptime'].dt.strftime('%m/%d/%Y %I:%M:%S %p')

In [15]:
divvy_data['starttime'].min(), divvy_data['starttime'].max()

('01/01/2014 01:06:00 PM', '12/31/2017 12:59:00 PM')

In [16]:
# only keep those samples whose start time is less than stop time
divvy_data = divvy_data[divvy_data['starttime'] < divvy_data['stoptime']].reset_index(drop=True)

In [17]:
divvy_data.columns

Index(['trip_id', 'year', 'month', 'week', 'day', 'hour', 'usertype', 'gender',
       'starttime', 'stoptime', 'tripduration', 'temperature', 'events',
       'from_station_id', 'from_station_name', 'latitude_start',
       'longitude_start', 'dpcapacity_start', 'to_station_id',
       'to_station_name', 'latitude_end', 'longitude_end', 'dpcapacity_end'],
      dtype='object')

In [18]:
# drop uninteresting cols and sample
cols_2_drop = ['usertype', 'trip_id', 'year', 'month', 'week', 'day', 'hour', 'gender']
divvy_data = divvy_data.drop(columns=cols_2_drop)

In [19]:
# sample 20000 of data
sampled_divvy_data = divvy_data.sample(n=20000, random_state=1000)
sampled_divvy_data = sampled_divvy_data.reset_index(drop=True)

In [None]:
sampled_divvy_data.to_csv(clean_divvy_dataset_path)

# Event Data

In [20]:
event_data = pd.read_csv(event_dataset_path)

In [21]:
event_data.columns = event_data.columns.str.lower()
event_data = event_data.map(lambda x: x.lower() if isinstance(x, str) else x)

In [22]:
cols_2_drop = ['organization', 'year', 'month', 'requestor']
event_data = event_data.drop(columns=cols_2_drop)

In [23]:
event_data = event_data[event_data['permit status'].isin(['approved', 'issued', 'completed'])]
event_data = event_data[~event_data['neighborhood'].isnull()]
event_data = event_data.reset_index(drop=True)

In [24]:
event_data['reservation start date'] = pd.to_datetime(event_data['reservation start date'], errors='coerce')
event_data['reservation end date'] = pd.to_datetime(event_data['reservation end date'], errors='coerce')

event_data['reservation start date'] = event_data['reservation start date'].dt.strftime('%m/%d/%Y')
event_data['reservation end date'] = event_data['reservation end date'].dt.strftime('%m/%d/%Y')

In [None]:
event_data.to_csv(clean_event_dataset_path)

# Extract Neighborhood for Crime and Divvy Datasets

In [None]:
crime_data.loc['neighborhood'] = crime_data.progress_apply(
    lambda row: get_neighborhood(row['latitude'], row['longitude'], community_areas),
    axis=1
)

In [None]:
divvy_data['neighborhood_start'] = divvy_data.progress_apply(
    lambda row: get_neighborhood(row['latitude_start'], row['longitude_start'], community_areas),
    axis=1
)

divvy_data['neighborhood_end'] = divvy_data.progress_apply(
    lambda row: get_neighborhood(row['latitude_end'], row['longitude_end'], community_areas),
    axis=1
)

In [None]:
crime_data = crime_data[~crime_data['neighborhood'].isnull()]
divvy_data = divvy_data[~divvy_data['neighborhood_start'].isnull()]
divvy_data = divvy_data[~divvy_data['neighborhood_end'].isnull()]

In [None]:
crime_data = crime_data.map(lambda x: x.lower() if isinstance(x, str) else x)
divvy_data = divvy_data.map(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
crime_data.to_csv('drive/MyDrive/CS418_DATA/sampled_crime_dataset.csv')
divvy_data.to_csv('drive/MyDrive/CS418_DATA/sampled_divvy_dataset.csv')