In [None]:
# Mount Google Drive at /content/drive so files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Read the two raw CSV files from the mounted Drive folder.
divvy = pd.read_csv('/content/drive/MyDrive/divvy_data.csv')
crime = pd.read_csv('/content/drive/MyDrive/crime_data.csv')

# Preview the first rows of each frame; header and preview are emitted by a
# single print call, separated by a newline.
print("✅ Divvy dataset loaded successfully:", divvy.head(), sep="\n")
print("\n✅ Crime dataset loaded successfully:", crime.head(), sep="\n")




✅ Divvy dataset loaded successfully:
   trip_id  year  month  week  day  hour    usertype  gender  \
0  2355134  2014      6    27    0    23  Subscriber    Male   
1  2355133  2014      6    27    0    23  Subscriber    Male   
2  2355130  2014      6    27    0    23  Subscriber    Male   
3  2355129  2014      6    27    0    23  Subscriber  Female   
4  2355128  2014      6    27    0    23  Subscriber  Female   

             starttime             stoptime  ...  from_station_id  \
0  2014-06-30 23:57:00  2014-07-01 00:07:00  ...              131   
1  2014-06-30 23:56:00  2014-07-01 00:00:00  ...              282   
2  2014-06-30 23:33:00  2014-06-30 23:35:00  ...              327   
3  2014-06-30 23:26:00  2014-07-01 00:24:00  ...              134   
4  2014-06-30 23:16:00  2014-06-30 23:26:00  ...              320   

             from_station_name latitude_start  longitude_start  \
0    Lincoln Ave & Belmont Ave      41.939365       -87.668385   
1      Halsted St & Maxwell St 

In [None]:
import json

# Load the neighborhood boundary file: a JSON object keyed by neighborhood name.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform's locale default used by open().
with open('/content/drive/MyDrive/community_areas.json', encoding='utf-8') as f:
    community_areas = json.load(f)

print("\n✅ JSON Data Keys:")
print(community_areas.keys())



✅ JSON Data Keys:
dict_keys(['Grand Boulevard', 'Printers Row', 'United Center', 'Sheffield & DePaul', 'Humboldt Park', 'Garfield Park', 'North Lawndale', 'Little Village', 'Armour Square', 'Avalon Park', 'Burnside', 'Calumet Heights', 'Pullman', 'Riverdale', 'Hegewisch', 'Douglas', 'Oakland', 'Fuller Park', 'Woodlawn', 'Portage Park', 'Hermosa', 'Avondale', 'Logan Square', 'Little Italy, UIC', 'Kenwood', 'Rogers Park', 'Jefferson Park', 'Sauganash,Forest Glen', 'North Park', 'Albany Park', 'Irving Park', 'Dunning', 'West Ridge', 'Uptown', 'Norwood Park', 'Streeterville', 'South Shore', 'Chatham', 'South Chicago', 'Roseland', 'North Center', 'South Deering', 'East Side', 'West Pullman', 'Garfield Ridge', 'New City', 'Englewood', 'Grand Crossing', 'Ashburn', 'Mount Greenwood', 'Morgan Park', "O'Hare", 'Jackson Park', 'Loop', 'Greektown', 'Museum Campus', 'Edgewater', 'Lake View', 'Lincoln Park', 'Magnificent Mile', 'Lincoln Square', 'Washington Park', 'Millenium Park', 'Near South Side

In [None]:
!pip install pyshp
!pip install pandas




In [None]:
def point_inside_polygon(x, y, poly):
    """Return True if the point (x, y) lies inside the polygon ``poly``.

    Ray-casting test: walks every edge of the polygon (including the implicit
    closing edge from the last vertex back to the first) and toggles the
    result each time the horizontal line through ``y`` crosses an edge at or
    to the right of ``x``. An odd number of crossings means "inside".

    Args:
        x: Horizontal coordinate of the point.
        y: Vertical coordinate of the point.
        poly: Sequence of (x, y) vertex pairs.

    Returns:
        bool: True when the point is inside, otherwise False. An empty
        polygon contains no points, so it yields False instead of raising
        IndexError.
    """
    n = len(poly)
    if n == 0:
        return False

    inside = False
    p1x, p1y = poly[0]
    for i in range(n + 1):
        # i % n wraps the final iteration back to the first vertex, closing the ring.
        p2x, p2y = poly[i % n]
        # Only edges whose y-span (min, max] contains y, and that are not
        # entirely to the left of the point, can be crossed by the ray.
        if min(p1y, p2y) < y <= max(p1y, p2y) and x <= max(p1x, p2x):
            if p1y != p2y:
                # x-coordinate where the edge meets the horizontal line at y.
                xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
            if p1x == p2x or x <= xinters:
                inside = not inside
        p1x, p1y = p2x, p2y
    return inside

def get_neighborhood(lat, lon, neighborhoods):
    """Return the name of the first neighborhood whose polygon contains the point.

    Args:
        lat: Latitude of the point (used as the polygon test's y value).
        lon: Longitude of the point (used as the polygon test's x value).
        neighborhoods: Mapping of neighborhood name to its polygon vertices.

    Returns:
        The first matching name in the mapping's iteration order, or None
        when no polygon contains the point.
    """
    # Lazy generator: polygons are tested one at a time and evaluation stops
    # at the first hit.
    matching_names = (
        area_name
        for area_name, boundary in neighborhoods.items()
        if point_inside_polygon(lon, lat, boundary)  # lon=x, lat=y
    )
    return next(matching_names, None)


In [None]:
# Tag every crime record with the neighborhood whose polygon contains its
# coordinates (None when no polygon matches). Zipping the two coordinate
# columns avoids the per-row Series that DataFrame.apply(axis=1) constructs
# and also behaves sensibly on an empty frame.
crime['neighborhood'] = [
    get_neighborhood(lat, lon, community_areas)
    for lat, lon in zip(crime['Latitude'], crime['Longitude'])
]

print("✅ Crime dataset with neighborhoods added:")
print(crime[['Latitude', 'Longitude', 'neighborhood']].head())

✅ Crime dataset with neighborhoods added:
    Latitude  Longitude    neighborhood
0  41.775402 -87.653178       Englewood
1  41.880829 -87.752634          Austin
2  41.750582 -87.647984  Auburn Gresham
3  41.751657 -87.650131  Auburn Gresham
4  41.915575 -87.707472    Logan Square


In [None]:
# Show the column names of the crime frame.
print(crime.columns)


Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'neighborhood'],
      dtype='object')


In [None]:
# Show the column names of the Divvy frame.
print(divvy.columns)

Index(['trip_id', 'year', 'month', 'week', 'day', 'hour', 'usertype', 'gender',
       'starttime', 'stoptime', 'tripduration', 'temperature', 'events',
       'from_station_id', 'from_station_name', 'latitude_start',
       'longitude_start', 'dpcapacity_start', 'to_station_id',
       'to_station_name', 'latitude_end', 'longitude_end', 'dpcapacity_end'],
      dtype='object')


In [None]:
divvy_sample = divvy.sample(700, random_state=42)  # 700 random rows; fixed seed keeps the sample reproducible


In [None]:
# Tag each sampled trip with the neighborhoods of its start and end stations.
# Both endpoints use the same lookup, so one loop replaces two copy-pasted
# blocks. Zipping the coordinate columns avoids the per-row Series that
# DataFrame.apply(axis=1) constructs.
for endpoint in ('start', 'end'):
    divvy_sample[f'{endpoint}_neighborhood'] = [
        get_neighborhood(lat, lon, community_areas)
        for lat, lon in zip(divvy_sample[f'latitude_{endpoint}'],
                            divvy_sample[f'longitude_{endpoint}'])
    ]

print("\n✅ Divvy sample dataset with start/end neighborhoods added:")
print(divvy_sample[['latitude_start', 'longitude_start', 'start_neighborhood',
                    'latitude_end', 'longitude_end', 'end_neighborhood']].head())


✅ Divvy sample dataset with start/end neighborhoods added:
         latitude_start  longitude_start start_neighborhood  latitude_end  \
3810233       41.865212       -87.617759      Museum Campus     41.884728   
1685437       41.881469       -87.635177               Loop     41.893843   
8771251       41.916017       -87.668879           Bucktown     41.899930   
2099479       41.872293       -87.624091         Grant Park     41.872078   
5389004       41.838842       -87.621857            Douglas     41.799336   

         longitude_end end_neighborhood  
3810233     -87.619521             Loop  
1685437     -87.641851      River North  
8771251     -87.634430      River North  
2099479     -87.629544     Printers Row  
5389004     -87.600958        Hyde Park  


In [None]:
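# Optional export (currently disabled): uncomment to save the crime frame with neighborhoods to Drive.
#crime.to_csv('/content/drive/MyDrive/crime_with_neighborhoods.csv', index=False)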

In [None]:
# Save only the sample Divvy dataset with neighborhoods
# (index=False leaves the DataFrame's row index out of the CSV).
divvy_sample.to_csv('/content/drive/MyDrive/divvy_sample_with_neighborhoods.csv', index=False)


In [None]:
# Check unique years in Divvy dataset
unique_years = divvy['year'].unique()
# tolist() converts NumPy scalars to plain Python numbers, so the list prints
# as [2014, ...] rather than [np.int64(2014), ...].
print("Years available in Divvy dataset:", sorted(unique_years.tolist()))


Years available in Divvy dataset: [np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017)]


In [None]:
import pandas as pd

# Make sure the 'Date' column is in datetime format
# NOTE(review): the recorded output echoes this line as a warning source,
# presumably pandas' "could not infer format" UserWarning; consider passing
# an explicit format= once the file's date layout is confirmed.
crime['Date'] = pd.to_datetime(crime['Date'], errors='coerce')  # converts invalid dates to NaT

# Extract year into a new column (NaN for rows whose date could not be parsed)
crime['year'] = crime['Date'].dt.year

# Check unique years. NaN entries (from unparsed dates) are skipped because
# sorted() cannot order them reliably, and the remaining values are converted
# to plain ints so the list prints as [2019] rather than [np.int32(2019)].
unique_crime_years = crime['year'].unique()
print(
    "Years available in Crime dataset:",
    sorted(int(year) for year in unique_crime_years if pd.notna(year)),
)


Years available in Crime dataset: [np.int32(2019)]


  crime['Date'] = pd.to_datetime(crime['Date'], errors='coerce')  # converts invalid dates to NaT
