## Import Dependencies

In [1]:
#Import Dependencies
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import datetime as dt
import json
from math import cos, asin, sqrt

## Extract CSV File

### Store CSV Into Dataframe

In [2]:
#Store COBRA crime data CSV as dataframe
cobra_csv_file = "../../data/COBRA-2022.csv"
cobra_complete_df = pd.read_csv(cobra_csv_file)
cobra_complete_df.head()

Unnamed: 0,offense_id,rpt_date,occur_date,occur_day,occur_day_num,occur_time,poss_date,poss_time,beat,zone,location,ibr_code,UC2_Literal,neighborhood,npu,lat,long
0,22011609,7/20/2022,7/20/2022,Wednesday,4.0,18:30,7/20/2022,19:00,607,6,"1395 CUSTER WAY SE\nATLANTA, GA 30316\nUNITED ...",23H,LARCENY-NON VEHICLE,Custer/McDonough/Guice,W,33.716073,-84.353217
1,22254073,9/11/2022,9/11/2022,Sunday,1.0,11:15,9/11/2022,11:45,606,6,"777 MEMORIAL DR SE\nATLANTA, GA 30316\nUNITED ...",23F,LARCENY-FROM VEHICLE,Reynoldstown,N,33.746335,-84.361753
2,202800283,6/14/2022,10/6/2020,Tuesday,3.0,05:21,10/6/2020,05:40,413,4,"4050 BLANTON AVE SW\nATLANTA, GA 30331\nUNITED...",23F,LARCENY-FROM VEHICLE,Fairburn,P,33.690551,-84.52097
3,203140048,2/21/2022,11/9/2020,Monday,2.0,00:32,11/9/2020,01:49,409,4,"2111 CAMPBELLTON RD SW\nATLANTA, GA 30311\nUNI...",13A,AGG ASSAULT,Adams Park,R,33.706526,-84.458
4,203250206,4/15/2022,11/20/2020,Friday,6.0,01:40,11/20/2020,01:45,109,1,"2621 DONALD LEE HOLLOWELL PKWY NW\nATLANTA, GA...",23H,LARCENY-NON VEHICLE,Center Hill,J,33.776168,-84.472729


## Transform COBRA crime dataframe

In [3]:
#Create a new dataframe with just the columns we want to use
cobra_summary_df = cobra_complete_df[["offense_id", "occur_date", "UC2_Literal", "neighborhood", "lat", "long"]]
cobra_summary_df.head()

Unnamed: 0,offense_id,occur_date,UC2_Literal,neighborhood,lat,long
0,22011609,7/20/2022,LARCENY-NON VEHICLE,Custer/McDonough/Guice,33.716073,-84.353217
1,22254073,9/11/2022,LARCENY-FROM VEHICLE,Reynoldstown,33.746335,-84.361753
2,202800283,10/6/2020,LARCENY-FROM VEHICLE,Fairburn,33.690551,-84.52097
3,203140048,11/9/2020,AGG ASSAULT,Adams Park,33.706526,-84.458
4,203250206,11/20/2020,LARCENY-NON VEHICLE,Center Hill,33.776168,-84.472729


In [4]:
#Rename column for easier readability
cobra_summary_df.rename(columns={'UC2_Literal':'crime_type'}, inplace = True)
cobra_summary_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,offense_id,occur_date,crime_type,neighborhood,lat,long
0,22011609,7/20/2022,LARCENY-NON VEHICLE,Custer/McDonough/Guice,33.716073,-84.353217
1,22254073,9/11/2022,LARCENY-FROM VEHICLE,Reynoldstown,33.746335,-84.361753
2,202800283,10/6/2020,LARCENY-FROM VEHICLE,Fairburn,33.690551,-84.52097
3,203140048,11/9/2020,AGG ASSAULT,Adams Park,33.706526,-84.458
4,203250206,11/20/2020,LARCENY-NON VEHICLE,Center Hill,33.776168,-84.472729


In [5]:
#Convert 'occur_date' column from a string type to a date type
cobra_summary_df["occur_date"] = pd.to_datetime(cobra_summary_df["occur_date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
#Filter out any rows with an 'occur_date' that is NOT in 2022
cobra_summary_df = cobra_summary_df.loc[cobra_summary_df["occur_date"] > "2021-12-31"]
cobra_summary_df.head()

Unnamed: 0,offense_id,occur_date,crime_type,neighborhood,lat,long
0,22011609,2022-07-20,LARCENY-NON VEHICLE,Custer/McDonough/Guice,33.716073,-84.353217
1,22254073,2022-09-11,LARCENY-FROM VEHICLE,Reynoldstown,33.746335,-84.361753
13,210770926,2022-01-15,LARCENY-NON VEHICLE,Arlington Estates,33.688746,-84.54419
21,212630747,2022-07-11,HOMICIDE,Fairburn Heights,33.772107,-84.501745
60,220010108,2022-01-01,AGG ASSAULT,Summerhill,33.738249,-84.383948


In [7]:
#Get rid of any rows with a duplicate 'offense_id'
cobra_summary_df = cobra_summary_df.drop_duplicates(subset='offense_id', keep="first")

# reset index 
cobra_summary_df.reset_index(drop=True)

Unnamed: 0,offense_id,occur_date,crime_type,neighborhood,lat,long
0,22011609,2022-07-20,LARCENY-NON VEHICLE,Custer/McDonough/Guice,33.716073,-84.353217
1,22254073,2022-09-11,LARCENY-FROM VEHICLE,Reynoldstown,33.746335,-84.361753
2,210770926,2022-01-15,LARCENY-NON VEHICLE,Arlington Estates,33.688746,-84.544190
3,212630747,2022-07-11,HOMICIDE,Fairburn Heights,33.772107,-84.501745
4,220010108,2022-01-01,AGG ASSAULT,Summerhill,33.738249,-84.383948
...,...,...,...,...,...,...
21007,223560731,2022-12-22,LARCENY-NON VEHICLE,Edgewood,33.757886,-84.347406
21008,223560734,2022-12-19,AUTO THEFT,,33.753515,-84.495354
21009,223560793,2022-12-12,LARCENY-NON VEHICLE,Downtown,33.748887,-84.392788
21010,223560803,2022-12-22,LARCENY-NON VEHICLE,Harland Terrace,33.751137,-84.486120


## Read in MARTA rail station file

In [8]:
#Store MARTA rail station data CSV as dataframe
rail_station_csv_file = "../../data/Transit_Rail_Stations.csv"
rail_station_complete_df = pd.read_csv(rail_station_csv_file)
rail_station_complete_df.head()

Unnamed: 0,latitude,longitude,altitude,geometry,OBJECTID,STATION,Stn_Code,Stn_Site,Extrude,GlobalID,last_edited_date
0,33.774524,-84.295376,,Point,1,Decatur,E6,http://www.itsmarta.com/getthere/stations/deca...,100,{8580187C-665E-47DD-87C6-A0C9C500B113},
1,33.775031,-84.282338,,Point,2,Avondale,E7,http://www.itsmarta.com/getthere/stations/avon...,100,{2ED7EC52-BD69-4B51-A859-6548477EA29B},
2,33.887431,-84.306324,,Point,3,Chamblee,NE9,http://www.itsmarta.com/getthere/stations/cham...,100,{4A2A920F-B4D7-4720-9A0D-1F0633981AC3},
3,33.700331,-84.428944,,Point,4,Lakewood-Ft. McPherson,S4,http://www.itsmarta.com/getthere/stations/lake...,100,{079686C9-BC4D-47EF-A03D-1AC588F90C39},
4,33.717032,-84.42516,,Point,5,Oakland City,S3,http://www.itsmarta.com/getthere/stations/oakl...,100,{929088B5-0B87-4557-A5B5-3239FE62D857},


In [9]:
#Create a new dataframe with just the columns we want to use
rail_station_summary_df = rail_station_complete_df[["STATION", "latitude", "longitude"]]
rail_station_summary_df.head()

Unnamed: 0,STATION,latitude,longitude
0,Decatur,33.774524,-84.295376
1,Avondale,33.775031,-84.282338
2,Chamblee,33.887431,-84.306324
3,Lakewood-Ft. McPherson,33.700331,-84.428944
4,Oakland City,33.717032,-84.42516


In [22]:
#Rename column for easier readability
rail_station_summary_df.rename(columns={'STATION':'station'}, inplace = True)
rail_station_summary_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,station,latitude,longitude
0,Decatur,33.774524,-84.295376
1,Avondale,33.775031,-84.282338
2,Chamblee,33.887431,-84.306324
3,Lakewood-Ft. McPherson,33.700331,-84.428944
4,Oakland City,33.717032,-84.42516


## Find distance in lat and long of crime from rail station

In [10]:
#Put the rail station dataframe into a list of dictionaries
rail_station_data = rail_station_summary_df.to_dict(orient='records')
rail_station_data

[{'STATION': 'Decatur',
  'latitude': 33.77452350213134,
  'longitude': -84.2953761869187},
 {'STATION': 'Avondale',
  'latitude': 33.775030857776706,
  'longitude': -84.28233754763869},
 {'STATION': 'Chamblee',
  'latitude': 33.887430936043835,
  'longitude': -84.30632437676441},
 {'STATION': 'Lakewood-Ft. McPherson',
  'latitude': 33.700331434235736,
  'longitude': -84.4289440309677},
 {'STATION': 'Oakland City',
  'latitude': 33.71703218276701,
  'longitude': -84.42515985144823},
 {'STATION': 'West End',
  'latitude': 33.7358233864717,
  'longitude': -84.41366998137127},
 {'STATION': 'Arts Center',
  'latitude': 33.789084201961124,
  'longitude': -84.3868068781822},
 {'STATION': 'Lindbergh Center',
  'latitude': 33.82248569901961,
  'longitude': -84.36953392502531},
 {'STATION': 'Kensington',
  'latitude': 33.77259708180039,
  'longitude': -84.25191914320045},
 {'STATION': 'Georgia State',
  'latitude': 33.74997973158319,
  'longitude': -84.38584557158936},
 {'STATION': 'King Memori

In [11]:
#Put Crime dataframe into a list of dictionaries
cobra_summary_data = cobra_summary_df.to_dict(orient='records')

In [12]:
#Functions for finding the closest MARTA rail station to each crime
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295
    hav = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
    return 12742 * asin(sqrt(hav))

def closest(data, v):
    return min(data, key=lambda p: distance(v['latitude'],v['longitude'],p['latitude'],p['longitude']))
                                                   
v = {'latitude': 33.716073, 'longitude': -84.353217}
print(closest(rail_station_data, v))

{'STATION': 'King Memorial', 'latitude': 33.74980807108449, 'longitude': -84.37554588614684}


In [13]:
#Create empty lists to hold closest rail station data
closest_station_name = []
closest_station_lat = []
closest_station_long = []
distance_from_station = []

In [14]:
#Loop through each crime and find the closest MARTA rail station
for i in range(len(cobra_summary_data)):
    crime_lat = cobra_summary_data[i]['lat']
    crime_lon = cobra_summary_data[i]['long']
    v = {'latitude': crime_lat, 'longitude': crime_lon}
    closest_rail_station = closest(rail_station_data, v)
    closest_station_name.append(closest_rail_station['STATION'])
    closest_station_lat.append(closest_rail_station['latitude'])
    closest_station_long.append(closest_rail_station['longitude'])
    distance_to_station = distance(crime_lat, crime_lon, closest_rail_station['latitude'], closest_rail_station['longitude'])
    distance_from_station.append(distance_to_station)


In [15]:
#Create empty lists to hold distance data
diff_in_lat = []
diff_in_long = []

In [16]:
#Loop through each crime and find the difference in latitude and longitude from closest MARTA rail station
for j in range(len(cobra_summary_data)):
    lat_difference = abs(closest_station_lat[j] - cobra_summary_data[j]['lat'])
    diff_in_lat.append(lat_difference)
    long_difference = abs(closest_station_long[j] - cobra_summary_data[j]['long'])
    diff_in_long.append(long_difference)

In [17]:
#Append new columns to cobra_summary_df dataframe
cobra_summary_df['closest_station'] = closest_station_name
cobra_summary_df['difference_in_lat'] = diff_in_lat
cobra_summary_df['difference_in_long'] = diff_in_long
cobra_summary_df['distance_away'] = distance_from_station

cobra_summary_df.head()

Unnamed: 0,offense_id,occur_date,crime_type,neighborhood,lat,long,closest_station,difference_in_lat,difference_in_long,distance_away
0,22011609,2022-07-20,LARCENY-NON VEHICLE,Custer/McDonough/Guice,33.716073,-84.353217,King Memorial,0.033735,0.022329,4.281915
1,22254073,2022-09-11,LARCENY-FROM VEHICLE,Reynoldstown,33.746335,-84.361753,King Memorial,0.003473,0.013793,1.332445
13,210770926,2022-01-15,LARCENY-NON VEHICLE,Arlington Estates,33.688746,-84.54419,East Point,0.011871,0.10339,9.657057
21,212630747,2022-07-11,HOMICIDE,Fairburn Heights,33.772107,-84.501745,Hamilton E. Holmes,0.017695,0.031305,3.499359
60,220010108,2022-01-01,AGG ASSAULT,Summerhill,33.738249,-84.383948,Georgia State,0.011731,0.001898,1.316145


## Export tables to Postgres database

In [18]:
#Connect to database
protocol = 'postgresql'
username = 'postgres'
password = 'postgres'
host = 'localhost'
port = 5432
database_name = 'secret_of_nimby'
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

In [19]:
#Check for tables
engine.table_names()

['cobra_complete', 'cobra_summary', 'transit_rail_station']

In [20]:
#Load in the cobra_summary table
cobra_summary_df.to_sql(name='cobra_summary', con=engine, if_exists='append', index=False)

In [23]:
#Load in the transit rail station table
rail_station_summary_df.to_sql(name='transit_rail_station', con=engine, if_exists='append', index=False)