# Join and Pre-Process the New York City Datasets

This notebook joins and pre-processes three New York City datasets:

- Crimes
- Census
- Housing

In [1]:
import math
import pandas as pd

In [2]:
# Load the three raw inputs; each one is a comma-separated text file.
df_census = pd.read_csv("Input/census_data.csv")
df_crimes = pd.read_csv("Input/crimes_data.csv")
df_housing = pd.read_csv("Input/housing_data.csv")

### Auxiliary Function: Rename Columns

In [3]:
def rename_columns(df, string):
    """Build a rename mapping that appends ``' ' + string`` to every column name.

    The frame itself is left untouched; the returned dict is meant to be passed
    to ``DataFrame.rename(columns=...)``.

    Parameters
    ----------
    df : object exposing a ``columns`` iterable (e.g. a pandas DataFrame)
        Source of the current column names.
    string : str
        Suffix to attach to each name, separated from it by a single space.

    Returns
    -------
    dict
        ``{old_name: old_name + ' ' + string}`` for each column, in column order.
    """
    return {name: name + ' ' + string for name in df.columns}

In [4]:
# Tag every crimes column with a ' Crimes' suffix so its origin stays visible after the join.
columns = rename_columns(df_crimes, 'Crimes')
df_crimes = df_crimes.rename(columns=columns)
df_crimes.head()

Unnamed: 0,Event Date Crimes,Event Ending Date Crimes,Event Report Date Crimes,Description Crimes,Attempt Complete Crimes,Level of Offense Crimes,Borough Crimes,Latitude Crimes,Longitude Crimes
0,12/31/2015,,12/31/2015,FORGERY,COMPLETED,FELONY,BRONX,40.828848,-73.916661
1,12/31/2015,,12/31/2015,MURDER & NON-NEGL. MANSLAUGHTER,COMPLETED,FELONY,QUEENS,40.697338,-73.784557
2,12/31/2015,,12/31/2015,DANGEROUS DRUGS,COMPLETED,FELONY,MANHATTAN,40.802607,-73.945052
3,12/31/2015,,12/31/2015,ASSAULT 3 & RELATED OFFENSES,COMPLETED,MISDEMEANOR,QUEENS,40.654549,-73.726339
4,12/31/2015,12/31/2015,12/31/2015,ASSAULT 3 & RELATED OFFENSES,COMPLETED,MISDEMEANOR,MANHATTAN,40.738002,-73.987891


In [5]:
# Tag every census column with a ' Census' suffix so its origin stays visible after the join.
columns = rename_columns(df_census, 'Census')
df_census = df_census.rename(columns=columns)
df_census.head()

Unnamed: 0,BlockCode Census,Latitude Census,Longitude Census,County Census,Borough Census,TotalPop Census,Men Census,Women Census,Hispanic Census,White Census,...,Walk Census,OtherTransp Census,WorkAtHome Census,MeanCommute Census,Employed Census,PrivateWork Census,PublicWork Census,SelfEmployed Census,FamilyWork Census,Unemployment Census
0,36005000100,40.792756,-73.883541,BRONX,BRONX,7703,7133,570,29.9,6.1,...,,,,,0,,,,,
1,36005000200,40.805866,-73.860278,BRONX,BRONX,5403,2659,2744,75.8,2.3,...,2.9,0.0,0.0,43.0,2308,80.8,16.2,2.9,0.0,7.7
2,36005000400,40.807594,-73.851237,BRONX,BRONX,5915,2896,3019,62.7,3.6,...,1.4,0.5,2.1,45.0,2675,71.7,25.3,2.5,0.6,9.5
3,36005001600,40.819196,-73.85804,BRONX,BRONX,5879,2558,3321,65.1,1.6,...,8.6,1.6,1.7,38.8,2120,75.0,21.3,3.8,0.0,8.7
4,36005001900,40.801216,-73.909212,BRONX,BRONX,2591,1206,1385,55.4,9.0,...,3.0,2.4,6.2,45.4,1083,76.8,15.5,7.7,0.0,19.2


In [6]:
# Housing keeps its original column names; preview the first five rows.
df_housing.head()

Unnamed: 0,Borough,Latitude,Longitude,Extremely Low Income Units,Very Low Income Units,Low Income Units,Moderate Income Units,Middle Income Units,Other Income Units,Studio Units,Mean Bathrooms,County
0,MANHATTAN,40.811202,-73.942995,0,16,24,0,38,1,16,1.35443,NEW YORK
1,MANHATTAN,40.833829,-73.941339,20,3,1,0,0,1,0,2.6,NEW YORK
2,MANHATTAN,40.83516,-73.941663,6,11,3,0,0,0,0,2.0,NEW YORK
3,MANHATTAN,40.835295,-73.941977,4,14,2,0,0,0,0,2.0,NEW YORK
4,BROOKLYN,40.708637,-73.948486,59,7,4,0,0,1,0,2.197183,KINGS


### Defining distance between two geographic points

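Given two points $(lat_a, lon_a)$ and $(lat_b, lon_b)$, with all angles converted to radians, the great-circle distance in kilometres is computed with the haversine formula:

$$
\begin{aligned}
    earth\_radius &= 6373.0 \\
    dist\_lat &= lat_b - lat_a \\
    dist\_lon &= lon_b - lon_a \\
    a &= \sin^2\left(\frac{dist\_lat}{2}\right) + \cos(lat_a) \times \cos(lat_b) \times \sin^2\left(\frac{dist\_lon}{2}\right) \\
    b &= 2 \times \operatorname{atan2}\left(\sqrt{a}, \sqrt{1-a}\right) \\
    distance &= earth\_radius \times b
\end{aligned}
$$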

In [7]:
def geo_point_distance(lat_a, long_a, lat_b, long_b, earth_radius=6373.0):
    """Return the great-circle (haversine) distance between two geographic points.

    Parameters
    ----------
    lat_a, long_a : float
        Latitude and longitude of the first point, in decimal degrees.
    lat_b, long_b : float
        Latitude and longitude of the second point, in decimal degrees.
    earth_radius : float, optional
        Sphere radius used to scale the central angle. The default, 6373.0,
        is an approximate Earth radius in kilometres, so the result is in
        kilometres unless another radius is supplied.

    Returns
    -------
    float
        Distance along the sphere surface, in the unit of ``earth_radius``.
        NaN coordinates propagate to a NaN result.
    """
    lat_a_rad = math.radians(lat_a)
    lat_b_rad = math.radians(lat_b)
    half_delta_lat = (lat_b_rad - lat_a_rad) / 2
    half_delta_lon = (math.radians(long_b) - math.radians(long_a)) / 2

    # Haversine of the central angle (same evaluation order as before, so
    # existing results are reproduced bit for bit).
    longitude_term = math.cos(lat_b_rad) * math.sin(half_delta_lon) ** 2
    haversine = math.sin(half_delta_lat) ** 2 + math.cos(lat_a_rad) * longitude_term

    # Rounding can leave the value a hair above 1 for (near-)antipodal points,
    # which would make sqrt(1 - haversine) fail with a math domain error.
    # A plain comparison is used (not min()) so that NaN still propagates.
    if haversine > 1.0:
        haversine = 1.0

    central_angle = 2 * math.atan2(math.sqrt(haversine), math.sqrt(1 - haversine))

    return earth_radius * central_angle

### Measuring geopoint distances

In [8]:
def measure_geopoint(df_one, df_two, geo_string):
    """Attach to every row of ``df_one`` the geographically nearest row of ``df_two``.

    For each row of ``df_one`` the distance to every row of ``df_two`` is
    computed with ``geo_point_distance`` and the closest one is kept (the first
    one on ties). All columns of that ``df_two`` row are copied onto the
    ``df_one`` row, together with a ``'<geo_string> Distance'`` column holding
    the distance (kilometres).

    Parameters
    ----------
    df_one : pandas.DataFrame
        Frame to enrich. Must contain ``'Latitude'`` and ``'Longitude'``.
        It is modified in place and also returned.
    df_two : pandas.DataFrame
        Frame to search. Must contain ``'Latitude <geo_string>'`` and
        ``'Longitude <geo_string>'``. It is only read.
    geo_string : str
        Suffix identifying the coordinate columns of ``df_two``; also used as
        the prefix of the new distance column.

    Returns
    -------
    pandas.DataFrame
        ``df_one`` itself, with the columns of ``df_two`` and the distance
        column added. Rows are matched by position, so neither frame needs a
        default integer index. Copied columns keep the dtype they have in
        ``df_two`` unless a row could not be matched.

    Raises
    ------
    ValueError
        If ``df_one`` has rows but ``df_two`` is empty.

    Notes
    -----
    Candidates whose distance is NaN (missing coordinates) are never selected.
    A ``df_one`` row for which no candidate yields a valid distance receives
    missing values in every added column.
    The search compares every pair of rows, so the cost grows with
    ``len(df_one) * len(df_two)``.
    """
    if len(df_one) == 0:
        # Nothing to match: hand the frame back untouched, as before.
        return df_one
    if len(df_two) == 0:
        raise ValueError("df_two is empty: there is no candidate row to match against")

    # Materialise the candidate coordinates once instead of once per row.
    candidates = list(zip(df_two['Latitude ' + geo_string],
                          df_two['Longitude ' + geo_string]))

    nearest_positions = []
    nearest_distances = []
    for lat, lon in zip(df_one['Latitude'], df_one['Longitude']):
        best_position, best_distance = -1, math.inf
        for position, (candidate_lat, candidate_lon) in enumerate(candidates):
            distance = geo_point_distance(lat, lon, candidate_lat, candidate_lon)
            # NaN compares False here, so rows with missing coordinates are skipped;
            # the strict '<' keeps the first candidate on ties.
            if distance < best_distance:
                best_position, best_distance = position, distance
        nearest_positions.append(best_position)
        nearest_distances.append(best_distance if best_position >= 0 else math.nan)

    # Positional lookup: after reset_index the labels equal the positions, and the
    # sentinel -1 matches no label, so unmatched rows come back filled with NaN.
    matched = df_two.reset_index(drop=True).reindex(nearest_positions)

    # Assign whole columns as plain arrays so the index labels of df_one play no role.
    for column in df_two.columns:
        df_one[column] = matched[column].to_numpy()
    df_one[geo_string + ' Distance'] = nearest_distances

    return df_one

In [9]:
# Attach to each housing record its closest crime record plus a 'Crimes Distance' column.
df_housing = measure_geopoint(df_one=df_housing, df_two=df_crimes, geo_string='Crimes')

In [10]:
# Attach to each housing record its closest census block plus a 'Census Distance' column.
# measure_geopoint returns its first argument, so `df` and `df_housing` are the same object.
df = measure_geopoint(df_one=df_housing, df_two=df_census, geo_string='Census')

In [90]:
# Preview the census frame.
# NOTE(review): the execution count (90) is out of sequence and the saved output lists
# '... Housing' columns that no cell in this notebook creates; it looks like it was left
# over from an earlier version. Re-run from a fresh kernel to refresh it. TODO confirm
df_census.head()

Unnamed: 0,BlockCode,Latitude,Longitude,County,Borough,TotalPop,Men,Women,Hispanic,White,...,Extremely Low Income Units Housing,Very Low Income Units Housing,Low Income Units Housing,Moderate Income Units Housing,Middle Income Units Housing,Other Income Units Housing,Studio Units Housing,Mean Bathrooms Housing,County Housing,Housing Distance
0,36005000100,40.792756,-73.883541,BRONX,BRONX,7703,7133,570,29.9,6.1,...,0.0,0.0,17.0,66.0,0.0,1.0,12.0,1.357143,QUEENS,2.275122
1,36005000200,40.805866,-73.860278,BRONX,BRONX,5403,2659,2744,75.8,2.3,...,4.0,17.0,26.0,12.0,0.0,0.0,0.0,1.0,BRONX,1.047753
2,36005000400,40.807594,-73.851237,BRONX,BRONX,5915,2896,3019,62.7,3.6,...,0.0,31.0,0.0,0.0,0.0,0.0,0.0,1.0,BRONX,1.375001
3,36005001600,40.819196,-73.85804,BRONX,BRONX,5879,2558,3321,65.1,1.6,...,0.0,39.0,0.0,0.0,0.0,0.0,0.0,2.564103,BRONX,0.196547
4,36005001900,40.801216,-73.909212,BRONX,BRONX,2591,1206,1385,55.4,9.0,...,17.0,2.0,15.0,0.0,0.0,0.0,0.0,2.235294,BRONX,0.572929


In [11]:
# Persist the joined frame as comma-separated text, leaving out the row index.
df.to_csv("teste.csv", index=False)