# Lat Long Analysis

This portion of our project uses the schools' latitude and longitude (previously obtained using geocoder) to perform some final cleaning on the VADIR data and then join it to the NYC crime data by location. The code below will:  

* (A): __Load vadir and felony data__ with the functions from cleandata.py
* (B): __Join latlong data__ to the school data frame (using beds/sed code).
* (C): __Give schools consistent names__ (we'll use the names from the lat long file)  
* (D): __Fill in missing boroughs__ for records from 2006-2007.  
* (E): __Identify felonies within a 1 mile__ radius of a given school.  
* (F): __Plot correlations__ between school incidents and felonies (by year, by borough, by felony type, by location, by school incident type).

In [1]:
%load_ext autoreload
%autoreload 2
% matplotlib inline

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from vincenty import vincenty
import cleandata as cd
import time

## A. Load data using functions from cleandata.py

In [2]:
# Load the school (VADIR) and felony (NYPD) data through the cleandata helpers,
# and read the school coordinates from the SchoolLatLon.csv file.
school_df = cd.load_and_clean_VADIR()
felony_df = cd.load_and_clean_NYPD()
latlon_df = pd.read_csv('SchoolLatLon.csv', index_col=0)
# TODO: replace the read_csv call above with a call to Aaron's geocoder function?

NameError: name 'cd' is not defined

## B. Join School Data and Location Data

In [None]:
# Helper function -- extract lat/long from object type
def parse_latlong(dataframe, loc_column):
    """
    Function to extract lat/long coords.
    INPUT: dataframe and name of the column holding each coordinate pair
           as a string, e.g. "(40.82, -73.89)" or "[40.82, -73.89]".
           Surrounding brackets and whitespace are optional.
    OUTPUT: n/a. Function modifies dataframe to add a 'lat' and a 'long'
            column with float64 type. Entries that are not strings
            (e.g. NaN for a missing location) become NaN in both columns.
    """
    def extract(value, position):
        # Non-string entries (NaN / None) carry no location information.
        if not isinstance(value, str):
            return np.nan
        # Strip brackets explicitly rather than dropping the first and last
        # character, which would eat a digit when the brackets are absent
        # or when the string carries stray whitespace.
        return value.strip().strip('()[]').split(',')[position]

    locations = dataframe[loc_column]
    dataframe['lat'] = locations.apply(extract, args=(0,)).astype('float64')
    dataframe['long'] = locations.apply(extract, args=(1,)).astype('float64')
    print('... latitude and longitude extracted for dataframe.')

In [None]:
# Helper function to add latitude and longitudes to school data frame
def join_latlong(school_df, latlon_df, talk=False):
    """
    Function to add (and parse) latitude and longitude
    information for each school.
    INPUT: school dataframe including "BEDS Code" column,
           latitude/longitude dataframe w/ "SED CODE" and 'latlon' cols.
           (optional 'talk' bool turns on/off print statements)
    OUTPUT: new dataframe with school data plus the columns of the
            latitude/longitude dataframe (minus "SED CODE") and
            lat(float64) and long(float64). The two input dataframes
            are left unmodified.
    """
    # Cast both keys to int64 so that identical codes compare equal.
    # assign() builds new frames, so the caller's inputs are not altered.
    schools = school_df.assign(**{"BEDS Code": school_df["BEDS Code"].astype(np.int64)})
    locations = latlon_df.assign(**{"SED CODE": latlon_df["SED CODE"].astype(np.int64)})

    # left join: keep every school record, even those without a location
    joined = pd.merge(schools, locations, left_on=['BEDS Code'],
                      right_on=['SED CODE'], how='left')

    # parse latlon object in to numerical columns
    parse_latlong(joined, 'latlon')

    # drop the now redundant SED code
    joined = joined.drop(['SED CODE'], axis=1)

    # Take a look at the resulting data/missing values
    if talk:
        has_location = joined['latlon'].notnull()
        schools_withloc = joined.loc[has_location, 'BEDS Code'].unique()
        schools_missingloc = joined.loc[~has_location, 'BEDS Code'].unique()
        print('... joined df inclues {} unique schools,'.format(len(joined['BEDS Code'].unique())))
        print('... of which {} have lat/long'.format(len(schools_withloc)),
              'and {} are missing lat/long'.format(len(schools_missingloc)))

    return joined

In [None]:
# Function Call
school_df = join_latlong(school_df, latlon_df, talk=True)

# Take a look -- uncomment to run
#school_df.head(2)

## C. Fix School Names using info from 'latlon' file.

In [None]:
# Helper Function
def fix_case(x):
    """
    Function to put a school name in the correct case.
    INPUT: a school name (string).
    OUTPUT: the name in title case, keeping a leading 'PS ', 'JHS' or
            'MS ' prefix in upper case. Empty strings and values that
            are not strings (e.g. None or NaN for a missing name) are
            returned unchanged.
    """
    # Missing names show up as NaN (a float), which is truthy and cannot be
    # sliced, so check the type before touching the value.
    if not isinstance(x, str) or not x:
        return x
    prefix, rest = x[:3], x[3:]
    if prefix in ('PS ', 'JHS', 'MS '):
        return prefix + rest.title()
    return x.title()

In [None]:
# Helper function to eliminate duplicated school names
def fix_school_names(school_df, talk=False):
    """
    Function to give every school one consistent name.
    INPUT: school_df with original 'School Name' column
           and with 'LEGAL NAME' column from latlong file.
           (optional 'talk' bool turns on/off print statements)
    OUTPUT: n/a, fxn modifies school_df: 'School Name' is replaced by the
            case-corrected legal name (falling back to the original name
            where no legal name exists) and 'LEGAL NAME' is dropped.
    """
    if talk:
        print('... original dataset had {} unique'.format(len(school_df['School Name'].unique())),
              'school names but only {} unique BEDS Codes'.format(len(school_df['BEDS Code'].unique())))

    # Fall back to the original School Name where the LEGAL NAME is missing.
    # Build a new Series instead of calling fillna(inplace=True) on the column
    # selection: that chained in-place form is not guaranteed to update the
    # parent dataframe (it does nothing under pandas Copy-on-Write).
    legal_names = school_df['LEGAL NAME'].fillna(school_df['School Name'])
    # Fix case and reassign to School Name
    school_df['School Name'] = legal_names.apply(fix_case)
    # drop the now redundant LEGAL NAME column
    school_df.drop(['LEGAL NAME'], axis=1, inplace=True)

    if talk:
        print('... new dataset has {} unique school '.format(len(school_df['School Name'].unique())),
              'names and {} unique BEDS Codes.'.format(len(school_df['BEDS Code'].unique())))

In [None]:
# function call
fix_school_names(school_df, talk=True)

# Take a look -- uncomment to run
#school_df.head(2)

## D. Fill in missing boroughs

__TODO:__ From the numbers below it looks like some of the County values got switched around (e.g. a decrease in Manhattan counts?). I think we probably need a way of creating the county_map dictionary that prioritizes the traditional borough name. Come back to this.

In [None]:
# Helper function to fill in boroughs
def fill_in_boroughs(school_df, talk=False):
    """
    Function to assign a county (borough) to every record of a school.
    INPUT: school_df with 'County' and 'BEDS Code' columns
           (optional 'talk' bool turns on/off print statements)
    OUTPUT: n/a, fxn modifies school_df: 'County' is rewritten for every
            row from a BEDS Code -> County lookup built from the rows
            that do have a county. When a BEDS Code appears with several
            counties, the one from the last such row is used.
    """
    if talk:
        n_missing = school_df['County'].isnull().sum()
        print('... Originally, {} entries were missing'.format(n_missing),
              ' county info. Other counties:\n', school_df['County'].value_counts())

    # lookup of county by BEDS Code (later rows overwrite earlier ones)
    known = school_df.loc[school_df['County'].notnull(), ['BEDS Code', 'County']]
    county_map = dict(zip(known['BEDS Code'], known['County']))

    # rewrite the whole column from the lookup
    school_df['County'] = school_df['BEDS Code'].map(county_map)

    if talk:
        n_missing = school_df['County'].isnull().sum()
        print('... Now {} entries are missing'.format(n_missing),
              ' county info. Other counties:\n', school_df['County'].value_counts())

In [None]:
# function call
fill_in_boroughs(school_df, talk=True)

# Take a look -- uncomment to run
#school_df.head(2)

In [None]:
# QUICK CHECK - are any BEDS Codes linked with more than one Borough (County)?
# For each BEDS Code, count its distinct County values, then tabulate how many
# BEDS Codes have each count.
school_df.groupby('BEDS Code')['County'].apply(lambda x: len(x.unique())).value_counts()

### Load and Prep NYC data

In [None]:
# load NYC dataframe
felony_df = pd.read_csv('NYPD_7_Major_Felony_Incidents.csv', index_col = False)

In [None]:
# ... and clean it   
felony_df = cd.clean_NYPD(felony_df)

# take a look -- uncomment to run
#felony_df.head()

In [None]:
# Visual check that the school year column calculation worked: mean
# 'School Year' per (month, year) group, showing only the 'Jun' entries.
felony_df.groupby(['Occurrence Month','Occurrence Year'])['School Year'].mean()['Jun']

In [None]:
# Extract latitude and longitude from the 'Location 1' column of the felony
# dataframe (adds float 'lat' and 'long' columns in place).
parse_latlong(felony_df, 'Location 1')

---

## E. Extracting crime tallies w/in radius of schools

The functions below build together to ultimately 'join' our two datasets by adding columns to the school dataset for tallies of each type of felony that occurred w/in a 1 mile radius. 

#### Part 1: Distance Calculation using Vincenty

In [None]:
# Sanity check: the two counts should match unless some rows have a
# 'latlon' entry that did not yield a 'lat' value.
missing_latlon = school_df['latlon'].isnull().sum()
missing_lat = school_df['lat'].isnull().sum()
print('... there are {} missing latlon entries'.format(missing_latlon))
print('... there are {} missing lat entries'.format(missing_lat))

In [None]:
# Helper function -- check dist
def is_in_radius(school_point, crime_point, radius):
    """
    Function using vincenty package to check distance between school and crime.
    INPUT: (lat,long) tuples for school and crime (in degrees), radius in miles.
    OUTPUT: Boolean -- True when the two points are at most `radius` miles
            apart; False otherwise, including when no distance could be
            computed.
    """
    distance = vincenty(school_point, crime_point, miles=True)
    # vincenty gives back None when its iteration fails to converge;
    # comparing None with a number raises TypeError in Python 3, so treat
    # "no distance available" as "not within the radius".
    return distance is not None and distance <= radius

In [None]:
# Testing vincenty on the school at index label 0 and the felony at index label 1
first_school_point = (school_df.loc[0,'lat'], school_df.loc[0,'long']) 
first_felony_point = (felony_df.loc[1,'lat'], felony_df.loc[1,'long']) 

# not w/in 2 miles, but yes, w/in 50
# NOTE(review): the raw distance printed below is computed without miles=True,
# so it is presumably not in miles like the radius checks -- confirm the unit.
print('Distance: ', vincenty(first_school_point, first_felony_point))
print("... w/in 2 mi?", is_in_radius(first_school_point, first_felony_point, 2))
print("... w/in 50 mi?",is_in_radius(first_school_point, first_felony_point, 50))

#### Part 2: Setting up a grid for NYC lat/long coords

The goal is to use a grid strategy to avoid searching the entire NYC Felony dataframe for each school location (which would take too long).

NOTES: The max lat of a school is ~40.9 and the distance between 40.9 and 40.95 is over 3 miles... but there are 7 crimes that fell under the jurisdiction of the NY Transit police whose locations are recorded north of 41 degrees (the farthest one is 500 miles away). The minimum longitude of a school is ~-74.24, which is around 3 miles from -74.3. There are 63 crimes that occurred west of -74.3. I suggest that we disregard these outliers for the purposes of our analysis.

In [None]:
# Initial exploration of ranges: extent of the school locations in degrees,
# and the corresponding north-south / east-west distances in miles.
max_lat = school_df.lat.max()
min_lat = school_df.lat.min()
max_long = school_df.long.max()
min_long = school_df.long.min()

# measure each extent along the middle of the other axis
mid_lat = 0.5*(max_lat + min_lat)
mid_long = 0.5*(max_long + min_long)

# vincenty takes (lat, long) points: the north-south span varies the latitude
# at the mid longitude, the east-west span varies the longitude at the mid
# latitude (the longitude must be the SECOND element of each point).
lat_dist = vincenty((min_lat, mid_long), (max_lat, mid_long), miles=True)
long_dist = vincenty((mid_lat, min_long), (mid_lat, max_long), miles=True)

print('Latitude ranges from {} to {} with a total distance of {}'.format(min_lat, max_lat, lat_dist))
print('Longitude ranges from {} to {} with a total distance of {}'.format(min_long, max_long, long_dist))

In [None]:
# Helper function to identify grid cell that contains a given point

# Grid geometry, computed once instead of on every call.
# Bounding box chosen from the data set:
_GRID_MIN_LAT, _GRID_MAX_LAT = 40.50, 40.95
_GRID_MIN_LONG, _GRID_MAX_LONG = -74.30, -73.45
# each range is divided into segments of a little over a mile
_GRID_DELTA_LAT = (_GRID_MAX_LAT - _GRID_MIN_LAT)/28
_GRID_DELTA_LONG = (_GRID_MAX_LONG - _GRID_MIN_LONG)/48
# Lower edges of the segments; index -1 adds one margin segment below the
# minimum, and the last edge is the maximum itself (the exclusive upper bound).
_GRID_LAT_EDGES = np.array([_GRID_MIN_LAT + idx*_GRID_DELTA_LAT for idx in range(-1,29)])
_GRID_LONG_EDGES = np.array([_GRID_MIN_LONG + idx*_GRID_DELTA_LONG for idx in range(-1,49)])


def nyc_grid(lat,long):
    """
    This function identifies the (roughly square mile) cell of NYC that
    contains the given latitude and longitude point.
    Latitude is cut into rows 0-28 and longitude into columns 0-48; row 0
    and column 0 are a one-segment margin just outside the minimum
    latitude/longitude. Cells are numbered row * 50 + column (so numbers
    run from 0 up to 1448) and they are unique to this analysis.
    INPUT: latitude and longitude in degrees.
    OUTPUT: the cell number, or NaN when the point lies outside the grid
            (including when a coordinate is NaN).
    """
    # number of edges <= value, minus one, is the segment holding the value
    row = np.searchsorted(_GRID_LAT_EDGES, lat, side='right') - 1
    col = np.searchsorted(_GRID_LONG_EDGES, long, side='right') - 1

    # below the margin segment, or at/after the last edge: out of bounds
    # (a NaN coordinate sorts after the last edge and lands here too)
    if row < 0 or row >= 29 or col < 0 or col >= 49:
        return np.nan
    return row * 50 + col

In [None]:
# Test an out of bound point
nyc_grid(40.653161, -76.862164)

In [None]:
# Test an in bound point
nyc_grid(40.821798, -73.886463)

#### Part 3: Function to find crimes w/in 1 mile of a school

STRATEGY: Identify the grid cell containing the school and then search only that cell and its immediately adjacent cells.

In [None]:
# helper function to get a list of adjacent cells
def get_adjacent(cell_num):
    """
    This function identifies a group of cells which together superset
    any points within a mile of any location in the original cell.
    INPUT: a cell number from the NYC grid (numbered row * 50 + column)
    OUTPUT: a list with the cell itself plus its adjacent and diagonal
            cell numbers, ordered by row then column. Neighbours that
            would fall off the edge of the grid are left out, and a
            missing (None/NaN) cell number gives an empty list.

    NOTE: the nyc_grid is designed so that all vadir school locations
    are in a cell that is not a border cell, so for schools the result
    is always the full 3 x 3 block of nine cells.
    """
    n_rows, n_cols = 30, 50

    # A point outside the grid has no cell (NaN) and therefore no neighbours.
    if cell_num is None or cell_num != cell_num:
        return []

    row, col = divmod(cell_num, n_cols)
    # Keep only rows/columns that exist: without this, a cell in the first
    # or last column would wrap around onto the neighbouring row.
    rows = [r for r in (row - 1, row, row + 1) if 0 <= r < n_rows]
    cols = [c for c in (col - 1, col, col + 1) if 0 <= c < n_cols]
    return [r * n_cols + c for r in rows for c in cols]
    

In [None]:
# Test grid adjacency
get_adjacent(52)

In [None]:
# helper function to get subset of felonies within a 1 mile radius
def get_local_crimes(location, felony_df):
    """
    This function identifies crimes no more than 1 mi
    from a single school (or GPS location).
    INPUT: location ((lat, long) GPS point), felony_df (w/ 'lat', 'long'
        and an already populated 'NYC_grid' column)
    OUTPUT: felony_df subset of crimes that
        occurred within one mile of school.
    """
    # Coarse filter: only crimes in the school's grid cell or a neighbour
    cells_to_search = get_adjacent(nyc_grid(*location))
    crimes = felony_df.loc[felony_df.NYC_grid.isin(cells_to_search)]
    if crimes.empty:
        return crimes

    # Fine filter: exact distance check on the remaining candidates.
    # Iterating the two coordinate columns avoids building a row object
    # per crime, as a row-wise DataFrame.apply would.
    in_radius = [is_in_radius(location, point, 1)
                 for point in zip(crimes.lat, crimes.long)]
    return crimes[np.array(in_radius, dtype=bool)]

In [None]:
# test function on a single school / no crimes
this_school = school_df[school_df['BEDS Code'] == 307500014256]
location = (this_school.lat.mean(), this_school.long.mean())
result = get_local_crimes(location, felony_df)

In [None]:
# now testing on a school with crimes
this_school = school_df[school_df['BEDS Code'] == 307500012017]
location = (this_school.lat.mean(), this_school.long.mean())
result = get_local_crimes(location, felony_df)

#### Part 4: Crime counting function to search only within adjacent cells of the school

NOTE: loading grid cell numbers for the felony data set takes 3-4 minutes and only needs to be done once. To skip that step after you've already run this function, simply set the optional parameter skip_gridsetup to True (it defaults to False).

In [None]:
def tally_crime_by_loc(school_df, felony_df, skip_gridsetup = False):
    """
    Function to tally and store information about felonies
    that occur w/in one mile of each school in the school_df.
    INPUT: school df w/ cols 'BEDS Code', 'lat', 'long', and 'School Year'
           felony df w/ cols 'School Year', 'lat', 'long', 'Offense', and
           'Identifier' (plus 'NYC_grid' when skip_gridsetup is True)
           skip_gridsetup (optional bool): True skips the computation of
           the felony 'NYC_grid' column.
    OUTPUT: n/a, modifies school data: adds a 'CrimeIDS' column (string
            listing the felony identifiers) and one count column per
            felony type plus a total. Rows without a latitude keep NaN.
            Unless skip_gridsetup is True the felony data is modified
            too: missing lat/long are set to 0 and 'NYC_grid' is added.
    """
    # new school column -> 'Offense' label it counts
    offense_columns = {
        'Grand Larceny w/in 1mi': 'GRAND LARCENY',
        'Robbery w/in 1mi': 'ROBBERY',
        'Burglary w/in 1mi': 'BURGLARY',
        'Assault w/in 1mi': 'FELONY ASSAULT',
        'Auto Theft w/in 1mi': 'GRAND LARCENY OF MOTOR VEHICLE',
        'Rape w/in 1mi': 'RAPE',
        'Murders w/in 1mi': 'MURDER & NON-NEGL. MANSLAUGHTE',
    }

    if not skip_gridsetup:
        # prepare felony dataframe by adding a column for nyc_grid cell number.
        # Assign the filled columns back rather than using fillna(inplace=True)
        # on a column selection, which is not guaranteed to reach the parent.
        felony_df['lat'] = felony_df['lat'].fillna(0)
        felony_df['long'] = felony_df['long'].fillna(0)
        felony_df['NYC_grid'] = [nyc_grid(lat, lon) for lat, lon
                                 in zip(felony_df['lat'], felony_df['long'])]

    # Initialize new columns in school data frame (explicit dtypes: the id
    # column will hold strings, the tallies are numeric)
    school_df['CrimeIDS'] = pd.Series(dtype=object)
    school_df['Total Felonies w/in 1mi'] = np.nan
    for column in offense_columns:
        school_df[column] = np.nan

    # Group schools (unique location for each BEDS Code)
    grouped = school_df[school_df['lat'].notnull()].groupby('BEDS Code')

    # Loop through schools, subset crime by location.
    for beds, records in grouped:
        # NOTE: the coordinates should all be the same so the mean is just the location
        assert records['lat'].nunique() == 1, 'ERROR: multiple latitudes for this school.'
        location = (records['lat'].mean(), records['long'].mean())
        local_crimes = get_local_crimes(location, felony_df)

        # tally and store felonies for each year
        for year in records['School Year'].unique():
            subset = local_crimes[local_crimes['School Year'] == year]
            idxs = records.index[records['School Year'] == year]
            school_df.loc[idxs, 'CrimeIDS'] = str(subset['Identifier'].unique().tolist())
            school_df.loc[idxs, 'Total Felonies w/in 1mi'] = len(subset)
            # one pass over the offenses instead of one scan per felony type
            offense_counts = subset['Offense'].value_counts()
            for column, offense in offense_columns.items():
                school_df.loc[idxs, column] = offense_counts.get(offense, 0)

    print('... tallied felonies w/in 1 mile of each school.')

In [None]:
# WARNING ... this cell takes around an hour and a half to run (~5 seconds per school)
# Try importing: 'vadir_with_felonycounts.csv' for a faster option

# run function (adds the felony tally columns to school_df in place)
tally_crime_by_loc(school_df, felony_df)


In [None]:
school_df.to_csv('vadir_with_felonycounts.csv')

## F. Exploring the data

In [None]:
# Shortcut to data - uncomment to run.
#school_df = pd.read_csv('vadir_with_felonycounts.csv')

In [None]:
# Mean nearby robberies and mean school incidents per county.
# Several columns must be selected from a GroupBy with a list (double
# brackets); a bare tuple of names is rejected by current pandas.
school_df.groupby('County')[['Robbery w/in 1mi','Total Incidents']].mean().plot(kind='bar', legend=True)

In [None]:
school_df.plot(x='Total Incidents', y='Total Felonies w/in 1mi', kind='scatter')

In [None]:
school_df.plot(x='Total Incidents' , y='Burglary w/in 1mi', kind='scatter')

In [None]:
school_df.plot(x='Total Incidents' , y='Assault w/in 1mi', kind='scatter')

In [None]:
school_df.plot(x='Robbery w/in 1mi' , y='Total Felonies w/in 1mi', kind='scatter')

In [None]:
school_df.plot(x='Burglary w/in 1mi' , y='Total Felonies w/in 1mi', kind='scatter')

In [None]:
school_df.plot(x='Grand Larceny w/in 1mi' , y='Total Felonies w/in 1mi', kind='scatter')