# Step 2:
# Create Unique School and Daycare IDs and Assign Kids to Schools and Daycares

* Kids with age [0-3] go to Daycares
* Kids with age [4-11] go to Elementary school
* Kids with age [12-13] go to Middle school
* Kids with age [14-17] go to High school

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import random
import timeit
#from shapely.ops import cascaded_union
import multiprocessing
from math import sin, cos, sqrt, atan2, radians

## 1, Load Data

In [None]:
# Census tract boundaries (the file name indicates Erie, WGS84).
# All columns are read here; the GEOID10/geometry subset is selected later,
# right before the spatial join.
tract = gpd.read_file('data/Erie_Tract_WGS84.shp')#.loc[:,['GEOID10','geometry']]
tract.head()

In [None]:
# Population (results from Step 1)
# .iloc[:, 1:] discards the first CSV column -- presumably the index column
# written out by Step 1; verify against that step's output.
pop = pd.read_csv('results/erie_population.csv').iloc[:,1:]
pop.head()

In [None]:
# (number of rows, number of columns) of the population table
pop.shape

In [None]:
# School sites, read from the schools shapefile
school_gdf = gpd.read_file('data/Education_site/schools_NY.shp')
school_gdf.head()

In [None]:
# Daycare sites, read from the daycare shapefile
daycare = gpd.read_file('data/Education_site/Daycare_NY.shp')
daycare.head()

## 2, Data Preprocess
Assign Unique IDs for Schools and Daycares

### 2.1 Change to Spatial Data

In [None]:
# Turn a table with LATITUDE/LONGITUDE columns into a WGS84 point GeoDataFrame
def edu_to_gpd(data):
    """Build a point GeoDataFrame from LONGITUDE/LATITUDE columns.

    Parameters
    ----------
    data : pandas.DataFrame or geopandas.GeoDataFrame
        Table containing ``LONGITUDE`` and ``LATITUDE`` columns whose values
        can be converted to float.

    Returns
    -------
    geopandas.GeoDataFrame
        A copy of ``data`` whose ``geometry`` column holds one point per row
        (x = longitude, y = latitude), with CRS EPSG:4326 (WGS84).
        The input table is not modified.
    """
    # Work on a plain-DataFrame copy: the caller's table stays untouched and
    # any geometry/CRS already attached to the input cannot conflict with the
    # CRS declared below.
    frame = pd.DataFrame(data).copy()

    # Vectorised point construction instead of a row-by-row apply.
    frame['geometry'] = gpd.points_from_xy(
        frame['LONGITUDE'].astype(float),
        frame['LATITUDE'].astype(float),
    )

    # "EPSG:4326" replaces the deprecated {'init': 'epsg:4326'} dictionary form.
    return gpd.GeoDataFrame(frame, geometry='geometry', crs='EPSG:4326')

In [None]:
# Convert the daycare and school tables to WGS84 point GeoDataFrames.
# The converter defined in this notebook is edu_to_gpd; the previously used
# name to_GPD is not defined in the visible cells and raised a NameError.
gdc = edu_to_gpd(daycare)
gsch = edu_to_gpd(school_gdf)

In [None]:
# Preview the converted daycare table
gdc.head()

In [None]:
# Preview the converted school table
gsch.head()

In [None]:
#gsch.to_file('where you save files')

### 2.2 Create Unique ID for Education Site
Locate the schools and daycares to assign unique IDs

In [None]:
# Declare the census tracts as WGS84 (EPSG:4326) without reprojecting.
# set_crs(..., allow_override=True) replaces the deprecated attribute
# assignment with the {'init': 'epsg:4326'} dictionary form.
tract = tract.set_crs('EPSG:4326', allow_override=True)

In [None]:
# Check the CRS of the school layer (it must match the tract layer for the spatial join)
gsch.crs

In [None]:
# Check the CRS of the daycare layer (it must match the tract layer for the spatial join)
gdc.crs

In [None]:
def join_edu_data(data1, data2, col_name, type):
    """Spatially join education sites to census tracts and build unique site IDs.

    Each site that intersects a tract receives an ID of the form
    ``<GEOID10><suffix><n>``, where the suffix is ``s`` for schools and ``d``
    for daycares, and ``n`` is the 0-based position of the site within its
    tract.

    Parameters
    ----------
    data1 : geopandas.GeoDataFrame
        Education sites (point geometries).
    data2 : geopandas.GeoDataFrame
        Census tracts; must contain a ``GEOID10`` column.
    col_name : str
        Name of the column that receives the generated ID.
    type : str
        ``"School"`` or ``"Daycare"``. (The name shadows the builtin but is
        kept so existing calls keep working.)

    Returns
    -------
    pandas.DataFrame
        The joined rows ordered tract by tract with a fresh 0..n-1 index,
        including the helper columns ``temp`` and ``count`` and the new
        ``col_name`` column.

    Raises
    ------
    ValueError
        If ``type`` is neither ``"School"`` nor ``"Daycare"``.
    """
    suffix_by_type = {"School": "s", "Daycare": "d"}
    if type not in suffix_by_type:
        # Previously an unknown type silently returned None.
        raise ValueError(
            "type must be 'School' or 'Daycare', got %r" % (type,)
        )

    # 'intersects' is sjoin's default predicate, so it is left implicit; this
    # avoids the op= / predicate= keyword difference between geopandas versions.
    joined = gpd.sjoin(data1, data2, how="inner")

    # Order rows tract by tract; the stable sort keeps the original site order
    # inside each tract, exactly as concatenating the groups one by one would.
    joined = (
        pd.DataFrame(joined)
        .sort_values("GEOID10", kind="mergesort")
        .reset_index(drop=True)
    )

    # Tract ID followed by the site-type suffix, e.g. "<GEOID10>s".
    joined["temp"] = joined["GEOID10"].astype(str) + suffix_by_type[type]

    # 0-based running number of the site within its census tract.
    joined["count"] = joined.groupby("GEOID10").cumcount()

    joined[col_name] = joined["temp"] + joined["count"].astype(str)

    return joined

### 2.3 Apply Spatial Join

In [None]:
# School: keep only the tract ID and geometry columns, then spatially join the
# schools to the tracts so each school gets a tract-based ID in the "SchID" column
census = tract.loc[:,['GEOID10','geometry']]
sch_df = join_edu_data(gsch, census, "SchID", "School")
sch_df.head()

In [None]:
# List the columns produced by the join (helper columns are dropped in the next cell)
sch_df.columns

In [None]:
# Remove the helper columns left over from the spatial join / ID construction
sch_df = sch_df.drop(columns=['index_right', 'GEOID10', 'temp', 'count'], axis=0)
# 'current' starts at 0; find_eduid compares it against ENROLLMENT to decide
# whether a site still has room
sch_df['current'] = 0
sch_df.head()

In [None]:
# join_edu_data returns a plain DataFrame; convert back to a GeoDataFrame.
# (edu_to_gpd is the converter defined in this notebook; to_GPD is undefined.)
sch_df = edu_to_gpd(sch_df)

In [None]:
# Confirm what kind of table sch_df is after the conversion above
type(sch_df)

### Daycare ID

In [None]:
# Daycare: same procedure as for schools -- join daycares to tracts and build
# a tract-based ID (also stored in the "SchID" column)
census = tract.loc[:,['GEOID10','geometry']]
dc_df = join_edu_data(gdc, census, "SchID", "Daycare")
dc_df.head()

In [None]:
# Remove the helper columns left over from the spatial join / ID construction
dc_df = dc_df.drop(columns=['index_right', 'GEOID10', 'temp', 'count'], axis=0)
# 'current' starts at 0; find_eduid compares it against ENROLLMENT to decide
# whether a site still has room
dc_df['current'] = 0
dc_df.head()

In [None]:
# Rename POPULATION to ENROLLMENT so the daycare table exposes the capacity
# column under the name that find_eduid reads
dc_df = dc_df.rename(columns={"POPULATION": "ENROLLMENT"})
dc_df.head()

In [None]:
# join_edu_data returns a plain DataFrame; convert back to a GeoDataFrame.
# (edu_to_gpd is the converter defined in this notebook; to_GPD is undefined.)
dc_df = edu_to_gpd(dc_df)
dc_df.head()

## 3, Assign Daycare ID and School ID

### 3.1 Function to calculate the great circle distance
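We define our own great-circle (haversine) distance function because the distance function from the geopandas package did not give us the right result: it computes planar distances in the units of the layer's CRS, which for WGS84 coordinates means degrees rather than kilometres.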

If yours works fine, you don't have to do this.

In [None]:
def new_distance(x1, y1, x2, y2, radius=6373.0):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    x1, y1 : float
        Latitude and longitude of the first point, in decimal degrees.
    x2, y2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere. The default is the approximate radius of the
        earth in kilometres, so the result is in kilometres.

    Returns
    -------
    float
        Distance along the sphere's surface, in the units of ``radius``.
    """
    lat1, long1, lat2, long2 = (radians(v) for v in (x1, y1, x2, y2))

    dlat = lat2 - lat1
    dlon = long2 - long1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt(1 - a) raise.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c

### 3.2 Assign School and DaycareID function

In [None]:
# Convert the population table to a GeoDataFrame and plot it.
# to_geo_df comes from the project's src.tools module; its implementation is
# not shown in this notebook -- presumably it builds point geometry from the
# lat/long columns; verify.
from src.tools import to_geo_df
g_pop = to_geo_df(pop)
g_pop.plot()

In [None]:
# Number of individuals in the geo-enabled population table
len(g_pop)

In [None]:
# Plot the population points (same plot as in the conversion cell above)
g_pop.plot()

In [None]:
# Tract geometries indexed by GEOID10.
# NOTE(review): this re-indexed `census` is not referenced by the functions
# defined below in the visible notebook -- confirm whether it is still needed.
census = tract.loc[:,['GEOID10','geometry']].set_index('GEOID10')

def assign_school(data, school, daycare):
    """Pick the education-site ID for one person based on age.

    Parameters
    ----------
    data : row-like
        One person, exposing ``age``, ``lat``, ``long``, ``geometry`` and
        ``wp`` as attributes.
    school : table of school sites, forwarded to ``find_eduid``.
    daycare : table of daycare sites, forwarded to ``find_eduid``.

    Returns
    -------
    The value returned by ``find_eduid`` for people younger than 18
    (daycares for ages below 4, schools for ages 4 through 17); otherwise the
    person's existing ``wp`` value, unchanged.
    """
    age = data.age

    if age < 4:
        # Youngest children are matched against daycares.
        sites = daycare
    elif age <= 17:
        # Ages 4 through 17 are matched against schools.
        sites = school
    else:
        # Everyone else keeps the value already stored in wp.
        return data.wp

    return find_eduid(data.lat, data.long, data.geometry, sites)

In [None]:
def find_eduid(x, y, position, edu_site, search_radius=0.08):
    """Choose an education site for one child and register the enrollment.

    Sites whose geometry intersects a buffer around the child's position are
    ranked by great-circle distance; the nearest one that still has room
    (``current`` < ``ENROLLMENT``) is chosen. If every nearby site is full, a
    random nearby site is chosen instead.

    Parameters
    ----------
    x, y : float
        Latitude and longitude of the child, in decimal degrees.
    position : shapely geometry
        The child's location; buffered to find candidate sites.
    edu_site : geopandas.GeoDataFrame
        Candidate sites with ``LATITUDE``, ``LONGITUDE``, ``SchID``,
        ``current`` and ``ENROLLMENT`` columns and a unique index.
    search_radius : float, optional
        Buffer distance in the units of the layer's coordinates.

    Returns
    -------
    The ``SchID`` of the chosen site, or ``None`` when ``edu_site`` is empty.

    Notes
    -----
    ``edu_site`` is modified in place: the chosen site's ``current`` counter
    is incremented so that later calls see the updated occupancy. Pass a copy
    if the counters must not change.
    """
    nearby = edu_site[edu_site.intersects(position.buffer(search_radius))]
    if nearby.empty:
        # Nothing inside the buffer: consider every site rather than failing
        # on an empty candidate list.
        nearby = edu_site
    if nearby.empty:
        return None

    # Distance from the child to each candidate site, nearest first.
    distances = pd.Series(
        [
            new_distance(x, y, float(lat), float(lon))
            for lat, lon in zip(nearby['LATITUDE'], nearby['LONGITUDE'])
        ],
        index=nearby.index,
    ).sort_values()

    # Candidates (in nearest-first order) that have not reached capacity.
    ranked = nearby.loc[distances.index]
    has_room = (ranked['current'] < ranked['ENROLLMENT']).to_numpy()
    open_sites = distances.index[has_room]

    if len(open_sites) > 0:
        chosen = open_sites[0]
    else:
        # All nearby sites are full: pick one at random. Choosing from a plain
        # list of index labels avoids positional lookups on a label-indexed
        # Series inside random.choice.
        chosen = random.choice(nearby.index.tolist())

    # Record the enrollment (this statement used to be unreachable and
    # referenced undefined names, so capacity was never consumed).
    edu_site.loc[chosen, 'current'] += 1

    return edu_site.loc[chosen, 'SchID']

In [None]:
# Take the first 100 people as a small sample for a trial run and list the
# distinct ages it contains
test = g_pop[:100].copy()
test.age.unique()

In [None]:
#test = g_pop[:20].copy()
#test['tract'] = test['hhold'].str[0:11]
# Trial run on the sample: each row gets the ID returned by assign_school
# (a site ID for children, the existing wp value otherwise)
test['wp'] = test.apply(assign_school, args=(sch_df, dc_df), axis=1)
test

In [None]:
# Distinct wp values assigned in the trial run
test.wp.unique()

### 3.3 Apply Assign School ID function

In [None]:
print('Start running...')
# Start the full run with empty sites so that nothing done earlier in the
# notebook counts against a site's capacity.
sch_df['current'] = 0
dc_df['current'] = 0
#set timer
start_time = timeit.default_timer()
# Apply to the whole population. (Applying to the 100-row `test` sample here
# would leave every other row of g_pop['wp'] as NaN.)
g_pop['wp'] = g_pop.apply(assign_school, args=(sch_df, dc_df), axis=1)
elapsed = timeit.default_timer() - start_time
print("Total Time(s):", elapsed)
print('End program')

In [None]:
# Number of distinct wp values across the whole population after assignment
len(g_pop.wp.unique())