# Step 2:
# Create Unique School and Daycare IDs and Assign Kids to Schools and Daycares

* Kids with age [0-3] go to Daycares
* Kids with age [4-11] go to Elementary school
* Kids with age [12-13] go to Middle school
* Kids with age [14-17] go to High school

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import random
import timeit
#from shapely.ops import cascaded_union
import multiprocessing
from math import sin, cos, sqrt, atan2, radians

## 1, Load Data

In [None]:
# Read data
# Daycare sites. The LONGITUDE and LATITUDE columns of this table are read
# later by to_GPD() to build point geometries.
daycare = pd.read_csv('../../Model_P/Education_ID_RID/Edu_Daycare_May.csv')

In [None]:
# School sites. Besides LONGITUDE/LATITUDE (used by to_GPD()), assign_school()
# later reads the S_age, E_age, current, ENROLLMENT and scolID columns of the
# GeoDataFrame derived from this table.
school = pd.read_csv('../../Model_P/Education_ID_RID/Edu_School_time_May.csv')

In [None]:
# Census tract boundaries
# NOTE: the column filter at the end of the line is commented out, so every
# column of the shapefile is loaded here; the GEOID10/geometry subset is taken
# afterwards when `census` is built.
tract = gpd.read_file('../Data/Census_Cleaned.shp')#.loc[:,['GEOID10','geometry']]

In [None]:
# Population (results from Step 1)
# Later cells read the age, hhold, lat, long and wp columns of this table.
pop = pd.read_csv('../../Model_P/pop_full_Aug_Richard.csv')

In [None]:
# Keep only the tract ID and its geometry; this is the right-hand table of the
# spatial joins in join_edu_data().
census = tract.loc[:,['GEOID10','geometry']]

## 2, Data Preprocess
Assign Unique IDs for Schools and Daycares

### 2.1 Change to Spatial Data

In [None]:
# Turn a table with LONGITUDE/LATITUDE columns into a GeoDataFrame of points
def to_GPD(data):
    """Return ``data`` as a GeoDataFrame with one point geometry per row.

    A 'geometry' column is written onto ``data`` itself before the
    GeoDataFrame is built, so the caller's table gains that column too.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with LONGITUDE and LATITUDE columns convertible to float.

    Returns
    -------
    geopandas.GeoDataFrame
        The same rows with a 'geometry' column, labelled as EPSG:4326 (WGS84).
    """
    from shapely.geometry import Point

    def _site_point(row):
        # shapely points are (x, y), i.e. (longitude, latitude)
        return Point((float(row.LONGITUDE), float(row.LATITUDE)))

    data['geometry'] = data.apply(_site_point, axis=1)
    geo_data = gpd.GeoDataFrame(data, geometry='geometry')

    # WGS84 coordinate system
    # NOTE(review): the {'init': ...} dict form is the old proj4 spelling and
    # newer geopandas/pyproj releases may warn about it -- confirm against the
    # installed version before changing it.
    geo_data.crs = {'init' :'epsg:4326'}

    return geo_data

In [None]:
# Label the census tracts as WGS84 (EPSG:4326)
# NOTE(review): this only assigns the crs attribute on `tract`; `census` was
# sliced from `tract` in an earlier cell, so presumably it does not pick up
# this assignment -- confirm before relying on matching CRS in the joins.
tract.crs = {'init' :'epsg:4326'}
# Daycare table -> GeoDataFrame of points
gdc = to_GPD(daycare)
# School table -> GeoDataFrame of points
gsch = to_GPD(school)

In [None]:
# Save the school GeoDataFrame to disk. The path below is a placeholder string;
# replace it with a real output location before running this cell.
gsch.to_file('where you save files')

### 2.2 Create Unique ID for Education Site
Locate the schools and daycares to assign unique IDs

In [None]:
def join_edu_data(data1, data2, col_name, suffix='s'):
    """Join education sites to census tracts and give each site a unique ID.

    Every site in ``data1`` is matched to the tract(s) in ``data2`` whose
    geometry it intersects. Sites are then numbered 0, 1, 2, ... inside each
    tract and the ID ``<GEOID10><suffix><number>`` is written to ``col_name``.

    Parameters
    ----------
    data1 : geopandas.GeoDataFrame
        Education sites (left side of the spatial join).
    data2 : geopandas.GeoDataFrame
        Census tracts; must provide a 'GEOID10' column and a geometry.
    col_name : str
        Name of the column that receives the generated ID.
    suffix : str, default 's'
        Marker placed between the tract ID and the running number. The
        default keeps the previous hard-coded behaviour; pass e.g. 'd' for
        daycares so their IDs cannot collide with school IDs.

    Returns
    -------
    pandas.DataFrame
        Joined rows ordered by GEOID10 with a fresh 0..n-1 index, the helper
        columns 'temp' (tract ID + suffix) and 'count' (number within the
        tract), and the ID column ``col_name``. An empty join yields an empty
        frame instead of raising.
    """
    # Spatial join. Newer geopandas names the keyword `predicate`; older
    # releases only accept `op` and reject the new name with a TypeError.
    try:
        sjdf = gpd.sjoin(data1, data2, how="inner", predicate="intersects")
    except TypeError:
        sjdf = gpd.sjoin(data1, data2, how="inner", op="intersects")

    # groupby() skips rows whose key is missing; drop them explicitly so the
    # numbering below sees exactly the rows the per-group loop used to see.
    sjdf = sjdf.dropna(subset=['GEOID10'])

    # A stable sort keeps the sites of one tract in their joined order, which
    # is what iterating over the sorted groups and appending them produced.
    ordered = sjdf.sort_values('GEOID10', kind='mergesort')
    wp = pd.DataFrame(ordered).reset_index(drop=True)

    # Tract ID plus the site-type marker
    wp['temp'] = wp['GEOID10'].astype(str) + suffix
    # Running number of the site within its tract (0-based)
    wp['count'] = wp.groupby('GEOID10').cumcount()
    # Final unique ID
    wp[col_name] = wp['temp'] + wp['count'].astype(str)

    return wp

### 2.3 Apply Spatial Join

In [None]:
# Schools: join to tracts and create the "SchID" column
sch_df = join_edu_data(gsch, census, "SchID")
# Daycares: join to tracts and create the "dcID" column
# NOTE(review): called this way, join_edu_data() tags the IDs with the same
# 's' marker it uses for schools -- confirm whether daycare IDs should carry a
# different marker.
dc_df = join_edu_data(gdc, census, "dcID")

### 2.4 Get Kids from Population
* All kids (go to daycares and school)
* Find the kids that don't go to daycare (check their parents: if their parents' workplace is their home, let these kids stay at home)

In [None]:
# School kids: ages 4 to 17 inclusive (copied so later column writes do not
# touch `pop`)
sch_k = pop [(pop.age >= 4) & (pop.age <= 17)].copy()
print(len(sch_k), " kids go to school...")
# Daycare-age kids: ages 0 to 3 inclusive (the filter is age <= 3)
dc_k = pop [pop.age <=3 ].copy()
print(len(dc_k), " kids under age of 3...")
# Adults: age 18 and above; check_parents() looks household members up here
adults = pop[pop.age >= 18].copy()
print(len(adults), " adults...")

In [None]:
def check_parents(kid):
    """Tell whether a kid goes to daycare, based on the adults at home.

    Looks up every adult sharing the kid's household (``hhold``) in the
    module-level ``adults`` table and inspects their workplace column ``wp``.

    Parameters
    ----------
    kid : pandas.Series
        One row of the kids table; only ``hhold`` is read.

    Returns
    -------
    bool
        False when no adult of the household has a workplace (all ``wp``
        values are null, or the household has no adults at all), meaning the
        kid stays at home; True otherwise, meaning the kid goes to daycare.
    """
    # The adults table is the global `adults`; the previous spelling `adult`
    # was never defined and raised NameError on the first call.
    parents = adults.loc[adults.hhold == kid.hhold]

    # A null wp means that adult stays at home and can look after the kid
    return not parents.wp.isnull().all()
    
def check(kid):
    """Add a 'Stay' column holding check_parents() for every row of ``kid``.

    The column is written onto ``kid`` itself and the same frame is returned.
    Note that, as check_parents() is written, a True value marks a kid who
    goes to daycare and False one who stays at home.
    """
    daycare_flags = kid.apply(check_parents, axis=1)
    kid['Stay'] = daycare_flags
    return kid

In [None]:
# Apply the check-parents function to the daycare-age kids in parallel.
# The helper defined in this notebook is `parallelize` (section 2.5); the name
# `parallelize_kid` used here before is not defined anywhere in this file, so
# run the cell that defines `parallelize` before this one.
# NOTE(review): the result still contains every daycare-age kid; it is only
# flagged through the 'Stay' column, not filtered -- confirm this is intended.
go_dc_k = parallelize(dc_k, check)

### 2.4 Convert Kids to Geo dataFrame

In [None]:
# Convert the kids table into a GeoDataFrame of home locations
def kids_to_GPD(data):
    """Return ``data`` as a GeoDataFrame with a point built from long/lat.

    Prints the distinct ages found in ``data`` and the shape of the result.
    The 'geometry' column is written onto ``data`` itself before the
    GeoDataFrame is created. No CRS is assigned here.

    Parameters
    ----------
    data : pandas.DataFrame
        Kids table with ``age``, ``long`` and ``lat`` columns.

    Returns
    -------
    geopandas.GeoDataFrame
    """
    print("Ele AGE Unique: ", data.age.unique())
    from shapely.geometry import Point

    def _home_point(row):
        # shapely points are (x, y), i.e. (longitude, latitude)
        return Point((float(row.long), float(row.lat)))

    data['geometry'] = data.apply(_home_point, axis=1)
    kids_geo = gpd.GeoDataFrame(data, geometry='geometry')
    print(kids_geo.shape)
    return kids_geo

In [None]:
# Apply the conversion function
# Daycare-age kids
g_go_dc_k = kids_to_GPD(go_dc_k)
# School-age kids
g_sch_k = kids_to_GPD(sch_k)
# Add a 'tract' column naming the census tract the kid lives in: the first 11
# characters of the household ID. assign_school() uses it as the lookup key
# into the tract table indexed by GEOID10.
g_sch_k['tract'] = g_sch_k['hhold'].str[0:11]

### 2.5 Parallelize function
Apply multiprocessing to save time; this helper is used multiple times in this code.

In [None]:
def parallelize(kid, func, num_cores=None):
    """Apply ``func`` to row-wise chunks of ``kid`` in worker processes.

    ``kid`` is cut into ``num_cores`` contiguous, near-equal chunks; each
    chunk is passed to ``func`` in a separate process and the returned frames
    are concatenated in chunk order.

    Parameters
    ----------
    kid : pandas.DataFrame
        Table to process.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame. It is sent to
        the worker processes, so it must be picklable (e.g. a module-level
        function).
    num_cores : int, optional
        Number of worker processes and of chunks. Defaults to all CPUs but
        one, and never less than one.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk results.
    """
    if num_cores is None:
        # Leave one core free, but a single-core machine still needs a worker
        num_cores = max(1, multiprocessing.cpu_count() - 1)
    num_partitions = num_cores

    # Split row positions rather than the frame itself, so every chunk is
    # guaranteed to stay a DataFrame whatever the pandas/numpy versions are.
    position_chunks = np.array_split(np.arange(len(kid)), num_partitions)
    kid_split = [kid.iloc[positions] for positions in position_chunks]

    # The context manager shuts the workers down on the way out; the previous
    # close()/join() calls sat after `return` and never ran.
    with multiprocessing.Pool(num_cores) as pool:
        results = pool.map(func, kid_split)

    return pd.concat(results)

## 3, Assign Daycare ID and School ID

### 3.1 Function to calculate the great circle distance
We created this function because the distance function from the geopandas package was not giving us the right result. To fix the issue, we wrote a new distance function.

If yours works fine, you don't have to do this.

In [None]:
def new_distance(x1, y1, x2, y2):
    """Return the great-circle distance between two points in kilometres.

    Uses the haversine formula with an earth radius of 6373 km.

    Mind the argument order: the ``x`` arguments are latitudes and the ``y``
    arguments are longitudes, both in decimal degrees.

    Parameters
    ----------
    x1, y1 : float
        Latitude and longitude of the first point.
    x2, y2 : float
        Latitude and longitude of the second point.

    Returns
    -------
    float
        Distance in kilometres.
    """
    # approximate radius of earth in km
    R = 6373.0

    lat1, long1 = radians(x1), radians(y1)
    lat2, long2 = radians(x2), radians(y2)

    dlon = long2 - long1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) raise ValueError; clamp it.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

### 3.2 Assign School and DaycareID function

In [None]:
# Rebuild `census`, this time indexed by GEOID10, so assign_school() can look a
# tract geometry up by tract ID. This replaces the un-indexed `census` defined
# earlier and used by the spatial joins.
census = tract.loc[:,['GEOID10','geometry']].set_index('GEOID10')

def assign_school(data, census, school):
    """Choose a school for one kid and return that school's ID.

    Candidate schools are those whose age range covers the kid's age and
    whose geometry intersects the kid's home tract buffered by 0.08. The
    nearest candidate that still has room (``current`` < ``ENROLLMENT``) is
    chosen; if every candidate is full, one of them is picked at random.
    The chosen school's ``current`` counter in ``school`` is increased by one.

    Parameters
    ----------
    data : pandas.Series
        One row of the kids table; reads ``lat``, ``long``, ``age``, ``tract``.
    census : geopandas.GeoDataFrame
        Tract table indexed by tract ID with a 'geometry' column.
    school : geopandas.GeoDataFrame
        School table with S_age, E_age, LATITUDE, LONGITUDE, scolID, current
        and ENROLLMENT columns. Modified in place (``current``).

    Returns
    -------
    The ``scolID`` value of the chosen school, or None when no school of the
    right age range lies within the buffered tract.

    Notes
    -----
    NOTE(review): when this runs inside worker processes, each worker
    presumably updates its own copy of ``school``, so enrolment counts are not
    shared between workers -- confirm whether that is acceptable.
    """
    # Kid's home coordinates and age
    kid_lat = data.lat
    kid_long = data.long
    kid_age = data.age

    # Geometry of the kid's home tract, enlarged by 0.08 in the units of the
    # geometry's coordinates (presumably degrees, given the WGS84 label).
    tract_geometry = census.loc[data.tract, 'geometry']
    buff = tract_geometry.buffer(0.08)

    # Schools serving this age, then those inside the search area
    sch_age = school[(school['S_age'] <= kid_age) & (school['E_age'] >= kid_age)]
    s_in = sch_age[sch_age.intersects(buff)]

    # No candidate at all: random.choice() on an empty sequence would raise,
    # so report "no school" instead of aborting the whole apply().
    if s_in.empty:
        return None

    # Distance from the kid to every candidate school
    dist = [
        new_distance(kid_lat, kid_long, sch_lat, sch_long)
        for sch_lat, sch_long in zip(s_in['LATITUDE'], s_in['LONGITUDE'])
    ]

    # Candidate IDs with their distance, nearest first; the index is still the
    # row label of the school in `school`.
    df_sch_in = pd.DataFrame(
        {'scolID': s_in['scolID'], 'Dist': dist}
    ).sort_values(by='Dist')

    # Candidates that still have room, nearest first
    has_room = [
        s for s in df_sch_in.index
        if school.loc[s, 'current'] < school.loc[s, 'ENROLLMENT']
    ]

    if has_room:
        chosen = has_room[0]
    else:
        # Every candidate is full: pick one at random. Choose among the row
        # labels as a plain list -- random.choice() on a pandas Series indexes
        # it by label and fails unless the labels happen to be 0..n-1.
        chosen = random.choice(df_sch_in.index.tolist())

    # Record the enrolment. This used to sit after the return statements (and
    # referenced an undefined name), so the counters were never updated.
    school.loc[chosen, 'current'] += 1

    return df_sch_in.loc[chosen, 'scolID']
    
#Test
#test = g_sch_k.iloc[:100,1:]
#test['tract'] = test['hhold'].str[0:11]
#test['wp'] = test.apply(assign_school,args=(census, gsch),axis=1)
#test

### 3.3 Apply Assign School ID function

In [None]:
def assign_ID(data):
    """Assign a school to every kid in ``data`` and store it in column 'wp'.

    Runs assign_school() on each row, using the module-level ``census`` and
    ``gsch`` tables, writes the result onto ``data`` itself and returns it.

    Parameters
    ----------
    data : pandas.DataFrame
        Chunk of the school-kids table.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the added 'wp' column.
    """
    # Write to the chunk that was passed in; the previous target `test` only
    # exists in commented-out code and raised NameError.
    data['wp'] = data.apply(assign_school, args=(census, gsch), axis=1)
    return data

def parallelize(data, func, num_cores=7):
    """Apply ``func`` to row-wise chunks of ``data`` in worker processes.

    ``data`` is cut into ``num_cores`` contiguous, near-equal chunks; each
    chunk is passed to ``func`` in a separate process and the returned frames
    are concatenated in chunk order.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to process.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame. It is sent to
        the worker processes, so it must be picklable (e.g. a module-level
        function).
    num_cores : int, default 7
        Number of worker processes and of chunks.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk results.
    """
    num_partitions = num_cores

    # Split row positions rather than the frame itself, so every chunk is
    # guaranteed to stay a DataFrame whatever the pandas/numpy versions are.
    position_chunks = np.array_split(np.arange(len(data)), num_partitions)
    data_split = [data.iloc[positions] for positions in position_chunks]

    # The context manager shuts the workers down on the way out; the previous
    # close()/join() calls sat after `return` and never ran.
    with multiprocessing.Pool(num_cores) as pool:
        results = pool.map(func, data_split)

    return pd.concat(results)
    
# Driver: assign a school to every school-age kid in parallel and time the run
if __name__=='__main__':
    print('Start running...')
    # Start the timer
    start_time = timeit.default_timer()

    # Tract ID = first 11 characters of the household ID (the same column is
    # already set right after g_sch_k is created; this repeats it)
    g_sch_k['tract'] = g_sch_k['hhold'].str[0:11]
    print(g_sch_k.head())

    # Run assign_ID on chunks of the school kids in worker processes
    n_g_sch_k = parallelize(g_sch_k, assign_ID)

    # Report the elapsed wall-clock time in seconds
    elapsed = timeit.default_timer() - start_time
    print("Total Time(s):", elapsed)
    print('End program')