# ML_traffic_flow_modeling

Traffic Flow Modeling from Points of Interest, Population Density, Employment Density and Number of Workers with Random Forest Regression

   Prakruthi Burra, Rui Shi

In [24]:
import os
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
from sklearn import metrics, tree, svm, datasets
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.multiclass import unique_labels
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import time

Points of Interest

In [40]:
# Read the GNIS point-of-interest shapefile, then keep only the features whose
# geometry type is exactly 'Point'.
IL_GNIS = gpd.read_file('IL_GNIS_2006_Pt/IL_GNIS_2006_Pt.shp')
gnis_is_point = IL_GNIS.geometry.type == 'Point'
IL_GNIS = IL_GNIS[gnis_is_point]

Census Tracts

In [75]:
# Read the census tract shapefile and keep only the rows whose geometry type is
# exactly 'Polygon'; rows with any other geometry type (for example
# 'MultiPolygon') are dropped by this filter.
census_tracts = gpd.read_file('tl_2019_17_tract.shp')
tract_is_polygon = census_tracts.geometry.type == 'Polygon'
census_tracts = census_tracts[tract_is_polygon]

Spatial join of Points of Interest (POI) data with census tracts

In [44]:
# Attach to every point of interest the attributes of each census tract it
# intersects (inner join: points that intersect no tract are dropped).
# `predicate` is the current name of the spatial-relationship keyword; the
# older `op` spelling was deprecated and is rejected by newer GeoPandas.
GNIS_census = gpd.sjoin(IL_GNIS, census_tracts, how="inner", predicate='intersects')

In [45]:
# Display the joined POI/tract table (one row per POI-tract match).
GNIS_census

Unnamed: 0,FEATURE_ID,FEATURE_NA,CLASS,ST_ALPHA,ST_NUM,COUNTY,COUNTY_NUM,PRIMARY_LA,PRIMARY_LO,PRIMARY__1,...,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
0,1679009,Rainbow Arch,Arch,IL,17,Johnson,87,372148N,0890140W,37.363385,...,977800,17087977800,9778,Census Tract 9778,G5020,S,312181887,2430138,+37.3699573,-088.9052620
87,1692410,Vienna Reservoir Dam,Dam,IL,17,Johnson,87,372506N,0885324W,37.418385,...,977800,17087977800,9778,Census Tract 9778,G5020,S,312181887,2430138,+37.3699573,-088.9052620
827,1815868,Lincoln Green Post Office (historical),Post Office,IL,17,Johnson,87,371942N,0885950W,37.328385,...,977800,17087977800,9778,Census Tract 9778,G5020,S,312181887,2430138,+37.3699573,-088.9052620
1544,1928665,Vienna Number 1 Election Precinct,Civil,IL,17,Johnson,87,372413N,0885633W,37.403663,...,977800,17087977800,9778,Census Tract 9778,G5020,S,312181887,2430138,+37.3699573,-088.9052620
1770,404855,Bridges Cemetery,Cemetery,IL,17,Johnson,87,372429N,0885714W,37.408107,...,977800,17087977800,9778,Census Tract 9778,G5020,S,312181887,2430138,+37.3699573,-088.9052620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55007,407663,East Lawn Cemetery,Cemetery,IL,17,McLean,113,402922N,0885535W,40.489480,...,001105,17113001105,11.05,Census Tract 11.05,G5020,S,2341420,0,+40.4951428,-088.9328751
55074,407904,Raymond Ellis Elementary School,School,IL,17,Lake,97,422211N,0880603W,42.369744,...,861304,17097861304,8613.04,Census Tract 8613.04,G5020,S,1294131,28349,+42.3698765,-088.0998026
55398,413470,Micek Park,Park,IL,17,Cook,31,414751N,0874040W,41.797533,...,611600,17031611600,6116,Census Tract 6116,G5020,S,315136,0,+41.7975955,-087.6767426
55721,420527,James Ward Elementary School,School,IL,17,Cook,31,415036N,0873806W,41.843367,...,340300,17031340300,3403,Census Tract 3403,G5020,S,213538,0,+41.8428058,-087.6357881


Merge census tracts with POI and population

In [47]:
def _select_poi_class(poi_class):
    """Return the rows of GNIS_census whose CLASS value equals poi_class."""
    class_mask = GNIS_census['CLASS'].isin([poi_class])
    return GNIS_census.loc[class_mask]


# One subset of the joined POI table per POI class used as a model feature.
GNIS_census_school = _select_poi_class('School')
GNIS_census_hospital = _select_poi_class('Hospital')
GNIS_census_park = _select_poi_class('Park')
GNIS_census_church = _select_poi_class('Church')
GNIS_census_post_office = _select_poi_class('Post Office')
GNIS_census_airport = _select_poi_class('Airport')
GNIS_census_populated_place = _select_poi_class('Populated Place')

In [51]:
def _count_pois_per_tract(pois):
    """Count POI rows per GEOID.

    Returns a two-column frame (GEOID, FEATURE_ID) where FEATURE_ID holds the
    number of non-null FEATURE_ID values for that GEOID, sorted by ascending
    count.
    """
    per_tract = pois.groupby("GEOID")["FEATURE_ID"].count()
    return per_tract.reset_index().sort_values('FEATURE_ID')


# Per-tract POI counts, one table per POI class.
Census_school = _count_pois_per_tract(GNIS_census_school)
Census_hospital = _count_pois_per_tract(GNIS_census_hospital)
Census_park = _count_pois_per_tract(GNIS_census_park)
Census_church = _count_pois_per_tract(GNIS_census_church)
Census_post_office = _count_pois_per_tract(GNIS_census_post_office)
Census_airport = _count_pois_per_tract(GNIS_census_airport)
Census_populated_place = _count_pois_per_tract(GNIS_census_populated_place)

In [54]:
# Total POI count per tract across every class of interest.
classes_of_interest = ['School', 'Hospital', 'Park', 'Church', 'Post Office', 'Airport', 'Populated Place']
in_classes_of_interest = GNIS_census['CLASS'].isin(classes_of_interest)
GNIS_census_filtered = GNIS_census.loc[in_classes_of_interest]
pois_per_tract = GNIS_census_filtered.groupby("GEOID")["FEATURE_ID"].count()
Census_POIs = pois_per_tract.reset_index().sort_values('FEATURE_ID')

In [57]:
# Load the tract layer that carries the trip and store attributes and keep the
# key plus the three attributes of interest.
tracts = gpd.read_file('tracts.shp')
trips_stores_data = tracts.loc[:, ['GEOID', 'TRIPS_ORIG', 'TRIPS_DEST', 'STORES']]
# Deliberately NOT merged into census_tracts here. STORES, TRIPS_ORIG and
# TRIPS_DEST are each merged into census_tracts by later cells; merging them
# at this point as well makes a top-to-bottom run end up with two
# 'No. Stores' columns and suffixed TRIPS_*_x / TRIPS_*_y columns.

In [64]:
# Load the population table; later cells use its GEOID and POPULATION columns.
pop = pd.read_csv('Population_with_GEOIDs.csv')

In [70]:
# Convert the GEOID key to text before it is used as a merge key.
# Plain column assignment replaces the column together with its dtype, whereas
# `pop.loc[:, 'GEOID'] = ...` writes into the existing column in place, which
# newer pandas versions warn about (or reject) when the dtype has to change.
# NOTE(review): presumably the tract layer stores GEOID as text - confirm.
pop['GEOID'] = pop['GEOID'].astype(str)

In [77]:
# Add the population columns to the tracts, matching on GEOID. `merge` uses an
# inner join by default, so tracts without a population row are dropped.
census_tracts = census_tracts.merge(pop, on = 'GEOID')

In [78]:
# Attach the per-tract POI counts to census_tracts.
# Every count table holds one row per GEOID with the count in FEATURE_ID, so a
# keyed lookup on GEOID replaces the previous nested row-by-row scan over
# (tracts x POI rows) and always creates each column, even when no tract
# matches a given POI class.
poi_count_tables = {
    'No. Schools': Census_school,
    'No. Hospitals': Census_hospital,
    'No. Parks': Census_park,
    'No. Churches': Census_church,
    'No. Post Offices': Census_post_office,
    'No. Airports': Census_airport,
    'No. Populated Places': Census_populated_place,
    'Total Points of Interest': Census_POIs,
}
for column_name, count_table in poi_count_tables.items():
    counts_by_geoid = count_table.set_index('GEOID')['FEATURE_ID']
    # Tracts whose GEOID is absent from the table get NaN here; the fillna(0)
    # below turns that into a count of zero.
    census_tracts[column_name] = census_tracts['GEOID'].map(counts_by_geoid)

total_pois = census_tracts['Total Points of Interest']
# The +0.1 keeps the denominators non-zero for tracts with ALAND or
# POPULATION equal to 0. The 1000000 and 10 factors only rescale the ratios.
census_tracts['Total Points of Interest per Area'] = (
    1000000 * total_pois / (census_tracts['ALAND'].astype(float) + 0.1)
)
census_tracts['Total Points of Interest per Capita'] = (
    10 * total_pois / (census_tracts['POPULATION'].astype(float) + 0.1)
)

# Missing values (tracts without any matching POI) become 0.
census_tracts = census_tracts.fillna(0)
census_tracts = census_tracts.rename(columns={"STORES" : "No. Stores"})

In [99]:
# Display the tracts with the POI count columns attached.
census_tracts

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,...,No. Parks,No. Churches,No. Schools,Total Points of Interest,Total Points of Interest per Area,POPULATION,Total Points of Interest per Capita,No. Populated Places,No. Hospitals,No. Post Offices
0,17,091,011700,17091011700,117,Census Tract 117,G5020,S,2370100,102060,...,2.0,3.0,2.0,8.0,3.375385,3417,0.023412,0.0,0.0,0.0
1,17,091,011800,17091011800,118,Census Tract 118,G5020,S,1791332,55670,...,0.0,2.0,0.0,3.0,1.674731,2627,0.011419,1.0,0.0,0.0
2,17,119,400951,17119400951,4009.51,Census Tract 4009.51,G5020,S,5169973,169066,...,0.0,1.0,3.0,4.0,0.773698,4966,0.008055,0.0,0.0,0.0
3,17,119,400952,17119400952,4009.52,Census Tract 4009.52,G5020,S,5751299,305906,...,0.0,0.0,1.0,2.0,0.347748,3335,0.005997,1.0,0.0,0.0
4,17,135,957500,17135957500,9575,Census Tract 9575,G5020,S,450037682,512225,...,0.0,8.0,33.0,49.0,0.108880,3273,0.149705,7.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3118,17,037,000100,17037000100,1,Census Tract 1,G5020,S,99421738,712972,...,0.0,6.0,8.0,18.0,0.181047,6712,0.026817,2.0,0.0,0.0
3119,17,037,001500,17037001500,15,Census Tract 15,G5020,S,38529459,171069,...,1.0,0.0,5.0,8.0,0.207633,3807,0.021013,1.0,0.0,0.0
3120,17,037,000400,17037000400,4,Census Tract 4,G5020,S,136874050,1081155,...,3.0,1.0,12.0,24.0,0.175344,8622,0.027835,4.0,0.0,0.0
3121,17,037,000300,17037000300,3,Census Tract 3,G5020,S,285569285,465644,...,1.0,2.0,21.0,33.0,0.115559,2680,0.123130,3.0,0.0,2.0


Import the census tracts enriched with Smart Location Database attributes

In [100]:
# Read 'tracts.shp' (the same file an earlier cell loads as `tracts`); later
# cells pull STORES, D1B, D1C, CBSA_WRK, TRIPS_ORIG and TRIPS_DEST from it.
original = gpd.read_file('tracts.shp')

Extract number of stores, population density, number of workers and merge with census tracts

In [102]:
# Keep the merge key plus the three attributes that become model features.
additional_columns = ['STORES', 'D1B', 'CBSA_WRK', 'GEOID']
additional = original.loc[:, additional_columns]

In [104]:
# Give the coded attribute columns the descriptive names used by the model.
descriptive_names = {'D1B': 'POP_DENSITY', 'CBSA_WRK': 'NUM_WORKERS'}
additional = additional.rename(columns=descriptive_names)

In [108]:
# Add STORES, POP_DENSITY and NUM_WORKERS to the tracts, matching on GEOID
# (inner join by default).
census_tracts = census_tracts.merge(additional, on = 'GEOID')

In [110]:
# Rename the store count to match the naming of the other 'No. ...' columns.
census_tracts = census_tracts.rename(columns = {'STORES' : 'No. Stores'})

In [111]:
# Display the tracts after the store, density and worker columns were added.
census_tracts

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,...,Total Points of Interest,Total Points of Interest per Area,POPULATION,Total Points of Interest per Capita,No. Populated Places,No. Hospitals,No. Post Offices,No. Stores,POP_DENSITY,NUM_WORKERS
0,17,091,011700,17091011700,117,Census Tract 117,G5020,S,2370100,102060,...,8.0,3.375385,3417,0.023412,0.0,0.0,0.0,0,7.059873,46799.0
1,17,091,011800,17091011800,118,Census Tract 118,G5020,S,1791332,55670,...,3.0,1.674731,2627,0.011419,1.0,0.0,0.0,0,5.838794,46799.0
2,17,119,400951,17119400951,4009.51,Census Tract 4009.51,G5020,S,5169973,169066,...,4.0,0.773698,4966,0.008055,0.0,0.0,0.0,0,3.772428,1237055.0
3,17,119,400952,17119400952,4009.52,Census Tract 4009.52,G5020,S,5751299,305906,...,2.0,0.347748,3335,0.005997,1.0,0.0,0.0,0,2.777786,1237055.0
4,17,135,957500,17135957500,9575,Census Tract 9575,G5020,S,450037682,512225,...,49.0,0.108880,3273,0.149705,7.0,0.0,0.0,0,0.128183,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3118,17,037,000100,17037000100,1,Census Tract 1,G5020,S,99421738,712972,...,18.0,0.181047,6712,0.026817,2.0,0.0,0.0,0,1.022809,4066635.0
3119,17,037,001500,17037001500,15,Census Tract 15,G5020,S,38529459,171069,...,8.0,0.207633,3807,0.021013,1.0,0.0,0.0,0,5.213170,4066635.0
3120,17,037,000400,17037000400,4,Census Tract 4,G5020,S,136874050,1081155,...,24.0,0.175344,8622,0.027835,4.0,0.0,0.0,2,0.252156,4066635.0
3121,17,037,000300,17037000300,3,Census Tract 3,G5020,S,285569285,465644,...,33.0,0.115559,2680,0.123130,3.0,0.0,2.0,0,0.037200,4066635.0


Create the training set for predicting the number of trips originating in each census tract from POI counts, population density and number of workers

In [165]:
# Feature table for the trip-origin model: POI counts per class, store count,
# total POI count, population density and number of workers.
origin_feature_columns = [
    'No. Schools',
    'No. Hospitals',
    'No. Parks',
    'No. Churches',
    'No. Post Offices',
    'No. Airports',
    'No. Populated Places',
    'No. Stores',
    'Total Points of Interest',
    'POP_DENSITY',
    'NUM_WORKERS',
]
census_tract_modelling_df = census_tracts.loc[:, origin_feature_columns]

In [166]:
# Display the feature table used for the trip-origin model.
census_tract_modelling_df

Unnamed: 0,No. Schools,No. Hospitals,No. Parks,No. Churches,No. Post Offices,No. Airports,No. Populated Places,No. Stores,Total Points of Interest,POP_DENSITY,NUM_WORKERS
0,2.0,0.0,2.0,3.0,0.0,1.0,0.0,0,8.0,7.059873,46799.0
1,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0,3.0,5.838794,46799.0
2,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0,4.0,3.772428,1237055.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,2.0,2.777786,1237055.0
4,33.0,0.0,0.0,8.0,0.0,1.0,7.0,0,49.0,0.128183,0.0
...,...,...,...,...,...,...,...,...,...,...,...
3118,8.0,0.0,0.0,6.0,0.0,2.0,2.0,0,18.0,1.022809,4066635.0
3119,5.0,0.0,1.0,0.0,0.0,1.0,1.0,0,8.0,5.213170,4066635.0
3120,12.0,0.0,3.0,1.0,0.0,4.0,4.0,2,24.0,0.252156,4066635.0
3121,21.0,0.0,1.0,2.0,2.0,4.0,3.0,0,33.0,0.037200,4066635.0


In [171]:
# Trip-origin counts together with the GEOID key, taken from `original`.
trips_orig_columns = ['TRIPS_ORIG', 'GEOID']
TRIPS_ORIG = original.loc[:, trips_orig_columns]

In [119]:
# Add the observed trip-origin counts to the tracts, matching on GEOID.
census_tracts = census_tracts.merge(TRIPS_ORIG, on = 'GEOID')

In [121]:
# Trip-destination counts together with the GEOID key, taken from `original`.
trips_dest_columns = ['TRIPS_DEST', 'GEOID']
TRIPS_DEST = original.loc[:, trips_dest_columns]

In [122]:
# Add the observed trip-destination counts to the tracts, matching on GEOID.
census_tracts = census_tracts.merge(TRIPS_DEST, on = 'GEOID')

In [125]:
# Save the enriched tracts as a shapefile.
# NOTE(review): shapefile field names are limited to 10 characters, so the
# long 'Total Points of Interest...' column names are presumably truncated on
# write - confirm the field names in the output file.
census_tracts.to_file('tracts_.shp')

In [126]:
# Save the same table as CSV (the row index is written as the first column).
census_tracts.to_csv('tracts_with_POI.csv')

In [137]:
# Drop the key so only the TRIPS_DEST column remains for use as a target.
TRIPS_DEST = TRIPS_DEST.drop(columns = 'GEOID')

In [155]:
# Treat missing destination-trip counts as zero, then store them as integers.
TRIPS_DEST = TRIPS_DEST.fillna(0)
# Assign the column directly: `.loc[:, col] = ...` writes into the existing
# column in place on pandas 2.x, so the column would keep its previous dtype
# instead of becoming int.
TRIPS_DEST['TRIPS_DEST'] = TRIPS_DEST['TRIPS_DEST'].astype(int)

In [172]:
# Drop the key so only the TRIPS_ORIG column remains for use as a target.
TRIPS_ORIG = TRIPS_ORIG.drop(columns = 'GEOID')

In [173]:
# Treat missing origin-trip counts as zero, then store them as integers.
TRIPS_ORIG = TRIPS_ORIG.fillna(0)
# Assign the column directly: `.loc[:, col] = ...` writes into the existing
# column in place on pandas 2.x, so the column would keep its previous dtype
# instead of becoming int.
TRIPS_ORIG['TRIPS_ORIG'] = TRIPS_ORIG['TRIPS_ORIG'].astype(int)

Run the machine learning model to make the prediction; the result is stored in the vector `y_pred`

In [234]:
# Build the target in the row order of the feature table.
# The features come from census_tracts while the observed trips come from
# `original`; looking the trips up by GEOID guarantees that row k of the
# target belongs to the same tract as row k of the features, instead of
# relying on both tables happening to share the same order and length.
trips_orig_target = (
    original.set_index('GEOID')['TRIPS_ORIG']
    .reindex(census_tracts['GEOID'])
    .fillna(0)      # missing trip counts are treated as zero
    .astype(int)
    .to_numpy()
)

Random_Forest_Regressor = RandomForestRegressor(n_estimators=2000, random_state=0)
Random_Forest_Regressor.fit(census_tract_modelling_df, trips_orig_target)
# Predictions are made for the same tracts the model was fitted on, so the
# error printed below is an in-sample (training) error, not a held-out one.
y_pred = Random_Forest_Regressor.predict(census_tract_modelling_df)
print("RANDOM FOREST REGRESSOR - TRIPS ORIGIN")
print(metrics.mean_squared_error(trips_orig_target, y_pred))

RANDOM FOREST REGRESSOR - TRIPS ORIGIN
349971.2902227906


In [179]:
# Wrap the prediction vector in a one-column frame (the column is named 0).
predicted_orig_trips = pd.DataFrame(y_pred)

In [182]:
# Append the predictions to the tracts. `join` matches on the row index, so
# this relies on both frames being indexed 0..n-1 in the same row order.
census_tracts = census_tracts.join(predicted_orig_trips)

In [191]:
# Give the joined prediction column (named 0) a descriptive name.
census_tracts = census_tracts.rename(columns = {0 : 'PREDICTED_ORIG_TRIPS'})

In [194]:
# Origin features without NUM_WORKERS.
# NOTE: modeling_dest is reassigned from census_tracts a few cells further
# down (with EMPLOYMENT_DENSITY added), so this value is not what the
# destination model is trained on.
modeling_dest = census_tract_modelling_df.drop(columns = 'NUM_WORKERS')

In [200]:
# D1C (renamed to EMPLOYMENT_DENSITY after the merge) together with the GEOID key.
employment_columns = ['D1C', 'GEOID']
employ_den = original.loc[:, employment_columns]

In [208]:
# Add the D1C column to the tracts, matching on GEOID.
census_tracts = census_tracts.merge(employ_den, on = 'GEOID')

In [211]:
# Give the coded D1C column a descriptive name.
census_tracts = census_tracts.rename(columns = {'D1C' : 'EMPLOYMENT_DENSITY'})

In [213]:
# Feature table for the trip-destination model: the POI and store counts and
# population density, with employment density in place of number of workers.
destination_feature_columns = [
    'No. Schools',
    'No. Hospitals',
    'No. Parks',
    'No. Churches',
    'No. Post Offices',
    'No. Airports',
    'No. Populated Places',
    'No. Stores',
    'Total Points of Interest',
    'POP_DENSITY',
    'EMPLOYMENT_DENSITY',
]
modeling_dest = census_tracts.loc[:, destination_feature_columns]

Training set for predicting the number of trips that end in each census tract, built from POI counts, population density and employment density

In [214]:
# Display the feature table used for the trip-destination model.
modeling_dest

Unnamed: 0,No. Schools,No. Hospitals,No. Parks,No. Churches,No. Post Offices,No. Airports,No. Populated Places,No. Stores,Total Points of Interest,POP_DENSITY,EMPLOYMENT_DENSITY
0,2.0,0.0,2.0,3.0,0.0,1.0,0.0,0,8.0,7.059873,2.442126
1,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0,3.0,5.838794,1.327268
2,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0,4.0,3.772428,0.566353
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,2.0,2.777786,0.242828
4,33.0,0.0,0.0,8.0,0.0,1.0,7.0,0,49.0,0.128183,0.040990
...,...,...,...,...,...,...,...,...,...,...,...
3118,8.0,0.0,0.0,6.0,0.0,2.0,2.0,0,18.0,1.022809,0.274031
3119,5.0,0.0,1.0,0.0,0.0,1.0,1.0,0,8.0,5.213170,1.535521
3120,12.0,0.0,3.0,1.0,0.0,4.0,4.0,2,24.0,0.252156,0.009529
3121,21.0,0.0,1.0,2.0,2.0,4.0,3.0,0,33.0,0.037200,0.012366


Run machine learning model to make the prediction

In [221]:
# Build the target in the row order of the feature table.
# The features come from census_tracts while the observed trips come from
# `original`; looking the trips up by GEOID guarantees that row k of the
# target belongs to the same tract as row k of the features, instead of
# relying on both tables happening to share the same order and length.
trips_dest_target = (
    original.set_index('GEOID')['TRIPS_DEST']
    .reindex(census_tracts['GEOID'])
    .fillna(0)      # missing trip counts are treated as zero
    .astype(int)
    .to_numpy()
)

Random_Forest_Regressor = RandomForestRegressor(n_estimators=1000, random_state=0)
Random_Forest_Regressor.fit(modeling_dest, trips_dest_target)
# Predictions are made for the same tracts the model was fitted on, so the
# error printed below is an in-sample (training) error, not a held-out one.
y_pred = Random_Forest_Regressor.predict(modeling_dest)
print("RANDOM FOREST REGRESSOR - TRIPS DESTINATION")
print(metrics.mean_squared_error(trips_dest_target, y_pred))

RANDOM FOREST REGRESSOR - TRIPS DESTINATION
5124.759702816203


In [222]:
# Display the predicted destination-trip counts.
y_pred

array([125.582, 155.451, 283.279, ..., 440.224,  65.58 , 168.524])

In [223]:
# Wrap the prediction vector in a one-column frame (the column is named 0).
pred_trips_dest = pd.DataFrame(y_pred)

In [225]:
# Name the prediction column before it is joined onto the tracts.
pred_trips_dest = pred_trips_dest.rename(columns = {0 : 'PREDICTED_DEST_TRIPS'})

In [228]:
# Append the predictions to the tracts. `join` matches on the row index, so
# this relies on both frames being indexed 0..n-1 in the same row order.
census_tracts = census_tracts.join(pred_trips_dest)

In [241]:
# Save the tracts with both prediction columns as CSV.
census_tracts.to_csv('tracts_2.csv')

In [242]:
# Save the same table as a shapefile.
# NOTE(review): shapefile field names are limited to 10 characters, so the
# long column names are presumably truncated on write - confirm.
census_tracts.to_file('tracts_2.shp')

In [246]:
# Key, model features and both predictions, ready to be merged onto `original`.
poi_pred_columns = [
    'GEOID',
    'No. Schools',
    'No. Hospitals',
    'No. Parks',
    'No. Churches',
    'No. Post Offices',
    'No. Airports',
    'No. Populated Places',
    'No. Stores',
    'Total Points of Interest',
    'POP_DENSITY',
    'NUM_WORKERS',
    'EMPLOYMENT_DENSITY',
    'PREDICTED_ORIG_TRIPS',
    'PREDICTED_DEST_TRIPS',
]
poi_pred = census_tracts.loc[:, poi_pred_columns]

In [247]:
# Display the features and predictions per tract.
poi_pred

Unnamed: 0,GEOID,No. Schools,No. Hospitals,No. Parks,No. Churches,No. Post Offices,No. Airports,No. Populated Places,No. Stores,Total Points of Interest,POP_DENSITY,NUM_WORKERS,EMPLOYMENT_DENSITY,PREDICTED_ORIG_TRIPS,PREDICTED_DEST_TRIPS
0,17091011700,2.0,0.0,2.0,3.0,0.0,1.0,0.0,0,8.0,7.059873,46799.0,2.442126,126.347,125.582
1,17091011800,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0,3.0,5.838794,46799.0,1.327268,116.855,155.451
2,17119400951,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0,4.0,3.772428,1237055.0,0.566353,314.676,283.279
3,17119400952,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,2.0,2.777786,1237055.0,0.242828,60.534,203.733
4,17135957500,33.0,0.0,0.0,8.0,0.0,1.0,7.0,0,49.0,0.128183,0.0,0.040990,41.602,70.198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3118,17037000100,8.0,0.0,0.0,6.0,0.0,2.0,2.0,0,18.0,1.022809,4066635.0,0.274031,160.356,242.515
3119,17037001500,5.0,0.0,1.0,0.0,0.0,1.0,1.0,0,8.0,5.213170,4066635.0,1.535521,404.125,199.039
3120,17037000400,12.0,0.0,3.0,1.0,0.0,4.0,4.0,2,24.0,0.252156,4066635.0,0.009529,193.709,440.224
3121,17037000300,21.0,0.0,1.0,2.0,2.0,4.0,3.0,0,33.0,0.037200,4066635.0,0.012366,87.904,65.580


Merge census tracts with POI counts and the predicted origin and destination trips

In [248]:
# Add the features and predictions to the original tract layer, matching on
# GEOID. NOTE: this rebinds `original`, so re-running the cell merges onto the
# already-merged frame rather than the frame read from 'tracts.shp'.
original = original.merge(poi_pred, on = 'GEOID')

In [255]:
# Save the final merged layer as a shapefile.
# NOTE(review): shapefile field names are limited to 10 characters, so the
# long column names are presumably truncated on write - confirm.
original.to_file('tracts_3.shp')

In [256]:
# Save the final merged table as CSV.
original.to_csv('tracts_3.csv')