In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import root_mean_squared_error

In [None]:
main_data = pd.read_csv('./finaldataframe.csv')

X = main_data[[ 
 'visitor_flows',
 'geoid_o',
 'geoid_d',
 'origin_GDP_millions_dollars',
 'destination_GDP_millions_dollars',
 'distance_km',
 'origin_unemployment',
 'destination_unemployment',
 'origin_income',
 'destination_income',
 'public_transport_origin',
 'road_network_residential_origin',
 'road_network_main_origin',
 'road_network_secondary_origin',
 'retail_supermarket_origin',
 'retail_department_store_origin',
 'retail_mall_origin',
 'education_origin',
 'police_station_origin',
 'fire_station_origin',
 'land_use_commercial_origin',
 'land_use_industrial_origin',
 'land_use_retail_origin',
 'land_use_residential_origin',
 'land_use_natural_origin',
 'public_transport_destination',
 'road_network_residential_destination',
 'road_network_main_destination',
 'road_network_secondary_destination',
 'retail_supermarket_destination',
 'retail_department_store_destination',
 'retail_mall_destination',
 'education_destination',
 'police_station_destination',
 'fire_station_destination',
 'land_use_commercial_destination',
 'land_use_industrial_destination',
 'land_use_retail_destination',
 'land_use_residential_destination',
 'land_use_natural_destination',
 'origin_population',
 'destination_population']]

y = main_data[['pop_flows']]
y = y.squeeze()
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False, random_state=42)
  
print(X_train.shape)
print(y_train.shape)

num = ['visitor_flows',
 'origin_GDP_millions_dollars',
 'destination_GDP_millions_dollars',
 'distance_km',
 'origin_unemployment',
 'destination_unemployment',
 'origin_income',
 'destination_income',
 'public_transport_origin',
 'road_network_residential_origin',
 'road_network_main_origin',
 'road_network_secondary_origin',
 'retail_supermarket_origin',
 'retail_department_store_origin',
 'retail_mall_origin',
 'education_origin',
 'police_station_origin',
 'fire_station_origin',
 'land_use_commercial_origin',
 'land_use_industrial_origin',
 'land_use_retail_origin',
 'land_use_residential_origin',
 'land_use_natural_origin',
 'public_transport_destination',
 'road_network_residential_destination',
 'road_network_main_destination',
 'road_network_secondary_destination',
 'retail_supermarket_destination',
 'retail_department_store_destination',
 'retail_mall_destination',
 'education_destination',
 'police_station_destination',
 'fire_station_destination',
 'land_use_commercial_destination',
 'land_use_industrial_destination',
 'land_use_retail_destination',
 'land_use_residential_destination',
 'land_use_natural_destination',
 'origin_population',
 'destination_population']

classing = ['geoid_o', 'geoid_d']


(116811, 42)
(116811,)


In [55]:

class GeoIDEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder_origin = LabelEncoder()
        self.encoder_destination = LabelEncoder()
    
    def fit(self, X, y=None):
        # Fit label encoders to geoid columns
        print("Fitting GeoIDEncoder with columns:", X.columns)
        self.encoder_origin.fit(X['geoid_o'])
        self.encoder_destination.fit(X['geoid_d'])
        return self
    
    def transform(self, X):
        # Transform geoid columns using label encoders
        print("Transforming GeoID columns:", X.columns)
        X_transformed = X.copy()
        X_transformed['geoid_o'] = self.encoder_origin.transform(X_transformed['geoid_o'])
        X_transformed['geoid_d'] = self.encoder_destination.transform(X_transformed['geoid_d'])
        return X_transformed

In [56]:
print(X.columns)

Index(['visitor_flows', 'geoid_o', 'geoid_d', 'origin_GDP_millions_dollars',
       'destination_GDP_millions_dollars', 'distance_km',
       'origin_unemployment', 'destination_unemployment', 'origin_income',
       'destination_income', 'public_transport_origin',
       'road_network_residential_origin', 'road_network_main_origin',
       'road_network_secondary_origin', 'retail_supermarket_origin',
       'retail_department_store_origin', 'retail_mall_origin',
       'education_origin', 'police_station_origin', 'fire_station_origin',
       'land_use_commercial_origin', 'land_use_industrial_origin',
       'land_use_retail_origin', 'land_use_residential_origin',
       'land_use_natural_origin', 'public_transport_destination',
       'road_network_residential_destination', 'road_network_main_destination',
       'road_network_secondary_destination', 'retail_supermarket_destination',
       'retail_department_store_destination', 'retail_mall_destination',
       'education_destinatio

In [57]:
from sklearn.metrics import root_mean_squared_error
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num),
        ('geoid', GeoIDEncoder(), ['geoid_o', 'geoid_d']),  # Apply GeoIDEncoder to origin and destination
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200))
])

model.fit(X_train, y_train)


Fitting GeoIDEncoder with columns: Index(['geoid_o', 'geoid_d'], dtype='object')
Transforming GeoID columns: Index(['geoid_o', 'geoid_d'], dtype='object')


In [58]:
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")


mean_value = main_data['pop_flows'].mean()
std_dev = main_data['pop_flows'].std()
print(f"Mean: {mean_value}, Standard Deviation: {std_dev}")


rf_model = model.named_steps['regressor']

# Extract feature importances from the trained RandomForest model
importances = rf_model.feature_importances_

# Print feature importances
print("Feature importances:")
for feature, importance in zip(num, importances):
    print(f"{feature}: {importance:.4f}")


Transforming GeoID columns: Index(['geoid_o', 'geoid_d'], dtype='object')
RMSE: 8149.639324521757
Mean: 15984.641489142445, Standard Deviation: 44888.51658641136
Feature importances:
visitor_flows: 0.9852
origin_GDP_millions_dollars: 0.0003
destination_GDP_millions_dollars: 0.0000
distance_km: 0.0002
origin_unemployment: 0.0004
destination_unemployment: 0.0001
origin_income: 0.0085
destination_income: 0.0003
public_transport_origin: 0.0001
road_network_residential_origin: 0.0002
road_network_main_origin: 0.0005
road_network_secondary_origin: 0.0001
retail_supermarket_origin: 0.0002
retail_department_store_origin: 0.0001
retail_mall_origin: 0.0002
education_origin: 0.0002
police_station_origin: 0.0001
fire_station_origin: 0.0004
land_use_commercial_origin: 0.0001
land_use_industrial_origin: 0.0005
land_use_retail_origin: 0.0001
land_use_residential_origin: 0.0003
land_use_natural_origin: 0.0004
public_transport_destination: 0.0001
road_network_residential_destination: 0.0001
road_networ

In [59]:
model.score(X_test, y_test)


Transforming GeoID columns: Index(['geoid_o', 'geoid_d'], dtype='object')


0.9682553968174799

In [6]:
print(main_data['pop_flows'].max())
print(main_data['pop_flows'].min())


1501583.0
9.0


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, 
                         cv = 5,
                         scoring = 'neg_mean_absolute_error')
print("average MAE", scores.mean())

Fitting GeoIDEncoder with columns: Index(['geoid_o', 'geoid_d'], dtype='object')
Transforming GeoID columns: Index(['geoid_o', 'geoid_d'], dtype='object')
Fitting GeoIDEncoder with columns: Index(['geoid_o', 'geoid_d'], dtype='object')
Transforming GeoID columns: Index(['geoid_o', 'geoid_d'], dtype='object')
Fitting GeoIDEncoder with columns: Index(['geoid_o', 'geoid_d'], dtype='object')
Transforming GeoID columns: Index(['geoid_o', 'geoid_d'], dtype='object')
