In [5]:
import re, os, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

In [6]:
from sklearn import preprocessing
import fastbook
fastbook.setup_book()
from fastbook import *

from fastai.tabular import *
from fastai.tabular.all import *
from fastai.tabular.data import *
from fastai.tabular.learner import *

In [7]:
# Load the 2020 grid-cell table (it also carries the `pop2012` column) and
# discard the `fid` identifier column. `data_ori` stays untouched as the
# reference copy; `data` is the working frame that later cells add columns to.
data_ori = pd.read_csv("data2020.csv").drop(columns='fid')
data = data_ori.copy()
data.head()

Unnamed: 0,X,Y,country,fua_name,city_pop,pop2012,road_len,land_cover,night202001,night202002,night202005,night202006,night202003,night202004,cen_dist
0,4.487501,51.008333,BE,Bruxelles / Brussel,4892537,206.316861,0.001409,7,16.559999,14.21,19.18,17.879999,20.5,22.07,27430.20297
1,4.491667,51.008333,BE,Bruxelles / Brussel,4892537,845.095145,0.005011,2,15.29,14.21,15.13,11.85,13.69,13.5,27724.21038
2,4.495834,51.008333,BE,Bruxelles / Brussel,4892537,648.962256,0.009897,7,9.36,9.97,11.04,9.75,11.7,11.72,28022.79795
3,4.500001,51.008333,BE,Bruxelles / Brussel,4892537,0.0,0.0,2,9.94,11.37,10.07,10.25,11.08,13.57,28325.82084
4,4.320834,51.004166,BE,Bruxelles / Brussel,4892537,272.925815,0.0,7,7.36,10.34,8.88,11.44,14.63,12.39,20893.36067


In [8]:
# Inspect the available column names. Note that `alist` is redefined in the
# next cell with the feature order used for export and prediction.
alist = list(data.columns)
alist

['X',
 'Y',
 'country',
 'fua_name',
 'city_pop',
 'pop2012',
 'road_len',
 'land_cover',
 'night202001',
 'night202002',
 'night202005',
 'night202006',
 'night202003',
 'night202004',
 'cen_dist']

In [9]:
# Column order shared by the export and prediction cells, which slice it by
# position:
#   alist[1:]   -> columns written to each monthly CSV
#   alist[1:-2] -> continuous features handed to TabularPandas
#   alist[-2:]  -> categorical features handed to TabularPandas
# The first entry is skipped by every slice (presumably the target - confirm).
alist = [
    'pop2012',
    'night',
    'road_len',
    'city_pop',
    'cen_dist',
    'X',
    'Y',
    'land_cover',
    'city_id',
]

In [10]:
# Min-max scale the continuous features that are normalised over the whole
# table: MinMaxScaler maps each column linearly onto [0, 1] using that
# column's own minimum and maximum (this is NOT mean/variance standardisation).
scaler = preprocessing.MinMaxScaler()
blist = ['road_len', 'city_pop', 'cen_dist']
data1 = data[blist]
# The column selection is already 2-D (rows x len(blist)), so it can be passed
# to the scaler as-is; the result is wrapped back into a labelled frame.
data_new = pd.DataFrame(scaler.fit_transform(data1.to_numpy()), columns=blist)

In [11]:
# Encode each distinct `fua_name` as an integer code (codes are assigned in
# order of first appearance) and carry it, together with the raw `land_cover`
# class, into the feature table.
data['city_id'] = pd.factorize(data['fua_name'])[0]
data_new['city_id'] = data['city_id']
data_new['land_cover'] = data['land_cover']

In [12]:
# Rescale longitude (X) and latitude (Y) to [0, 1] separately within each
# city, so a coordinate expresses a position relative to its own city's extent.
# The per-city extrema are kept on `data` as `<col>_max` / `<col>_min`.
blist = ['X', 'Y']
for col in blist:
    per_city = data.groupby('city_id')[col]
    # String aliases select pandas' built-in group reductions; passing the
    # Python builtins `max` / `min` is deprecated in recent pandas releases.
    data[col + '_max'] = per_city.transform('max')
    data[col + '_min'] = per_city.transform('min')
    # NOTE: a city whose rows all share one coordinate value has zero extent,
    # which makes this 0/0 and yields NaN for those rows.
    data_new[col] = (data[col] - data[col + '_min']) / (data[col + '_max'] - data[col + '_min'])

In [13]:
# Divisor used to normalise night-light values.
# NOTE(review): presumably the maximum radiance seen when the model was
# trained - confirm the origin of this constant.
max_night = 278.359985
mon_list = ['01', '02', '03', '04', '05', '06']
for mon in mon_list:
    # Build this month's table in one step: set `night` to the normalised
    # radiance (taken positionally, ignoring the index) and put the columns in
    # the shared feature order. Using `assign` returns a fresh frame instead of
    # writing into a column selection of the previous iteration's frame.
    data_new = data_new.assign(
        night=data['night2020' + mon].to_numpy() / max_night
    )[alist[1:]]
    data_new.to_csv('night2020' + mon + '.csv', index=False)
    print(mon + ': File Exported!')

01: File Exported!
02: File Exported!
03: File Exported!
04: File Exported!
05: File Exported!
06: File Exported!


## Prediction

In [14]:
def predict_tabular(mon, xs, joblib_file, pop_max):
    """Predict one month's values and store them as a column of data2020.csv.

    Args:
        mon: Month key; only its last two characters are used, to build the
            output column name ``'pop' + mon[-2:]``. The full value is echoed
            in the progress message.
        xs: Feature table passed unchanged to the model's ``predict``.
        joblib_file: Path of the joblib-serialised model to load.
        pop_max: Factor the raw predictions are multiplied by to undo the
            normalisation (assumes the model was trained on targets divided
            by this value - confirm against the training notebook).

    Side effects:
        Reads "data2020.csv" from the working directory, adds (or overwrites)
        the output column, writes the file back in place, and prints a
        progress message. Returns None.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code - only load model files from a trusted source.
    joblib_model = joblib.load(joblib_file)

    # Transform to the pre-normalized state
    scaled_preds = joblib_model.predict(xs) * pop_max

    target_col = 'pop' + mon[-2:]
    data = pd.read_csv("data2020.csv")
    data[target_col] = scaled_preds
    # Negative predictions are clamped to 0 with a vectorised clip
    # (NaN values, if any, are left untouched).
    data[target_col] = data[target_col].clip(lower=0)
    data.to_csv("data2020.csv", index=False)
    print(mon + ': File Exported!')

In [15]:
# Output column names (pop01..pop06) and per-month input keys
# (night202001..night202006) for January through June.
pop_list = ['pop%02d' % month for month in range(1, 7)]
night_list = ['night2020%02d' % month for month in range(1, 7)]
joblib_file = "gbrt.pkl"
for mon in night_list:
    # Each month's feature table was exported earlier as '<mon>.csv'.
    data_pred = pd.read_csv(mon + '.csv')
    # Last two names in `alist` are categorical; everything between the first
    # entry and those two is continuous.
    to = TabularPandas(
        data_pred,
        procs=[Categorify],
        cat_names=alist[-2:],
        cont_names=alist[1:-2],
    )
    predict_tabular(mon, xs=to.xs, joblib_file=joblib_file, pop_max=85729.664432)

night202001: File Exported!
night202002: File Exported!
night202003: File Exported!
night202004: File Exported!
night202005: File Exported!
night202006: File Exported!


In [16]:
# Row-wise spread across the six months: population standard deviation
# (ddof=0) of the predicted populations and of the night-light values, then
# their ratio S per row; the cell displays the mean of S.
data_pred = pd.read_csv('data2020.csv')
data_pred['popstd'] = data_pred[pop_list].std(axis=1, ddof=0)
data_pred['nightstd'] = data_pred[night_list].std(axis=1, ddof=0)
data_pred['S'] = data_pred['popstd'].div(data_pred['nightstd'])
data_pred['S'].mean()

52.01602471061114

In [17]:
from sklearn.metrics import mean_squared_error
# RMSE between the `pop2012` column and the January prediction `pop01`.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(data_pred['pop2012'], data_pred['pop01']))
# Sample standard deviation (pandas default, ddof=1) of the reference column.
std = data_pred['pop2012'].std()
# R2 estimated as 1 - MSE / variance; because `std` uses ddof=1 this differs
# marginally from the textbook coefficient of determination.
R2 = 1 - (rmse / std) ** 2
rmse, R2

(1079.918606882703, 0.8350135898672021)