In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')
path = '/content/gdrive/My Drive/Colab Notebooks/Thesis'

Mounted at /content/gdrive/


In [2]:
import re, os, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, precision_score, recall_score
import joblib

In [4]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *

from fastai.tabular import *
from fastai.tabular.all import *
from fastai.tabular.data import *
from fastai.tabular.learner import *

[K     |████████████████████████████████| 727kB 5.8MB/s 
[K     |████████████████████████████████| 1.2MB 53.2MB/s 
[K     |████████████████████████████████| 51kB 7.5MB/s 
[K     |████████████████████████████████| 194kB 55.8MB/s 
[K     |████████████████████████████████| 61kB 9.2MB/s 
[K     |████████████████████████████████| 51kB 7.8MB/s 
[?25h

In [5]:
data_ori = pd.read_csv(path+'/data1_sf.csv')
alist = data_ori.columns.to_list()
alist

['country',
 'fua_name',
 'road_len',
 'area',
 'population',
 'city_pop',
 'cen_dist',
 'night',
 'land_cover',
 'X',
 'Y',
 'city_id',
 'population_cat']

In [6]:
alist = ['population_cat', 'night', 'road_len', 'city_pop', 'cen_dist', 'X', 'Y', 'land_cover','city_id']
data = data_ori[alist]

In [7]:
data.describe()

Unnamed: 0,night,road_len,city_pop,cen_dist,X,Y,land_cover,city_id
count,260534.0,260534.0,260534.0,260534.0,260534.0,260534.0,260534.0,260534.0
mean,4.625352,2.618643,4394432.0,48.840822,9.948661,49.823713,6.632543,11.373537
std,12.620485,3.395864,3148661.0,26.506507,8.835994,5.054657,2.135577,5.586927
min,0.0,0.0,1207469.0,0.088187,-9.505698,38.409757,1.0,1.0
25%,0.3475,0.0795,2120416.0,29.361106,2.452635,47.71809,7.0,7.0
50%,0.7125,1.553339,2939714.0,44.830977,12.369302,50.759757,7.0,12.0
75%,2.75,3.37645,4814757.0,63.796209,16.669302,52.701424,8.0,17.0
max,1361.78,40.15653,11665960.0,166.826791,26.460969,60.259757,11.0,22.0


In [8]:
data.head()

Unnamed: 0,population_cat,night,road_len,city_pop,cen_dist,X,Y,land_cover,city_id
0,True,0.11,0.358197,1704350,53.481355,-6.697365,53.001424,7,14
1,False,0.275,0.0,3021243,36.127658,9.835969,53.86809,8,5
2,True,0.335,0.299463,4788590,70.35631,13.010969,51.984757,8,8
3,True,0.455,1.71959,2650374,53.448373,16.310969,47.701424,8,7
4,True,0.81,0.0,1704350,17.568716,-6.472365,53.234757,7,14


In [9]:
cat_names = alist[-2:]
cont_names = alist[1:-2]
y_names = alist[0]
procs = [Categorify,]

In [10]:
percent, length = 0.8, len(data.axes[0])
stop = int(percent * length)
splits = [list(range(stop)), list(range(stop,length))]

In [11]:
to = TabularPandas(data, procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names, splits=splits)

In [12]:
len(to.xs),len(to.train),len(to.valid)

(260534, 208427, 52107)

In [13]:
xs, y = to.xs,to.y
train_xs,train_y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y

In [14]:
# param_grid = [{'n_estimators':[20], 'max_samples':[100000], 
#                'max_features': [0.5], 'min_samples_leaf':[3, 4, 5], 
#                'max_depth': [30, 35, 40]}]
# rf_test = RandomForestClassifier()

In [15]:
# rf_grid_search = GridSearchCV(rf_test, param_grid, cv=6, scoring='f1')
# rf_grid_search.fit(train_xs,train_y)

In [None]:
# rf_grid_search.best_params_

In [16]:
n_estimators = 500
max_samples = 100000
max_features = 0.5
min_samples_leaf = 4
max_depth = 25

In [17]:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
    max_samples=max_samples, max_features=max_features, max_depth=max_depth,
    min_samples_leaf=min_samples_leaf, oob_score=True)

In [18]:
import time
start = time.time()
m = rf.fit(train_xs, train_y);
end = time.time()
print('Time:', end-start)

Time: 129.8042769432068


In [19]:
precision_score(train_y, rf.predict(train_xs)), precision_score(valid_y, rf.predict(valid_xs))

(0.9351983070343233, 0.9056259040272033)

In [20]:
recall_score(train_y, rf.predict(train_xs)), recall_score(valid_y, rf.predict(valid_xs))

(0.9599352911247081, 0.9344365312107248)

In [21]:
train_preds = np.stack([t.predict(train_xs) for t in rf.estimators_])
valid_preds = np.stack([t.predict(valid_xs) for t in rf.estimators_])
train_preds.shape, valid_preds.shape

((500, 208427), (500, 52107))

In [22]:
for name, score in zip(alist[1:], rf.feature_importances_):
    print('%s: %f' %(name, 100 *score)+"%")

night: 13.163930%
road_len: 3.315768%
city_pop: 17.632648%
cen_dist: 32.965052%
X: 3.493891%
Y: 8.566055%
land_cover: 10.684034%
city_id: 10.178624%


In [23]:
# Save the model
joblib_file = path + '/classifier.pkl'
joblib.dump(m, joblib_file)
print('Model Saved!')

Model Saved!
