In [1]:
VERSION = 'v1.0'

# Model Details
## Features:
- One hot encoded **day of week** and **month** (not year)
- Weather feature (OHE):
    - Icons (cloudy, partial cloudy, ...)
    - Precipitates Type (None, Snow, Sleet, Rain)
- Station info:
    - community area (OHE)
    - capacity
    - long, lat
    
## Target
- Log scale/normal scale
    
# Work Flow
## Training Preprocessing
- Merge station community area (Join tables)
- Drop id after merging
- Add weather info (temp_high/low, and OHE ICONs and Precipitates Types)
- Convert to numpy matrix

## Pipeline
- OHE on date time (Remember column indices)
- Scaling for `year, lon_ave, lat_ave, dp_max, temp_high, temp_low` (`MinMaxScaler`)
- Regressor()

## Test Preprocessing
- Start with Pandas template (station_id, lon_ave, lat_ave, dp_max, OHE community area)
- Add weather info (temp_high/low, and OHE ICONs and Precipitates Types)
- Convert to numpy matrix

## Post prediction
- Rescale if trained on log
- Hard cap negative (activation function)

In [2]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import os

## Load data

In [3]:
INFO_verbose = False

In [4]:
%%time
st_template = pd.read_pickle('../data/model_v1.0_template.pk')

CPU times: user 4.94 ms, sys: 4.92 ms, total: 9.86 ms
Wall time: 35.3 ms


In [5]:
if INFO_verbose:
    st_template.info()

In [6]:
%%time
## load preprocessed data

if os.path.exists('../data/divvy_data_model_'+VERSION+'.pk'):
    print("Loading from previous pickle file.")
    data = pd.read_pickle('../data/divvy_data_model_'+VERSION+'.pk')
else:
    print("Create data set for this model... ")
    data_lst = []

    for year in [2013, 2014, 2015, 2016, 2017, 2018]:
        dt_tmp = pd.read_feather('../data/Final_Divvy_data_'+str(year)+'.feather')
        data_lst.append(dt_tmp)

    data = pd.concat(data_lst, ignore_index=True)


    data.to_pickle('../data/divvy_data_model_'+VERSION+'.pk')
    print("Data saved to pickle file")

Loading from previous pickle file.
CPU times: user 223 ms, sys: 713 ms, total: 937 ms
Wall time: 1.89 s


In [7]:
if INFO_verbose:
    data.info()

## Get target value

In [8]:
target_in = data.total_in
target_out = data.total_out

In [9]:
target_in_log = np.log(target_in+1)

## Prepare features

In [10]:
# Prescreening for useful features
feats = pd.merge(data[['station_id', 'month', 'dayofweek', 'year',
                       'icon_clear-day', 'icon_cloudy', 'icon_partly-cloudy-day', 
                       'icon_rain', 'icon_sleet', 'icon_snow',
                       'precipType_None', 'precipType_rain',
                       'precipType_sleet', 'precipType_snow',
                       'temperatureHigh', 'temperatureLow' 
                         ]], st_template, 
                    left_on='station_id', right_on='id').drop(['station_id', 'id'], axis=1)

## Reordering dataframe

In [11]:
# Reordering
cols = []
cols.extend(feats.columns[15:])
cols.extend(feats.columns[2:15])
cols.extend(feats.columns[:2])

In [12]:
feats_fnl = feats[cols]

In [25]:
if INFO_verbose:
    feats_fnl.info()

In [14]:
feats_fnl = feats_fnl.fillna(0)

## Get/Check indices for numerical columns 

In [15]:
num_col_in = [0, 1, 2, 50, 61, 62]

In [16]:
feats_fnl.iloc[:, num_col_in].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 947515 entries, 0 to 947514
Data columns (total 6 columns):
lon_ave            947515 non-null float64
lat_ave            947515 non-null float64
dp_max             947515 non-null float64
year               947515 non-null float64
temperatureHigh    947515 non-null float64
temperatureLow     947515 non-null float64
dtypes: float64(6)
memory usage: 50.6 MB


## Save model details into txt 

In [17]:
import io

buffer = io.StringIO()
feats_fnl.info(buf=buffer)
s = buffer.getvalue()

with open("../model_features_details/features_"+str(VERSION)+'.txt', "w", encoding="utf-8") as f: 
    f.write(s)

## Convert to numpy matrix 

In [18]:
features_ = feats_fnl.to_numpy()

In [19]:
features_

array([[-87.6277254 ,  41.87401502,  23.        , ...,  62.11      ,
          7.        ,   0.        ],
       [-87.6277254 ,  41.87401502,  23.        , ...,  61.23      ,
          7.        ,   1.        ],
       [-87.6277254 ,  41.87401502,  23.        , ...,  59.93      ,
          7.        ,   2.        ],
       ...,
       [-87.683282  ,  41.939354  ,  16.        , ...,  28.51      ,
         12.        ,   4.        ],
       [-87.683282  ,  41.939354  ,  16.        , ...,  31.15      ,
         12.        ,   6.        ],
       [-87.683282  ,  41.939354  ,  16.        , ...,  29.96      ,
         12.        ,   0.        ]])

## Building pipelines

In [20]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder # deprecated in 0.20
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

In [21]:
# %%time
# model = Pipeline([
#     ('ohe', OneHotEncoder(categorical_features=[63, 64], sparse=False)),
#     ('mms', MinMaxScaler(feature_range=(0, 1), copy=False)),
#     ('rg', Ridge(alpha=10))
# ])

# # Train Test split
# X_train, X_test, y_train, y_test = train_test_split(
#     features_, 
#     target_out, 
#     test_size=0.2, 
#     random_state=42)

# model_gs = GridSearchCV(
#     model,
#     {'rg__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
#     cv=5,
#     n_jobs=2
# )

# model_gs.fit(X_train, y_train)
# print("Best params: ", model_gs.best_params_)

In [22]:
%%time
model = Pipeline([
    ('ohe', OneHotEncoder(categorical_features=[63, 64], sparse=False)),
    ('mms', MinMaxScaler(feature_range=(0, 1), copy=False)),
    ('rf', RandomForestRegressor(n_estimators=10))
])

# Train Test split
X_train, X_test, y_train, y_test = train_test_split(
    features_, 
    target_out, 
    test_size=0.2, 
    random_state=42)

model_gs = GridSearchCV(
    model,
    {'rf__n_estimators': [50, 100, 200]},
    cv=5,
    n_jobs=2
)

CPU times: user 1.37 s, sys: 836 ms, total: 2.2 s
Wall time: 2.37 s


In [23]:
# %%time
# model_gs.fit(X_train, y_train)
# # model_gs.fit(features_, target_out)
# print("Best params: ", model_gs.best_params_)

In [26]:
feats_fnl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 947515 entries, 0 to 947514
Data columns (total 65 columns):
lon_ave                                  947515 non-null float64
lat_ave                                  947515 non-null float64
dp_max                                   947515 non-null float64
community_area_ALBANY_PARK               947515 non-null uint8
community_area_ARMOUR_SQUARE             947515 non-null uint8
community_area_AUSTIN                    947515 non-null uint8
community_area_AVALON_PARK               947515 non-null uint8
community_area_AVONDALE                  947515 non-null uint8
community_area_BRIDGEPORT                947515 non-null uint8
community_area_CHATHAM                   947515 non-null uint8
community_area_DOUGLAS                   947515 non-null uint8
community_area_EAST_GARFIELD_PARK        947515 non-null uint8
community_area_EDGEWATER                 947515 non-null uint8
community_area_ENGLEWOOD                 947515 non-null uint8
c

In [58]:
test = st_template.iloc[:, 1:]

In [59]:
test.shape

(618, 50)

In [60]:
test['year'] = 2019

In [61]:
test.shape

(618, 51)

In [62]:
icon_type = 1

for i in range(6):
    if i == icon_type:
        test['icon'+str(i)] = 1
    else:
        test['icon'+str(i)] = 0

In [63]:
test['prec0'] = 1
test['prec1'] = 0
test['prec2'] = 0
test['prec3'] = 0

In [64]:
test['hightmp'] = 50
test['lowtmp'] = 30

In [66]:
test['month'] = 1

In [68]:
import calendar

In [69]:
test['dayofweek'] = calendar.weekday(2019, 7, 17)

In [71]:
test.to_numpy()

array([[-87.62235567,  41.87413767,  39.        , ...,  30.        ,
          1.        ,   2.        ],
       [-87.61535533,  41.86722597,  55.        , ...,  30.        ,
          1.        ,   2.        ],
       [-87.613348  ,  41.856268  ,  23.        , ...,  30.        ,
          1.        ,   2.        ],
       ...,
       [-87.8025742 ,  41.87985   ,  11.        , ...,  30.        ,
          1.        ,   2.        ],
       [-87.8002935 ,  41.894213  ,  19.        , ...,  30.        ,
          1.        ,   2.        ],
       [-87.785236  ,  41.888085  ,  15.        , ...,  30.        ,
          1.        ,   2.        ]])

In [74]:
import pickle

In [75]:
%%time
with open('../model_data/random_forest_'+VERSION+'.pk', 'rb') as f:
    loaded_model = pickle.load(f)

CPU times: user 345 ms, sys: 841 ms, total: 1.19 s
Wall time: 2.43 s


In [76]:
loaded_model.predict(test)

array([5.540e+01, 4.530e+01, 6.380e+01, 1.097e+02, 2.860e+01, 1.090e+01,
       1.040e+01, 1.010e+01, 2.200e+00, 3.740e+01, 9.210e+01, 5.180e+01,
       7.000e+00, 0.000e+00, 6.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       3.700e+00, 5.200e+00, 4.230e+01, 1.940e+01, 8.300e+00, 4.000e+00,
       4.250e+01, 5.980e+01, 7.770e+01, 2.000e+01, 2.110e+01, 3.580e+01,
       5.000e+01, 2.920e+01, 1.450e+01, 2.000e+01, 1.591e+02, 9.500e+00,
       5.100e+00, 4.200e+00, 1.190e+01, 2.000e+01, 1.340e+01, 6.530e+01,
       1.560e+01, 4.090e+01, 6.080e+01, 4.000e+00, 2.120e+01, 6.900e+01,
       1.330e+01, 3.100e+00, 2.700e+00, 1.140e+01, 9.200e+00, 2.200e+00,
       9.400e+00, 1.900e+00, 1.340e+01, 1.650e+01, 1.870e+01, 1.490e+01,
       8.800e+00, 9.600e+00, 1.450e+01, 2.010e+01, 2.950e+01, 2.530e+01,
       1.540e+01, 1.880e+01, 7.400e+00, 4.400e+00, 

In [77]:
fdf = pd.DataFrame()

In [81]:
fdf['lon_ave'] = test.lon_ave
fdf['lat_ave'] = test.lat_ave
fdf['dp_max'] = test.dp_max

In [82]:
fdf['total_in'] = loaded_model.predict(test)

In [83]:
fdf

Unnamed: 0,lon_ave,lat_ave,dp_max,total_in
0,-87.622356,41.874138,39.0,55.4
1,-87.615355,41.867226,55.0,45.3
2,-87.613348,41.856268,23.0,63.8
3,-87.627725,41.874015,23.0,109.7
4,-87.612798,41.885364,39.0,28.6
5,-87.617517,41.886349,19.0,10.9
6,-87.680604,41.828792,19.0,10.4
7,-87.576450,41.766638,11.0,10.1
8,-87.565688,41.766409,15.0,2.2
9,-87.652695,41.932451,35.0,37.4


In [84]:
import pickle

In [88]:
pickle.VERSION

AttributeError: module 'pickle' has no attribute 'VERSION'