In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load
from ds_code.function.utils import sliding_window
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV, GridSearchCV
from sklearn.metrics import root_mean_squared_error
import random

In [2]:
# Static per-city attributes (lat, lng, population) indexed by city id.
# They are appended to every sample further down so the model can tell
# locations apart.
city_data = pd.read_csv(
    "data/region/vietnam/extra_info.csv",
    index_col=0,
)
city_data

Unnamed: 0_level_0,lat,lng,population
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1704774326,10.7756,106.7019,1.513600e+07
1704413791,21.0000,105.8500,8.246600e+06
1704000623,20.8651,106.6838,2.103500e+06
1704783472,10.0333,105.7833,1.237300e+06
1704863046,10.9500,106.8167,1.104000e+06
...,...,...,...
1704495953,22.8333,104.9833,5.555900e+04
1704000217,22.1333,105.8333,4.503600e+04
1704983526,22.3992,103.4392,4.297300e+04
1704988146,14.3544,108.0075,6.586050e+05


In [3]:
# Build the flat feature matrix and the target array: for every city, window
# its weather / air-quality series with `sliding_window`, flatten each feature
# window to one row, and append that city's static attributes as three extra
# columns. Two CSV files are read per city, so this cell takes a few minutes.
weather_np = []
air_np = []

for city_id in city_data.index:
    air_df = pd.read_csv(f"data/air_quality/{city_id}.csv")
    weather_df = pd.read_csv(f"data/weather/{city_id}.csv")

    # Keep only rows in which every column after the first is non-negative,
    # then discard the "aqi" column. Chained, non-mutating calls avoid pandas'
    # SettingWithCopyWarning that in-place edits on a filtered frame can raise.
    # NOTE(review): presumably column 0 is a timestamp and negative readings
    # are sentinel values — confirm against the data source.
    valid_rows = (air_df.iloc[:, 1:] >= 0).all(axis=1)
    air_df = air_df.loc[valid_rows].drop(columns="aqi").reset_index(drop=True)

    weather_df = weather_df.dropna(axis=0).reset_index(drop=True)

    # NOTE(review): the two frames are filtered independently, so after
    # reset_index the same row position may no longer describe the same
    # moment in both — verify that sliding_window re-aligns them.
    X, y = sliding_window(weather_df, air_df, target_size="same")
    m = X.shape[0]
    X = X.reshape((m, -1))

    # Positional access must go through .iloc: `series[0]` on a label-indexed
    # Series is deprecated and is treated as a label lookup by newer pandas.
    extra_attr = city_data.loc[city_id]
    lat = np.full((m, 1), extra_attr.iloc[0])
    lng = np.full((m, 1), extra_attr.iloc[1])
    population = np.full((m, 1), extra_attr.iloc[2])
    X = np.hstack((X, lat, lng, population))

    weather_np.append(X)
    air_np.append(y)

# Stack the per-city chunks along the sample axis.
weather_np = np.vstack(weather_np)
air_np = np.vstack(air_np)

In [None]:
#dump((weather_np, air_np), "forest_raw_dataset.pkl")

['forest_raw_dataset.pkl']

In [2]:
# Reload the cached (weather_np, air_np) pair instead of rebuilding it from the CSVs.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that were produced locally / come from a trusted source.
weather_np, air_np = load("forest_raw_dataset.pkl")

In [3]:
# Keep only the last entry along axis 1 of the targets.
# NOTE(review): this overwrites air_np, so the cell is not idempotent —
# running it twice slices a second time. Re-run the load cell first.
air_np = air_np[:, -1]

In [4]:
# Cast features and targets to float32.
# NOTE(review): presumably done to reduce memory versus float64 — confirm the
# precision loss is acceptable for the lat/lng/population columns.
weather_np = weather_np.astype("float32")
air_np = air_np.astype("float32")

In [5]:
# Reproducible shuffled split: seed Python's RNG, permute all row indices,
# then cut the permutation at a fixed position (train before, test after).
# NOTE(review): the cut position is hard-coded; if the dataset has no more
# rows than that, the test split ends up empty.
random.seed(42)
idx = list(range(len(weather_np)))
random.shuffle(idx)

train_idx = idx[:1800000]
test_idx = idx[1800000:]

X_train, y_train = weather_np[train_idx], air_np[train_idx]
X_test, y_test = weather_np[test_idx], air_np[test_idx]

In [6]:
# Feature pipeline: standardize the inputs, then fit a random forest.
# The step names ("scaler", "forest") are part of the hyper-parameter keys used
# by the grid search, so they must not be renamed.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # random_state pins bootstrap sampling and split selection so that repeated
    # fits (and therefore the grid-search ranking) are reproducible; 42 matches
    # the seed used for the train/test shuffle.
    ('forest', RandomForestRegressor(n_estimators=30, n_jobs=-1, random_state=42))
])

# Wrap the pipeline so the targets are standardized before fitting and the
# predictions are mapped back to the original target scale.
model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

In [7]:
# Disabled plotting snippet, kept as a bare string literal (the cell only
# echoes the text). NOTE(review): `pred` is not defined in any visible cell —
# it has to be computed before this snippet can be re-enabled.
'''
import matplotlib.pyplot as plt
plt.plot(pred[:, 2])
plt.plot(air_np[10000:10200][:, 2])
plt.show()
'''

'\nimport matplotlib.pyplot as plt\nplt.plot(pred[:, 2])\nplt.plot(air_np[10000:10200][:, 2])\nplt.show()\n'

In [8]:
def custom_scorer(y_true, y_pred):
    """Return the negated RMSE of targets standardized with ``y_true``'s statistics.

    A ``StandardScaler`` is fitted on ``y_true`` and applied to both arrays, so
    every target column is compared on the same standardized scale. The
    per-column RMSEs are averaged with uniform weight and the result is
    negated, so that a larger score means a better fit.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_targets)
        Ground-truth targets.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_targets)
        Predicted targets.

    Returns
    -------
    float
        ``-RMSE`` expressed in standardized target units.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # StandardScaler rejects 1-D input; treat a single target as one column.
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)

    scaler = StandardScaler()
    scaled_y_true = scaler.fit_transform(y_true)
    scaled_y_pred = scaler.transform(y_pred)
    return -root_mean_squared_error(scaled_y_true,
                                    scaled_y_pred,
                                    multioutput="uniform_average")


# make_scorer's default (greater_is_better=True) is intended here: the value
# returned by custom_scorer is already negated.
scoring = make_scorer(custom_scorer)

In [9]:
# Random-forest hyper-parameters explored by the search. Each key addresses
# the "forest" step of the pipeline that is wrapped as the "regressor".
param_grid = {}
param_grid["regressor__forest__max_depth"] = [30, 40, 50]          # range 30-50
param_grid["regressor__forest__min_samples_split"] = [5, 10, 20]   # range 5-20
param_grid["regressor__forest__min_samples_leaf"] = [2, 5, 10]     # range 2-10

In [10]:
# Exhaustive grid search over `param_grid` with 3-fold CV, scored by `scoring`.
# Despite its name, `halving_tuner` holds a plain GridSearchCV; the name is a
# leftover from the successive-halving variant kept here for reference:
# HalvingGridSearchCV(model, param_grid, scoring=scoring, factor=3, verbose=1, cv=3, n_jobs=-1)
halving_tuner = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scoring,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [11]:
# Run the search on the first 50 training rows only.
# NOTE(review): presumably a smoke-test subset — a ranking obtained from 50
# rows says little about the full data; confirm before relying on it.
tuning_rows = slice(None, 50)
halving_tuner.fit(X_train[tuning_rows], y_train[tuning_rows])

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [12]:
# Cross-validation results as a table, best-ranked candidate first.
cv_results = pd.DataFrame(halving_tuner.cv_results_)
cv_results.sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__forest__max_depth,param_regressor__forest__min_samples_leaf,param_regressor__forest__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
21,0.090508,0.011972,0.032437,0.0015,50,5,5,"{'regressor__forest__max_depth': 50, 'regresso...",-1.092308,-0.869773,-0.98565,-0.982577,0.090875,1
7,0.092391,0.001673,0.028893,0.00253,30,10,10,"{'regressor__forest__max_depth': 30, 'regresso...",-1.054348,-0.9314,-0.968582,-0.984777,0.051483,2
17,0.089545,0.011146,0.035039,0.001676,40,10,20,"{'regressor__forest__max_depth': 40, 'regresso...",-1.027182,-0.951708,-0.987378,-0.988756,0.030827,3
12,0.104663,0.007227,0.036006,0.004265,40,5,5,"{'regressor__forest__max_depth': 40, 'regresso...",-1.114754,-0.90469,-0.981587,-1.000344,0.086778,4
26,0.087056,0.001603,0.022293,0.001452,50,10,20,"{'regressor__forest__max_depth': 50, 'regresso...",-1.054292,-0.944037,-1.011025,-1.003118,0.045357,5
4,0.159977,0.007988,0.030363,0.001571,30,5,10,"{'regressor__forest__max_depth': 30, 'regresso...",-1.131271,-0.866042,-1.014711,-1.004008,0.108543,6
8,0.095373,0.00444,0.029908,0.002096,30,10,20,"{'regressor__forest__max_depth': 30, 'regresso...",-1.027419,-0.973097,-1.02284,-1.007785,0.0246,7
25,0.096001,0.006439,0.023484,0.00218,50,10,10,"{'regressor__forest__max_depth': 50, 'regresso...",-1.103293,-0.943439,-0.977901,-1.008211,0.068689,8
13,0.094474,0.001873,0.030158,0.001561,40,5,10,"{'regressor__forest__max_depth': 40, 'regresso...",-1.106,-0.919144,-1.000803,-1.008649,0.076485,9
15,0.101039,0.007498,0.027868,0.003062,40,10,5,"{'regressor__forest__max_depth': 40, 'regresso...",-1.059857,-0.971978,-0.996397,-1.009411,0.037038,10


In [19]:
#dump(halving_tuner.best_estimator_, "random_forest.pkl", compress=0) 

['random_forest.pkl']

In [6]:
# Load the persisted estimator from disk rather than refitting it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model2 = load("random_forest.pkl")

In [23]:
#model2.predict(X_train[:20000])

In [8]:
#dump(model2, "random_forest.pkl", compress=3) 

['random_forest.pkl']

In [7]:
# Sanity check: predictions of the loaded model for the first 50 training rows.
# NOTE(review): if model2 was fitted on X_train these are in-sample
# predictions and do not measure generalization — use X_test for that.
model2.predict(X_train[:50])

array([[7.13484572e+02, 7.78796147e+00, 2.09239633e+01, 1.87766312e+00,
        4.94172221e+01, 5.56190263e+01],
       [2.86986139e+02, 4.99256636e-01, 2.65893673e+01, 2.66387426e-01,
        1.18577749e+00, 1.34451177e+00],
       [4.21056898e+02, 6.05820734e+00, 3.61149976e+01, 3.54565724e+00,
        1.52820470e+01, 2.10466495e+01],
       [6.08009389e+02, 1.42187889e+01, 3.02743232e+01, 3.64824879e+00,
        2.00258078e+01, 2.34178249e+01],
       [7.44474059e+02, 1.48106634e+01, 2.78178858e+01, 5.38319203e+00,
        2.28182174e+01, 2.48759620e+01],
       [5.45760509e+02, 8.65154208e+00, 1.04951416e+02, 1.09152856e+01,
        8.10286315e+01, 9.11381623e+01],
       [5.00848977e+02, 1.21823120e+00, 4.31036241e+01, 4.35608631e-01,
        1.63396147e+01, 1.69884782e+01],
       [8.64360570e+02, 3.62553922e+00, 1.72724282e+02, 5.85682784e+00,
        9.81619773e+01, 1.09577969e+02],
       [6.95399474e+02, 1.51230445e+01, 1.62992056e+01, 5.07634168e+00,
        3.19296612e+01, 