# Modeling

In [43]:
import pandas as pd
import numpy as np

df = pd.read_csv('feature_df.csv', parse_dates=True)
df.set_index(pd.DatetimeIndex(df['Date']))
df

Unnamed: 0,Date,amount_dispensed,t_minus_1,t_minus_2,t_minus_3,t_minus_7,weekday,month,pm25,pm10
0,2020-07-13,240.0,0.0,500.0,240.0,240.0,0,7,,10.0
1,2020-07-14,240.0,240.0,0.0,500.0,220.0,1,7,18.0,20.0
2,2020-07-15,0.0,240.0,240.0,0.0,380.0,2,7,22.0,24.0
3,2020-07-16,0.0,0.0,240.0,240.0,700.0,3,7,21.0,26.0
4,2020-07-17,620.0,0.0,0.0,240.0,240.0,4,7,21.0,17.0
...,...,...,...,...,...,...,...,...,...,...
388,2021-08-05,0.0,720.0,1920.0,60.0,860.0,3,8,,
389,2021-08-06,0.0,0.0,720.0,1920.0,280.0,4,8,,
390,2021-08-07,260.0,0.0,0.0,720.0,120.0,5,8,,
391,2021-08-08,0.0,260.0,0.0,0.0,480.0,6,8,,


# Predicting how much cash will be dispensed on a given day based on the Air Quality and the amount dispensed in the recent past.

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer



simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
target = 'amount_dispensed'
features = [a for a in model_df.columns if a != target]

X = model_df[features]
y = model_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 


simple_imputer.fit(X_train)
X_train = pd.DataFrame(data=simple_imputer.transform(X_train), columns = X.columns)
X_test = pd.DataFrame(data=simple_imputer.transform(X_test), columns = X.columns)


Unnamed: 0,t_minus_1,t_minus_2,t_minus_3,t_minus_7,weekday,month
0,500.0,1160.0,720.0,1540.0,0.0,6.0
1,140.0,480.0,100.0,840.0,0.0,1.0
2,1380.0,300.0,0.0,300.0,4.0,1.0
3,200.0,100.0,440.0,60.0,0.0,9.0
4,660.0,1140.0,1860.0,200.0,5.0,6.0
...,...,...,...,...,...,...
309,0.0,400.0,100.0,280.0,1.0,9.0
310,1000.0,460.0,660.0,220.0,1.0,10.0
311,680.0,780.0,620.0,1540.0,4.0,4.0
312,600.0,500.0,280.0,1220.0,5.0,6.0


In [52]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
# you only have 6 features 
RandomForestRegressor?


[0;31mInit signature:[0m
[0mRandomForestRegressor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'mse'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_split[0m[0;34m=[0m[0;32mNone[0m[0;34m

# Testing parameters


In [63]:
def get_train_test_split(model_df):
    simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    target = 'amount_dispensed'
    features = [a for a in model_df.columns if a != target]

    X = model_df[features]
    y = model_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

    simple_imputer.fit(X_train)
    X_train = pd.DataFrame(data=simple_imputer.transform(X_train), columns = X.columns)
    X_test = pd.DataFrame(data=simple_imputer.transform(X_test), columns = X.columns)

    return X_train, X_test, y_train, y_test 


In [95]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
random_forest_outcomes = pd.DataFrame(columns=['max_depth', 'n_estimators', 'max_features', 'mean_absolute_error'])

predicted_true_points = []

for max_depth in [None, 2,3]:
    for n_estimators in [100,200,500]:
        for max_features in [2,4,6]:
            X_train, X_test, y_train, y_test = get_train_test_split(model_df) # get different train test splits with a different imputer at each call.

            rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth= max_depth, max_features=max_features, random_state=42)
            rf_model.fit(X_train,y_train)
            y_pred = rf_model.predict(X_test)

            points = list(zip(y_pred, y_test))
            predicted_true_points.append(points)

            mse = round(mean_absolute_error(y_test,y_pred),2)

            row = [max_depth,n_estimators,max_features,mse]
            random_forest_outcomes.loc[len(random_forest_outcomes)] = row
            print(row)


[None, 100, 2, 347.23]
[None, 100, 4, 319.45]
[None, 100, 6, 348.52]
[None, 200, 2, 296.8]
[None, 200, 4, 323.86]
[None, 200, 6, 327.46]
[None, 500, 2, 346.48]
[None, 500, 4, 311.79]
[None, 500, 6, 292.88]
[2, 100, 2, 311.84]
[2, 100, 4, 325.7]
[2, 100, 6, 281.74]
[2, 200, 2, 319.43]
[2, 200, 4, 332.61]
[2, 200, 6, 304.92]
[2, 500, 2, 316.81]
[2, 500, 4, 317.97]
[2, 500, 6, 293.4]
[3, 100, 2, 289.83]
[3, 100, 4, 304.15]
[3, 100, 6, 335.18]
[3, 200, 2, 302.22]
[3, 200, 4, 339.83]
[3, 200, 6, 292.65]
[3, 500, 2, 271.14]
[3, 500, 4, 306.17]
[3, 500, 6, 329.72]


In [94]:
for points in predicted_true_points:
    
    

[(559.0, 460.0),
 (344.0, 1020.0),
 (515.4, 440.0),
 (547.2, 0.0),
 (410.4, 100.0),
 (462.6, 440.0),
 (689.2, 260.0),
 (406.0, 60.0),
 (287.8, 160.0),
 (495.2, 1240.0),
 (479.0, 620.0),
 (347.0, 1040.0),
 (491.4, 240.0),
 (496.4, 740.0),
 (379.2, 440.0),
 (570.6, 1260.0),
 (668.4, 860.0),
 (602.2, 0.0),
 (544.6, 220.0),
 (465.0, 980.0),
 (352.6, 860.0),
 (329.8, 180.0),
 (436.8, 660.0),
 (308.2, 0.0),
 (1114.8, 240.0),
 (565.4, 0.0),
 (431.6, 1680.0),
 (407.0, 480.0),
 (330.2, 0.0),
 (458.2, 60.0),
 (557.6, 100.0),
 (465.4, 380.0),
 (421.8, 1000.0),
 (546.4, 500.0),
 (452.0, 1280.0),
 (752.6, 20.0),
 (429.6, 400.0),
 (265.4, 480.0),
 (559.8, 1160.0),
 (528.6, 1100.0),
 (313.6, 360.0),
 (487.8, 120.0),
 (249.2, 140.0),
 (462.0, 1380.0),
 (336.8, 400.0),
 (471.6, 220.0),
 (395.6, 20.0),
 (390.2, 400.0),
 (421.6, 820.0),
 (506.4, 300.0),
 (481.4, 200.0),
 (500.2, 400.0),
 (344.8, 1040.0),
 (646.0, 260.0),
 (624.0, 1540.0),
 (645.2, 620.0),
 (464.4, 720.0),
 (381.0, 280.0),
 (626.2, 420.0)