# Automated model building

This notebook extracts and prepares data for automated model building and evaluation in the project Digital Water City. 

In [1]:
from ews.models import BathingSpot
from django_pandas.io import read_frame
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt

In [4]:
data = FeatureData.objects.all()

In [5]:
data = read_frame(data, index_col = "date")

TypeError: argument must be int or float

In [3]:
data.info()

AttributeError: 'QuerySet' object has no attribute 'info'

In [163]:
data_long =data.pivot(columns='station', values='value')


In [164]:
data_long

station,Spandau,flow
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00+00:00,2.0,2.0
2020-01-02 00:00:00+00:00,3.0,3.0
2020-01-03 00:00:00+00:00,4.0,4.0
2020-01-04 00:00:00+00:00,5.0,5.0
2020-01-05 00:00:00+00:00,6.0,6.0
2020-01-06 00:00:00+00:00,7.0,7.0
2020-01-07 00:00:00+00:00,0.0,0.0
2020-01-08 00:00:00+00:00,0.0,0.0
2020-01-09 00:00:00+00:00,4.0,4.0
2020-01-10 00:00:00+00:00,5.0,5.0


In [165]:
data_long.rolling('3D').mean()

station,Spandau,flow
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00+00:00,2.0,2.0
2020-01-02 00:00:00+00:00,2.5,2.5
2020-01-03 00:00:00+00:00,3.0,3.0
2020-01-04 00:00:00+00:00,4.0,4.0
2020-01-05 00:00:00+00:00,5.0,5.0
2020-01-06 00:00:00+00:00,6.0,6.0
2020-01-07 00:00:00+00:00,4.333333,4.333333
2020-01-08 00:00:00+00:00,2.333333,2.333333
2020-01-09 00:00:00+00:00,1.333333,1.333333
2020-01-10 00:00:00+00:00,3.0,3.0


In [166]:
iterator = data.station.unique()

In [167]:
iterator

array(['Spandau', 'flow'], dtype=object)

In [168]:
results = []
for i in iterator:
    for j in [1, 2, 3, 4, 5]:
        df = pd.DataFrame()
        df[i + '_shift_'+ str(j)] = data_long[i].rolling(window=j).mean().shift(1)
        results.append(df)
    
    

In [169]:
res = pd.concat(results, axis = 1)

In [170]:
res

Unnamed: 0_level_0,Spandau_shift_1,Spandau_shift_2,Spandau_shift_3,Spandau_shift_4,Spandau_shift_5,flow_shift_1,flow_shift_2,flow_shift_3,flow_shift_4,flow_shift_5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-01 00:00:00+00:00,,,,,,,,,,
2020-01-02 00:00:00+00:00,2.0,,,,,2.0,,,,
2020-01-03 00:00:00+00:00,3.0,2.5,,,,3.0,2.5,,,
2020-01-04 00:00:00+00:00,4.0,3.5,3.0,,,4.0,3.5,3.0,,
2020-01-05 00:00:00+00:00,5.0,4.5,4.0,3.5,,5.0,4.5,4.0,3.5,
2020-01-06 00:00:00+00:00,6.0,5.5,5.0,4.5,4.0,6.0,5.5,5.0,4.5,4.0
2020-01-07 00:00:00+00:00,7.0,6.5,6.0,5.5,5.0,7.0,6.5,6.0,5.5,5.0
2020-01-08 00:00:00+00:00,0.0,3.5,4.333333,4.5,4.4,0.0,3.5,4.333333,4.5,4.4
2020-01-09 00:00:00+00:00,0.0,0.0,2.333333,3.25,3.6,0.0,0.0,2.333333,3.25,3.6
2020-01-10 00:00:00+00:00,4.0,2.0,1.333333,2.75,3.4,4.0,2.0,1.333333,2.75,3.4


In [171]:
y = data_long["Spandau"]

In [172]:
X = res.fillna(0)

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)


In [174]:
rf = RandomForestRegressor()

In [175]:
rf.fit(X_train, y_train)

RandomForestRegressor()

In [176]:

# Predict the test set labels
y_pred = rf.predict(X_test)

# Evaluate the test set RMSE
rmse_test = MSE(y_test, y_pred)**(1/2)

# Print rmse_test
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))

Test set RMSE of rf: 17.58


In [177]:
# Create a pd.Series of features importances
importances = pd.DataFrame({'importances':rf.feature_importances_,
                        'predictor': X_train.columns})



In [178]:
# Sort importances
importances_sorted = importances.sort_values(by=['importances'])



In [179]:
# Draw a horizontal barplot of importances_sorted
importances_sorted

Unnamed: 0,importances,predictor
6,0.034088,flow_shift_2
3,0.054332,Spandau_shift_4
2,0.057912,Spandau_shift_3
7,0.059758,flow_shift_3
8,0.065424,flow_shift_4
1,0.083202,Spandau_shift_2
9,0.10776,flow_shift_5
4,0.1294,Spandau_shift_5
0,0.176248,Spandau_shift_1
5,0.231875,flow_shift_1


In [182]:
import plotly.express as px

fig = px.bar(importances_sorted, x="importances", y="predictor", orientation='h')
fig.show()

In [181]:
rf.predict(X_train)

array([ 2.68,  7.78,  3.62,  5.42,  4.41,  4.33, 13.57,  1.71, 36.33])