## Naive Regression procedure

In [22]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt


In [23]:
def create_X_y(df, timestamps_per_week=24 * 7, prediction_timestamps=24,
               prediction_step=3, target_column='main_level'):
    """
    Build sliding-window samples (X) and multi-step targets (y) from a
    time-ordered frame with one row per timestamp.

    X has the following format:
    One window per row (row == sample). Each row is the window's slice of
    ``df`` flattened in row-major order, i.e. every column of the first
    timestamp, then every column of the second timestamp, and so on:

    [col_0(t_0), col_1(t_0), ..., col_k(t_0),
     col_0(t_1), col_1(t_1), ..., col_k(t_1),
     ...
     col_0(t_N), col_1(t_N), ..., col_k(t_N)]

    With the defaults and a 9-column frame (timestamp + 8 readings) this is
    9 columns * 24 h * 7 days = 1512 entries per sample.

    y holds, for each sample, the values of ``target_column`` taken every
    ``prediction_step`` rows out of the ``prediction_timestamps`` rows that
    directly follow the window (8 values per sample with the defaults).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; rows must already be in chronological order.
    timestamps_per_week : int
        Number of consecutive rows in one input window.
    prediction_timestamps : int
        Number of rows after the window that form the prediction horizon.
    prediction_step : int
        Stride used when picking target rows inside the horizon.
    target_column : str
        Column of ``df`` whose future values are predicted.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        X has shape (samples, timestamps_per_week * df.shape[1]) and y has
        shape (samples, ceil(prediction_timestamps / prediction_step)), where
        samples = len(df) - timestamps_per_week - prediction_timestamps.
        When ``df`` is too short for a single sample, both arrays are empty
        but keep their 2-D shape so callers can still read ``X.shape[1]``.
    """
    n_rows, n_cols = df.shape
    samples = n_rows - timestamps_per_week - prediction_timestamps

    if samples <= 0:
        n_targets = len(range(0, prediction_timestamps, prediction_step))
        return (np.empty((0, timestamps_per_week * n_cols)),
                np.empty((0, n_targets)))

    # Convert once instead of slicing the DataFrame on every iteration.
    values = df.to_numpy()
    target = df[target_column].to_numpy()

    X = []
    y = []
    for i in range(samples):
        window_end = i + timestamps_per_week
        X.append(values[i:window_end].flatten())
        y.append(target[window_end:
                        window_end + prediction_timestamps:
                        prediction_step])
    return np.array(X), np.array(y)





def get_sample_dataframe(X, index, n_columns=9):
    """
    Rebuild one flattened sample of X as a DataFrame with one row per timestamp.

    Parameters
    ----------
    X : numpy.ndarray
        2-D sample matrix in the layout produced by create_X_y(), i.e. each
        row is a window flattened timestamp by timestamp.
    index : int
        Row of X to unpack; must satisfy 0 <= index < X.shape[0].
    n_columns : int
        Number of values stored per timestamp in a row of X (9 while the
        timestamp column is still part of X).

    Returns
    -------
    pandas.DataFrame
        The first value of every timestamp group becomes the index, the
        remaining ``n_columns - 1`` values become the columns.

    Raises
    ------
    AssertionError
        If ``index`` is outside the valid row range of X.
    """
    # Strict upper bound: X.shape[0] itself is already one past the last row.
    assert 0 <= index < X.shape[0]

    sample_mat = X[index].reshape((-1, n_columns))
    sample_df = pd.DataFrame(sample_mat[:, 1:], index=sample_mat[:, 0])

    return sample_df

def plot_in_2d(X, title=None):
    """
    Scatter-plot X after projecting it onto its first two principal components.

    X: 2-D array-like, one sample per row.
    title: optional figure title; no title is set when it is None.
    """
    projector = PCA(n_components=2).fit(X)
    projected = projector.transform(X)
    first_component, second_component = projected[:, 0], projected[:, 1]
    plt.scatter(first_component, second_component)

    if title is not None:
        plt.title(title)
    plt.show()
    
# Root Mean Square Error
def rmse(y_pred, y_true):
    """Return the square root of the mean squared difference between y_pred and y_true."""
    squared_error = (y_pred - y_true) ** 2
    mean_squared_error = np.mean(squared_error)
    return np.sqrt(mean_squared_error)

#### Load data in ML representation X, y

In [24]:
# Load the preprocessed hourly station readings (one row per timestamp).
sample_df = pd.read_csv(filepath_or_buffer="./preprocessed_stations.csv")
# sample_df

Unnamed: 0,time,main_level,main_flow,a_temp,a_status,a_rain,c_temp,c_status,c_rain
0,2014-01-01 01:00:00,182.0,7.19,2.6,2,0.0,2.8,2,0.0
1,2014-01-01 02:00:00,182.0,7.19,2.4,2,0.0,2.5,2,0.0
2,2014-01-01 03:00:00,182.0,7.19,1.9,2,0.0,1.9,2,0.0
3,2014-01-01 04:00:00,182.0,7.19,2.0,2,0.0,2.1,2,0.0
4,2014-01-01 05:00:00,182.0,7.19,1.7,2,0.0,2.1,2,0.0
...,...,...,...,...,...,...,...,...,...
34828,2017-12-31 19:00:00,255.0,15.90,8.9,4,0.1,9.0,4,0.0
34829,2017-12-31 20:00:00,259.0,16.40,8.9,4,0.0,9.3,3,0.0
34830,2017-12-31 21:00:00,263.0,16.90,8.9,4,0.0,9.2,4,0.0
34831,2017-12-31 22:00:00,268.0,17.50,8.7,4,0.2,8.9,3,0.5


In [None]:
X, y = create_X_y(sample_df)

# Remove timestamps
# Each sample is a row-major flattening of its window, so the first column of
# sample_df (the timestamp) recurs every sample_df.shape[1] entries.
columns_per_timestamp = sample_df.shape[1]
X = np.delete(X, slice(0, X.shape[1], columns_per_timestamp), axis=1)
# The timestamp strings forced X into an object array; what is left is numeric,
# so store it as floats for the estimators below.
X = X.astype(float)
# X

#### 1. Split Train and Test set
#### 2. Reduce Dimensions via PCA
#### 3. Train LinearRegression Model on Training Set
#### 4. Test LinearRegression Model on Test Set

In [None]:
#1 Split into train and test sets.
# random_state pins the shuffle so the RMSE values below are reproducible
# after a kernel restart.
# NOTE(review): the samples are overlapping sliding windows, so a shuffled split
# places near-identical windows in both sets — consider a chronological split
# (shuffle=False) for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# plot_in_2d(X, title='Train set raw')

#2 Reduce dimensions: the PCA is fit on the training set only. X_test stays raw
# and is projected with the same fitted reducer at prediction time.
dim_reducer = PCA(n_components=10).fit(X_train)
X_train = dim_reducer.transform(X_train)

# plot_in_2d(X_train, title='Train set reduced to 10 dimensions via PCA')
# Show only the shapes instead of dumping the whole raw test matrix.
X_train.shape, X_test.shape

### LinearRegression

In [None]:
#3 train the linear model on the PCA-reduced training set
l_reg = LinearRegression().fit(X_train, y_train)

#4 make predictions on the testing set, projected with the same fitted PCA
y_pred = l_reg.predict(dim_reducer.transform(X_test))
# RMSE between the predicted (y_pred) and the actual (y_test) response values
rmse_result = rmse(y_pred=y_pred, y_true=y_test)
rmse_result

### DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single decision tree on the PCA-reduced training set.
dtr = DecisionTreeRegressor().fit(X_train, y_train)

# score = dtr.score(X_train, y_train)

# Project the raw test set with the fitted PCA before predicting.
dtr_y_pred = dtr.predict(dim_reducer.transform(X_test))
# RMSE between the predicted (dtr_y_pred) and the actual (y_test) response values
print(rmse(y_pred=dtr_y_pred, y_true=y_test))


### RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit an actual random forest on the PCA-reduced training set (this cell
# previously instantiated DecisionTreeRegressor by mistake).
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

# score = rfr.score(X_train, y_train)

# Project the raw test set with the fitted PCA before predicting.
rfr_y_pred = rfr.predict(dim_reducer.transform(X_test))
# compare actual response values (y_test) with predicted response values (rfr_y_pred)
print(rmse(rfr_y_pred, y_test))


The **RMSE decreased** when we use **DecisionTreeRegressor** as the model. (Error is something we want to minimize, so **a lower number for RMSE is better**.)

### Persist the model

For the evaluation, the persisted model will be used by calling _obj.predict(X)_ on the target data.

The data will be in the _ML representation X,y_ format above - as returned by the function _create_X_y()_.

In [None]:
# NOTE(review): dtr was fit on PCA-reduced features (dim_reducer.transform), so the
# pickled object on its own cannot predict on raw create_X_y() output — persist the
# fitted dim_reducer together with the model (e.g. as one pipeline object).
# pickle.dump(dtr, open("model.obj", "wb"))

In [None]:
# pickle.load expects an open binary file object, not a path string.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
# with open("model.obj", "rb") as f:
#     model_obj = pickle.load(f)

# Verify Input Data