In [28]:
import pickle
import pandas as pd
import numpy as np

# Input files: one CSV per measuring station.
file_station_a = "sample_42_station_a.csv"
file_station_c = "sample_42_station_c.csv"
file_station_main = "sample_42_station_main.csv"
# Output file for the combined table. It has to be defined here because the
# export of `all_stations` further down writes to this path.
file_all_station = "sample_42_stations.csv"

In [29]:
# Every station file is read the same way: the "time" column becomes the
# index and is parsed into datetimes.
station_a, station_c, station_main = (
    pd.read_csv(csv_path, index_col="time", parse_dates=True)
    for csv_path in (file_station_a, file_station_c, file_station_main)
)

In [30]:
def create_X_y(df, timestamps_per_week=24 * 7, prediction_timestamps=24, prediction_step=3):
    """
    Build sliding-window samples (X) and their prediction targets (y).

    X has the following format:
    One window (one week with the defaults) per row, row == sample.
    A row is the window flattened timestamp by timestamp: all columns of the
    first timestamp, then all columns of the second one, and so on, i.e.
    ``timestamps_per_week * df.shape[1]`` entries per sample.

    Only the columns of ``df`` end up in X; the index is not included.
    NOTE(review): ``get_sample_dataframe`` reshapes a sample to 9 values per
    timestamp and uses the first one as index, so ``df`` is presumably expected
    to carry the timestamp as a regular column (e.g. after ``reset_index()``)
    -- confirm against the caller.

    y holds, for every sample, the ``main_level`` values taken from the
    ``prediction_timestamps`` rows that directly follow the window, keeping
    every ``prediction_step``-th row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows in time order; must contain a ``main_level`` column.
    timestamps_per_week : int, default 24 * 7
        Number of consecutive rows that form one sample.
    prediction_timestamps : int, default 24
        Number of rows after the window that make up the prediction horizon.
    prediction_step : int, default 3
        Stride used when picking target rows from the horizon.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(samples, timestamps_per_week * df.shape[1])``.
    y : numpy.ndarray
        Shape ``(samples, ceil(prediction_timestamps / prediction_step))``.
        Both arrays are empty when ``df`` has too few rows for one sample.

    Raises
    ------
    KeyError
        If ``df`` has no ``main_level`` column.
    """
    timestamps = df.shape[0]

    # Number of window start positions that are used. The count is kept exactly
    # as before so existing results keep their shape.
    samples = timestamps - timestamps_per_week - prediction_timestamps
    print(samples)

    # Convert once up front instead of slicing the DataFrame in every iteration.
    values = df.to_numpy()
    main_level = df['main_level'].to_numpy()

    X = np.array([
        values[start:start + timestamps_per_week].flatten()
        for start in range(samples)
    ])
    y = np.array([
        main_level[start + timestamps_per_week:
                   start + timestamps_per_week + prediction_timestamps:
                   prediction_step]
        for start in range(samples)
    ])
    return X, y





def get_sample_dataframe(X, index, timestamps_per_sample=24 * 7, values_per_timestamp=9):
    """
    Turn one flattened sample of X back into a DataFrame.

    The row ``X[index]`` is reshaped to
    ``(timestamps_per_sample, values_per_timestamp)``; the first value of every
    timestamp becomes the index of the result and the remaining values become
    its columns.

    Parameters
    ----------
    X : numpy.ndarray
        2-D sample matrix with one flattened sample per row.
    index : int
        Row of X to convert; must satisfy ``0 <= index < X.shape[0]``.
    timestamps_per_sample : int, default 24 * 7
        Number of timestamps stored in one sample.
    values_per_timestamp : int, default 9
        Number of values stored per timestamp, including the leading one that
        is used as index.

    Returns
    -------
    pandas.DataFrame
        ``timestamps_per_sample`` rows and ``values_per_timestamp - 1`` columns.

    Raises
    ------
    AssertionError
        If ``index`` is outside the valid row range of X.
    """
    # The upper bound is exclusive: X.shape[0] itself is not a valid row.
    assert 0 <= index < X.shape[0], "index out of range for X"

    sample_mat = X[index].reshape((timestamps_per_sample, values_per_timestamp))
    sample_df = pd.DataFrame(sample_mat[:, 1:], index=sample_mat[:, 0])

    return sample_df

def plot_in_2d(X, title=None):
    """
    Scatter-plot X after reducing it to two components with PCA.

    The PCA is fitted on X itself and then used to transform X; the two
    resulting components are the x and y coordinates of the scatter plot.
    An optional title is set before the figure is shown.
    """
    reducer = PCA(n_components=2)
    reducer.fit(X)
    projected = reducer.transform(X)

    first_component = projected[:, 0]
    second_component = projected[:, 1]
    plt.scatter(first_component, second_component)

    if title is not None:
        plt.title(title)
    plt.show()
    
# Root Mean Square Error
def rmse(y_pred, y_true):
    """Return the root mean square error between predictions and true values."""
    residuals = y_pred - y_true
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

## Missing Values

In [31]:
# Results are assigned back to the column instead of calling
# `inplace=True` on an attribute-accessed column: the latter mutates a
# temporary Series and does not reach the frame under pandas copy-on-write.
for station in (station_a, station_c):
    # The status is categorical: carry the last known value forward
    # (`ffill()` replaces the deprecated `interpolate(method="pad")`).
    station["status"] = station["status"].ffill()

    ## For the numerical attributes we use linear interpolation
    for column in ("temp_c", "rain_mm"):
        station[column] = station[column].interpolate(method="linear")

for column in ("level_cm", "flow_m3_s"):
    station_main[column] = station_main[column].interpolate(method="linear")

# NOTE(review): values missing before the first valid reading are not filled
# by either method with these arguments -- confirm that this is acceptable.

## Encoding

In [32]:
# Encode the status as the integer position of its label in `status_levels`;
# pandas assigns the code -1 to every value that is not one of these labels.
status_levels = ["low", "decreased", "normal", "increased", "max"]
station_a["status"] = pd.Categorical(station_a["status"], categories=status_levels).codes
station_c["status"] = pd.Categorical(station_c["status"], categories=status_levels).codes

## Combined Data

In [33]:
# Inner joins on the time index keep only timestamps present in all three
# frames; column names of station C that clash with station A get "_from_c".
stations_a_c = station_a.join(station_c, how="inner", rsuffix="_from_c")
all_stations = station_main.join(stations_a_c, how="inner")

# Columns are renamed by position: main station first, then A, then C.
all_stations.columns = [
    "main_level", "main_flow",
    "a_temp", "a_status", "a_rain",
    "c_temp", "c_status", "c_rain",
]
all_stations
# all_stations.reset_index(inplace= True)
# all_stations.head()
# all_stations.to_csv(file_all_station)

Unnamed: 0_level_0,main_level,main_flow,a_temp,a_status,a_rain,c_temp,c_status,c_rain
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-06 00:00:00,161.0,4.85,,2,0.0,6.3,2,0.0
2019-04-06 01:00:00,162.0,4.97,4.9,2,0.0,5.6,2,0.0
2019-04-06 02:00:00,161.0,4.85,4.4,2,0.0,4.9,2,0.0
2019-04-06 03:00:00,162.0,4.97,4.0,3,0.0,4.3,2,0.0
2019-04-06 04:00:00,161.0,4.85,3.3,3,0.0,3.5,3,0.0
...,...,...,...,...,...,...,...,...
2019-04-12 19:00:00,169.0,5.82,7.5,3,0.0,7.2,3,0.0
2019-04-12 20:00:00,170.0,5.95,7.0,3,0.0,6.8,3,0.0
2019-04-12 21:00:00,170.0,5.95,6.8,3,0.0,6.7,3,0.0
2019-04-12 22:00:00,170.5,6.07,6.4,3,0.0,6.3,3,0.0


In [34]:
# Write the combined table first: `to_csv` with a path returns None, so it
# must not be the last expression of the cell.
all_stations.to_csv(file_all_station)

# Number of missing values per column; as the last expression it is
# displayed as the cell output.
all_stations.isna().sum()

In [35]:

# X, y = create_X_y(sample_df)
# Remove timestamps
# X = np.delete(X, slice(0, X.shape[1], 9), axis=1)
# X