In [3]:
#IMPORTS
import numpy as np 
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
from catboost import CatBoostRegressor
import pickle

In [4]:
#Read data
def readData(): 
    dataA, dataB, dataC = [], [], []
    path = 'data/data/'
    
    dataA.append(pd.DataFrame(pd.read_parquet(path + 'A/' + 'train_targets.parquet')))
    dataA.append(pd.DataFrame(pd.read_parquet(path + 'A/' + 'X_test_estimated.parquet')))
    dataA.append(pd.DataFrame(pd.read_parquet(path + 'A/' + 'X_train_estimated.parquet')))
    dataA.append(pd.DataFrame(pd.read_parquet(path + 'A/' + 'X_train_observed.parquet')))        
   
    dataB.append(pd.DataFrame(pd.read_parquet(path + 'B/' + 'train_targets.parquet')))
    dataB.append(pd.DataFrame(pd.read_parquet(path + 'B/' + 'X_test_estimated.parquet')))
    dataB.append(pd.DataFrame(pd.read_parquet(path + 'B/' + 'X_train_estimated.parquet')))
    dataB.append(pd.DataFrame(pd.read_parquet(path + 'B/' + 'X_train_observed.parquet')))

    dataC.append(pd.DataFrame(pd.read_parquet(path + 'C/' + 'train_targets.parquet')))
    dataC.append(pd.DataFrame(pd.read_parquet(path + 'C/' + 'X_test_estimated.parquet')))
    dataC.append(pd.DataFrame(pd.read_parquet(path + 'C/' + 'X_train_estimated.parquet')))
    dataC.append(pd.DataFrame(pd.read_parquet(path + 'C/' + 'X_train_observed.parquet')))
   
    return dataA, dataB, dataC

A, B, C = readData()

def splitWeatherAndEnergyReports(data):
    weather = [data[1], data[2], data[3]]
    energy = data[0]
    return weather, energy

def quartersToHours(data):
    data1 = []
    for df in data:
        df['date_forecast'] = df['date_forecast'].map(lambda x: str(x)[:-6])
        df_new = df.groupby(['date_forecast'], as_index=False).mean()
        df_new['date_forecast'] = df_new['date_forecast'].apply(lambda x: x + ':00:00')
        data1.append(df_new)     
    return data1

#A
X = pd.concat([A[3], A[2]], ignore_index = True)
X = pd.concat([X, A[1]], ignore_index = True)
X['snow_density:kgm3'] = X['snow_density:kgm3'].fillna(0)
X[['ceiling_height_agl:m', 'cloud_base_agl:m']] = X[['ceiling_height_agl:m', 'cloud_base_agl:m']].interpolate(method='cubic')
A[3], A[2], A[1] = X[:len(A[3])], X[len(A[3]):len(A[3])+len(A[2])], X[len(A[2])+len(A[3]):]


#B
X = pd.concat([B[3], B[2]], ignore_index = True)
X = pd.concat([X, B[1]], ignore_index = True)
X['snow_density:kgm3'] = X['snow_density:kgm3'].fillna(0)
X[['ceiling_height_agl:m', 'cloud_base_agl:m']] = X[['ceiling_height_agl:m', 'cloud_base_agl:m']].interpolate(method='cubic')
B[3], B[2], B[1] = X[:len(B[3])], X[len(B[3]):len(B[3])+len(B[2])], X[len(B[2])+len(B[3]):]


#C
X = pd.concat([C[3], C[2]], ignore_index = True)
X = pd.concat([X, C[1]], ignore_index = True)
X['snow_density:kgm3'] = X['snow_density:kgm3'].fillna(0)
X[['ceiling_height_agl:m', 'cloud_base_agl:m']] = X[['ceiling_height_agl:m', 'cloud_base_agl:m']].interpolate(method='cubic')

C[3], C[2], C[1] = X[:len(C[3])], X[len(C[3]):len(C[3])+len(C[2])], X[len(C[2])+len(C[3]):]

#Splitting weather and energy datasets
weather_A, energy_A = splitWeatherAndEnergyReports(A)
weather_B, energy_B = splitWeatherAndEnergyReports(B)
weather_C, energy_C = splitWeatherAndEnergyReports(C)

#Joining rows from same hour.
weather_A1 = quartersToHours(weather_A)
weather_B1 = quartersToHours(weather_B)
weather_C1 = quartersToHours(weather_C)

  df_new = df.groupby(['date_forecast'], as_index=False).mean()
  df_new = df.groupby(['date_forecast'], as_index=False).mean()
  df_new = df.groupby(['date_forecast'], as_index=False).mean()
  df_new = df.groupby(['date_forecast'], as_index=False).mean()
  df_new = df.groupby(['date_forecast'], as_index=False).mean()
  df_new = df.groupby(['date_forecast'], as_index=False).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_forecast'] = df['date_forecast'].map(lambda x: str(x)[:-6])
  df_new = df.groupby(['date_forecast'], as_index=False).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

In [5]:
#B - Converting rows in B that is incorrect to NAN, to later be removed. 
energy_B['flag'] = energy_B['pv_measurement'].groupby([energy_B['pv_measurement'], energy_B['pv_measurement'].diff().ne(0).cumsum()]).transform('size').ge(24).astype(int) 
energy_B.loc[energy_B['flag'] == 1, 'pv_measurement'] = None
energy_B = energy_B.drop(['flag'], axis=1)

In [6]:
#DROP NAN-ROWS
def dropNanrows(df):
    return df.dropna()

#Block B&C has label-rows with NAN-values that needs to be dropped.
energy_B  = dropNanrows(energy_B)
energy_C = dropNanrows(energy_C)

In [7]:
#Align rows
def alignData(x_train, x_test, y):     
    X = pd.concat([x_train, x_test], ignore_index = True)
    y = y.rename(columns={'time': 'date_forecast'})
    y['date_forecast'] = y['date_forecast'].astype(str)
    aligned = X.merge(y, how='inner', on=['date_forecast'])

    X = aligned.drop(['pv_measurement'], axis=1)
    Y = aligned['pv_measurement']
    return X, Y

#A-BLOCK
X_A, Y_A = alignData(weather_A1[2], weather_A1[1], energy_A) 
testData_A = weather_A1[0] 

#B-BLOCK
X_B, Y_B = alignData(weather_B1[2], weather_B1[1], energy_B)
testData_B = weather_B1[0]

#C-BLOCK
X_C, Y_C = alignData(weather_C1[2], weather_C1[1], energy_C)
testData_C = weather_C1[0]        

In [8]:
#Preprocessing functions (currently in use)

def normalizeAll(tr, te):
    all = pd.concat([tr, te], ignore_index = True)
    #minmax = [(all[str(col)].min(), all[str(col)].max()) for col in all.columns]
    all = all.apply(lambda iterator: ((iterator.max() - iterator)/(iterator.max() - iterator.min())))
    return all[:len(tr)], all[len(tr):]

def dataSplit2(X_tree, X_NN, Y, td_tree, td_NN, splitRatio):
        X_tree['Y'] = pd.DataFrame(Y)['pv_measurement']
        tr_trees, te_trees = train_test_split(X_tree, test_size=splitRatio)
        
        Y_tr_trees = tr_trees['Y']
        Y_te_trees = te_trees['Y']

        tr_trees.drop(['Y'], axis=1, inplace=True)
        te_trees.drop(['Y'], axis=1, inplace=True)

        X_NN['Y'] = pd.DataFrame(Y)['pv_measurement']
        tr_NN, te_NN = train_test_split(X_NN, test_size=splitRatio)
        
        Y_tr_NN = tr_NN['Y']
        Y_te_NN = te_NN['Y']

        tr_NN.drop(['Y'], axis=1, inplace=True)
        te_NN.drop(['Y'], axis=1, inplace=True)

        return tr_trees, te_trees, tr_NN, te_NN, Y_tr_trees, Y_te_trees, Y_tr_NN, Y_te_NN, td_tree, td_NN

def add_times(tr, te):
    all = pd.concat([tr, te], ignore_index = True) 
    all["Hour"] = [int(all.iloc[i]['date_forecast'][11:13]) for i in range(len(all))]
    all["Month"] = [int(all.iloc[i]['date_forecast'][5:7]) for i in range(len(all))]
    all["Year"] = [int(all.iloc[i]['date_forecast'][0:4]) for i in range(len(all))]
    all["Day"] = [int(all.iloc[i]['date_forecast'][8:10]) for i in range(len(all))]
    all['Week'] = pd.to_datetime(all['date_forecast']).dt.isocalendar().week.astype(float)
    return all[:len(tr)], all[len(tr):] 
   
def removeDates(X):
    X = X.drop(['date_forecast'], axis=1)
    return X

def removeColumns(df, columns): 
    return df.drop(columns, axis=1)  

def flag(tr, te, splitIndex):
    data_observed = tr.iloc[:splitIndex]
    data_estimated = tr.iloc[splitIndex:]
    
    data_observed['flag'] = 1
    data_estimated['flag'] = 0
    te['flag'] = 0
    return pd.concat([data_observed, data_estimated], ignore_index = True), te

def createFeatures(tr, te,): 
    all = pd.concat([tr, te], ignore_index = True)
    top5 = ['absolute_humidity_2m:gm3', 'ceiling_height_agl:m', 'air_density_2m:kgm3', 'cloud_base_agl:m', 'sun_azimuth:d'] 
    seenKombos = []
    toCombine = [('effective_cloud_cover:p', 'clear_sky_energy_1h:J')]
    for elem in top5:
        for elem2 in top5:
            if {elem, elem2} not in seenKombos:
               seenKombos.append({elem, elem2})
               if elem != elem2:
                    toCombine.append((str(elem), str(elem2)))
    toClose = []
    for col in all.copy().columns:
        for col2 in all.copy().columns:
            if {str(col), str(col2)} not in seenKombos:
                seenKombos.append({str(col), str(col2)})
                if col != col2:
                    if abs(all[str(col)].corr(all[str(col2)])) >= 0.8:
                        if str(col)[:4] == str(col2)[:4]:
                            toClose.append((str(col), str(col2)))
                        elif str(col) in top5 or str(col2) in top5:
                            toCombine.append((str(col), str(col2)))                       
    created = []
    for i in range(len(toCombine)):
        all[toCombine[i][0] + '_' + toCombine[i][1]] = all[toCombine[i][0]] * all[toCombine[i][1]]
        created.append(toCombine[i][0] + '_' + toCombine[i][1])
    return all[:len(tr)], all[len(tr):], created 

In [9]:
#Applying preprocessing functions to data-blocks.

#DATA A
splitIndex_A = X_A.index[X_A['date_forecast'] == '2022-10-28 22:00:00'][0]
X_A, testData_A = flag(X_A, testData_A, splitIndex_A) #Observed data
X_A, testData_A = add_times(X_A, testData_A)
testData_A = removeDates(testData_A)
X_A = removeColumns(X_A, X_A.columns.difference(testData_A.columns))  #Fitting trainingData to useful Features in testData
X_A, testData_A, created = createFeatures(X_A, testData_A)
X_A_neuralNets, testData_A_neuralNets = normalizeAll(X_A.copy(), testData_A.copy())
X_A_tr_trees, X_A_te_trees,  X_A_tr_NN, X_A_te_NN, Y_A_tr_trees, Y_A_te_trees, Y_A_tr_NN, Y_A_te_NN, testData_A_tree, testData_A_NN = dataSplit2(X_A, X_A_neuralNets, Y_A, testData_A, 
                                                                                                                          testData_A_neuralNets, 0.15)
#Features to remove (Products from createFeatures() that wasn't useful)
X_A_tr_trees.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_A_te_trees.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_A_tr_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_A_te_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
testData_A_tree.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
testData_A_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)


#DATA B
splitIndex_B = X_B.index[X_B['date_forecast'] == '2022-10-28 22:00:00'][0]
X_B, testData_B = flag(X_B, testData_B, splitIndex_B)
X_B, testData_B = add_times(X_B, testData_B )
testData_B = removeDates(testData_B)
X_B = removeColumns(X_B, X_B.columns.difference(testData_B.columns)) #Fitting trainingData to useful FEatures in testData
X_B, testData_B, created = createFeatures(X_B, testData_B)
X_B_neuralNets, testData_B_neuralNets = normalizeAll(X_B.copy(), testData_B.copy())
X_B_tr_trees, X_B_te_trees,  X_B_tr_NN, X_B_te_NN, Y_B_tr_trees, Y_B_te_trees, Y_B_tr_NN, Y_B_te_NN, testData_B_tree, testData_B_NN = dataSplit2(X_B, X_B_neuralNets, Y_B, testData_B, 
                                                                                                                          testData_B_neuralNets, 0.15)
#Features to remove (Products from createFeatures() that wasn't useful)
X_B_tr_trees.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_B_te_trees.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_B_tr_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_B_te_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
testData_B_tree.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
testData_B_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)



#DATA C
splitIndex_C = X_C.index[X_C['date_forecast'] == '2022-10-28 22:00:00'][0]
X_C, testData_C = flag(X_C, testData_C, splitIndex_C)
X_C, testData_C = add_times(X_C, testData_C)
testData_C = removeDates(testData_C)
X_C = removeColumns(X_C, X_C.columns.difference(testData_C.columns)) #Fitting trainingData to useful FEatures in testData
X_C, testData_C, created = createFeatures(X_C, testData_C)
X_C_neuralNets, testData_C_neuralNets = normalizeAll(X_C.copy(), testData_C.copy())
X_C_tr_trees, X_C_te_trees,  X_C_tr_NN, X_C_te_NN, Y_C_tr_trees, Y_C_te_trees, Y_C_tr_NN, Y_C_te_NN, testData_C_tree, testData_C_NN = dataSplit2(X_C, X_C_neuralNets, Y_C, testData_C, 
                                                                                                                          testData_C_neuralNets, 0.15)
#Features to remove (Products from createFeatures() that wasn't useful)
X_C_tr_trees.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_C_te_trees.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_C_tr_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
X_C_te_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
testData_C_tree.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)
testData_C_NN.drop(['absolute_humidity_2m:gm3_ceiling_height_agl:m', 'absolute_humidity_2m:gm3_dew_point_2m:K', 'absolute_humidity_2m:gm3_t_1000hPa:K'], axis=1, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_observed['flag'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_estimated['flag'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_observed['flag'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [10]:
#TESTDATA
testData_A = np.nan_to_num(testData_A_neuralNets.to_numpy(), nan=0)
testData_B = np.nan_to_num(testData_B_neuralNets.to_numpy(), nan=0)
testData_C = np.nan_to_num(testData_C_neuralNets.to_numpy(), nan=0)

In [None]:
#CATBOOST-MODEL
def catBoost(iterations=10, depth=14, lr=0.1,  loss ="MAE"):
  cat_model = CatBoostRegressor(iterations=iterations, 
                            depth=depth, 
                            learning_rate=lr, 
                            loss_function=loss,
                            )
  cat_model.fit(X_B_tr_trees, Y_B_tr_trees, eval_set=(X_B_te_trees, Y_B_te_trees), verbose=False, plot=True)
  return cat_model

#Train catboost-models
catBoost_models = {'A': catBoost(iterations=3800, depth=12, lr=0.01, loss="MAE"),
                   'B': catBoost(iterations=30000, depth=9, lr=0.001, loss="MAE"),
                   'C': catBoost(iterations=30000, depth=9, lr=0.001, loss="MAE")}

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
#SUBMISSIONS
def submission(dict_models):
    sample_submission = pd.read_csv('data/data/sample_submission.csv')

    #A  
    models_A = [m.predict(testData_A_tree) for m in dict_models['A']]
    preds_A = np.column_stack([m for m in models_A])
    meanPrediction_A = preds_A.mean(axis=1)
    final_pred_A = [[p] for p in meanPrediction_A]


    #B 
    models_B = [m.predict(testData_B_tree) for m in dict_models['B']]
    preds_B = np.column_stack([m for m in models_B])
    meanPrediction_B = preds_B.mean(axis=1)
    final_pred_B = [[p] for p in meanPrediction_B]


    #C 
    models_C = [m.predict(testData_C_tree) for m in dict_models['C']]
    preds_C = np.column_stack([m for m in models_C])
    meanPrediction_C = preds_C.mean(axis=1)
    final_pred_C = [[p] for p in meanPrediction_C]



    allPreds = np.append(final_pred_A, final_pred_B, axis=0)
    allPreds = np.append(allPreds, final_pred_C, axis=0)

    #Replace potential negative values with zero, as energy production cannot be negative.
    allPreds[allPreds<0] = 0

    allPredictions = pd.DataFrame(allPreds, columns=['prediction'])

    sample_submission['prediction'] = allPredictions['prediction']

    return sample_submission

#Submission 
sub = submission({'A': [catBoost_models['A']], 
                   'B': [catBoost_models['B']], 
                   'C': [catBoost_models['C']]},)

  preds_A = np.column_stack((m for m in models_A))
  preds_B = np.column_stack((m for m in models_B))
  preds_C = np.column_stack((m for m in models_C))
  preds_A = np.column_stack((m for m in models_A))
  preds_B = np.column_stack((m for m in models_B))
  preds_C = np.column_stack((m for m in models_C))


In [None]:
#create folder in working
from pathlib import Path
Path('data/final_submissions').mkdir(parents=True, exist_ok=True)

In [None]:
#SAVE SUBMISSIONS
def saveSub(sub, filename):
    filepath = 'data/final_submissions/' + filename + '.csv'
    #PHD_STIPENDS = pd.read_csv('/kaggle/input/phd-stipends/csv') # load from notebook input
    sub.to_csv(filepath, index=False) # save to notebook output

saveSub(sub, "submission_1")
