In [9]:
import math
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline


# Reading data

In [10]:
# Load the training set, drop incomplete rows, and index the frame by parsed timestamp.
train_df = (
    pd.read_csv('data/TrainData3.csv', delimiter=',')
    .dropna()
    .reset_index(drop=True)
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame['TIMESTAMP'], format='%Y%m%d %H:%M'))
    .set_index('TIMESTAMP')
)

# Outlier filter: keep a row only if the absolute z-score of every column is below std_dev.
std_dev = 3
abs_z = np.abs(stats.zscore(train_df))
inlier_rows = (abs_z < float(std_dev)).all(axis=1)
train_df = train_df[inlier_rows]

In [11]:
# Load the weather-forecast inputs and index them by parsed timestamp.
pred_df = (
    pd.read_csv('data/WeatherForecastInput3.csv', delimiter=',')
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame['TIMESTAMP'], format='%Y%m%d %H:%M'))
    .set_index('TIMESTAMP')
)

In [12]:
# Load the reference solution file and index it by parsed timestamp.
result_st2 = (
    pd.read_csv('data/Solution2.csv', delimiter=',')
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame['TIMESTAMP'], format='%Y%m%d %H:%M'))
    .set_index('TIMESTAMP')
)

# Feature Engineering

In [13]:
# Training feature matrix X (indexed like train_df) and target y, both derived from train_df.
# Column order matters: the regression cell consumes these columns positionally.
X = pd.DataFrame()

# Wind-speed magnitude from the U/V components of each column pair (suffixes 10 and 100),
# plus its powers up to the fourth.
X['W10'] = np.sqrt(train_df['U10']**2 + train_df['V10']**2)
X['W100'] = np.sqrt(train_df['U100']**2 + train_df['V100']**2)
X['W10^2'] = X['W10']**2
X['W100^2'] = X['W100']**2
X['W10^3'] = X['W10']**3
X['W100^3'] = X['W100']**3
X['W10^4'] = X['W10']**4
X['W100^4'] = X['W100']**4

# Unit direction components (U/W, V/W). A zero wind speed has no direction and the division
# would yield 0/0 = NaN, so those rows get 0 for both components instead of a NaN feature.
X['cosTheta10'] = (train_df['U10'] / X['W10']).mask(X['W10'] == 0, 0.0)
X['sinTheta10'] = (train_df['V10'] / X['W10']).mask(X['W10'] == 0, 0.0)
X['cosTheta100'] = (train_df['U100'] / X['W100']).mask(X['W100'] == 0, 0.0)
X['sinTheta100'] = (train_df['V100'] / X['W100']).mask(X['W100'] == 0, 0.0)

# Explicit constant column.
X['beta_0'] = 1

# One-out-of-K encoding of the hour of day. The columns are pinned to h_0 .. h_23 so the
# layout does not depend on which hours happen to occur in the data.
X['hour'] = X.index.hour
hour_df = pd.get_dummies(X['hour'], prefix='h')
hour_df = hour_df.reindex(columns=['h_' + str(h) for h in range(24)], fill_value=0)
X = X.drop(['hour'], axis=1)
X = X.join(hour_df, how='left')

# Month and year are kept as plain integer columns (they are not one-hot encoded).
X['month'] = X.index.month
X['year'] = X.index.year

# Regression target, kept as a one-column DataFrame.
y = train_df[['POWER']].copy()


In [14]:
# Forecast feature matrix X_pred (indexed like pred_df), built with the same columns, in the
# same order, as the training feature matrix so a model fitted on one can score the other.
X_pred = pd.DataFrame()

# Wind-speed magnitude from the U/V components of each column pair (suffixes 10 and 100),
# plus its powers up to the fourth.
X_pred['W10'] = np.sqrt(pred_df['U10']**2 + pred_df['V10']**2)
X_pred['W100'] = np.sqrt(pred_df['U100']**2 + pred_df['V100']**2)
X_pred['W10^2'] = X_pred['W10']**2
X_pred['W100^2'] = X_pred['W100']**2
X_pred['W10^3'] = X_pred['W10']**3
X_pred['W100^3'] = X_pred['W100']**3
X_pred['W10^4'] = X_pred['W10']**4
X_pred['W100^4'] = X_pred['W100']**4

# Unit direction components (U/W, V/W). A zero wind speed has no direction and the division
# would yield 0/0 = NaN, so those rows get 0 for both components instead of a NaN feature.
X_pred['cosTheta10'] = (pred_df['U10'] / X_pred['W10']).mask(X_pred['W10'] == 0, 0.0)
X_pred['sinTheta10'] = (pred_df['V10'] / X_pred['W10']).mask(X_pred['W10'] == 0, 0.0)
X_pred['cosTheta100'] = (pred_df['U100'] / X_pred['W100']).mask(X_pred['W100'] == 0, 0.0)
X_pred['sinTheta100'] = (pred_df['V100'] / X_pred['W100']).mask(X_pred['W100'] == 0, 0.0)

# Explicit constant column.
X_pred['beta_0'] = 1

# One-out-of-K encoding of the hour of day. get_dummies only emits columns for hours that are
# present, so the result is pinned to h_0 .. h_23 (absent hours become all-zero columns);
# otherwise a forecast window missing some hour would have fewer columns than the training set.
X_pred['hour'] = X_pred.index.hour
hour_pred_df = pd.get_dummies(X_pred['hour'], prefix='h')
hour_pred_df = hour_pred_df.reindex(columns=['h_' + str(h) for h in range(24)], fill_value=0)
X_pred = X_pred.drop(['hour'], axis=1)
X_pred = X_pred.join(hour_pred_df, how='left')

# Month and year are kept as plain integer columns (they are not one-hot encoded).
X_pred['month'] = X_pred.index.month
X_pred['year'] = X_pred.index.year

In [15]:
# Reference POWER values flattened to a 1-D array with one entry per row of result_st2.
power_reference = result_st2['POWER']
results = power_reference.values.reshape(power_reference.shape[0])

# Third stage

In [30]:
scores = []
output = pd.DataFrame()

# Upper edge of the binned W10 range; with N bins each bin is MAX_WIND / N wide.
MAX_WIND = 20

# Candidate numbers of bins; n[0] == 0 is skipped below because zero bins is meaningless.
n = range(20)

# Loop-invariant inputs, built once instead of on every iteration.
data = X.join(y)
feature_cols = list(X.columns)
# Align the forecast features to the training columns by name rather than by position.
# Columns absent from X_pred (e.g. a dummy column with no matching rows) are filled with 0.
data_pred = X_pred.reindex(columns=feature_cols, fill_value=0)


def fit_predict_binned(train, forecast, n_bins, features, target='POWER', max_wind=MAX_WIND):
    """Fit one linear regression per W10 bin and predict the forecast rows of that bin.

    The range [0, max_wind) is split into ``n_bins`` equal-width bins on the 'W10' column.
    The last bin is open-ended so rows with W10 >= max_wind are still trained on and
    predicted instead of being silently dropped.

    Parameters
    ----------
    train : DataFrame holding the ``features`` columns and the ``target`` column.
    forecast : DataFrame holding the ``features`` columns.
    n_bins : number of W10 bins (>= 1).
    features : list of feature column names, in the order fed to the model.
    target : name of the target column in ``train``.
    max_wind : upper edge of the equally divided W10 range.

    Returns
    -------
    Series of predictions clipped to [0, 1], indexed and sorted by the forecast index.
    Forecast rows whose W10 is NaN or negative belong to no bin and are not returned.
    """
    delta = max_wind / n_bins
    fallback_model = None
    parts = []

    for i in range(n_bins):
        lower = i * delta
        upper = (i + 1) * delta if i < n_bins - 1 else np.inf

        forecast_bin = forecast[(forecast['W10'] >= lower) & (forecast['W10'] < upper)]
        if forecast_bin.empty:
            # Nothing to predict in this bin, so no model is needed.
            continue

        train_bin = train[(train['W10'] >= lower) & (train['W10'] < upper)]
        if train_bin.empty:
            # LinearRegression cannot be fitted on zero samples; fall back to a single
            # model trained on all rows so these forecast rows still get a prediction.
            if fallback_model is None:
                fallback_model = LinearRegression().fit(
                    train[features].to_numpy(dtype=float),
                    train[target].to_numpy(dtype=float),
                )
            model = fallback_model
        else:
            model = LinearRegression().fit(
                train_bin[features].to_numpy(dtype=float),
                train_bin[target].to_numpy(dtype=float),
            )

        predicted = model.predict(forecast_bin[features].to_numpy(dtype=float)).clip(min=0, max=1)
        parts.append(pd.Series(predicted, index=forecast_bin.index))

    if not parts:
        return pd.Series(dtype=float)
    # Single concat of all bins, then restore chronological order.
    return pd.concat(parts).sort_index()


# Sweep over the number of bins. Scoring against the reference solution is currently
# disabled, so `scores` stays empty and `predictions` ends up holding the result of the
# last sweep value as a single-column array.
for j in n[1:]:
    predicted_power = fit_predict_binned(data, data_pred, j, feature_cols)
    predictions = predicted_power.to_numpy().reshape(-1, 1)

In [31]:
# Display the forecast array left in `predictions` by the binned-regression cell
# (a single column of values, as the output below shows).
# NOTE(review): this dumps the entire array; a slice such as predictions[:10] would be easier to read.
predictions

array([[0.87788155],
       [0.88677316],
       [0.84635417],
       [0.80674379],
       [0.81499364],
       [0.8257382 ],
       [0.83156428],
       [0.81358073],
       [0.80772341],
       [0.82697271],
       [0.88028252],
       [0.88227087],
       [0.90481793],
       [0.93362087],
       [0.89049455],
       [0.8807843 ],
       [0.79739703],
       [0.5689366 ],
       [0.37860124],
       [0.42025271],
       [0.44744084],
       [0.4973669 ],
       [0.53931196],
       [0.46790284],
       [0.48106079],
       [0.42861557],
       [0.32689572],
       [0.32867117],
       [0.37986605],
       [0.67059143],
       [0.69929898],
       [0.8216885 ],
       [0.82960123],
       [0.67396523],
       [0.63347083],
       [0.52855055],
       [0.53670439],
       [0.46203067],
       [0.37483789],
       [0.36317812],
       [0.31313281],
       [0.30255775],
       [0.2939954 ],
       [0.23996779],
       [0.19877628],
       [0.20738571],
       [0.18861997],
       [0.194

In [19]:
# Write the forecast to disk. The column separator is a comma so the file is a valid CSV;
# the previous "." separator would have fused adjacent values into unparseable numbers
# as soon as the array had more than one column.
np.savetxt("stage3b.csv", predictions, delimiter=",")