In [98]:
import math
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_ridge import KernelRidge
import matplotlib.pyplot as plt
%matplotlib inline

# Reading data

In [2]:
# Load the training set: discard incomplete rows, parse the TIMESTAMP strings
# and use them as the index.
train_df = (
    pd.read_csv('data/TrainData2.csv', delimiter=',')
    .dropna()
    .reset_index(drop=True)
    .assign(TIMESTAMP=lambda raw: pd.to_datetime(raw['TIMESTAMP'], format='%Y%m%d %H:%M'))
    .set_index('TIMESTAMP')
)

# Outlier filter: keep a row only if the absolute z-score of every column is
# strictly below `std_dev`.
std_dev = 3
abs_z = np.abs(stats.zscore(train_df))
inlier_rows = (abs_z < float(std_dev)).all(axis=1)
train_df = train_df[inlier_rows]

In [3]:
# Load the forecast inputs and index them by their parsed TIMESTAMP.
pred_df = (
    pd.read_csv('data/WeatherForecastInput2.csv', delimiter=',')
    .assign(TIMESTAMP=lambda raw: pd.to_datetime(raw['TIMESTAMP'], format='%Y%m%d %H:%M'))
    .set_index('TIMESTAMP')
)

# Model 1

In [None]:
# Model 1: degree-3 polynomial regression on the four raw wind components,
# scored with 2-fold shuffled cross-validation.
wind_cols = ['U10', 'V10', 'U100', 'V100']

X_train = train_df[wind_cols].copy()
y_train = train_df[['POWER']].copy()

X_est = pred_df[wind_cols].copy()

polynomial_features = PolynomialFeatures(degree=3)

# Fit the expansion on the training frame only and reuse the fitted transformer
# for the forecast inputs, so both matrices share one column layout.
X_train = pd.DataFrame(polynomial_features.fit_transform(X_train))
X_est = pd.DataFrame(polynomial_features.transform(X_est))

model = LinearRegression()
scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    error = (y_est - y_train.iloc[test, :])**2
    # RMSE = sqrt(mean of squared errors). The earlier sqrt(sum)/n form divided
    # by n instead of sqrt(n), so it shrank with the fold size.
    RMSE = math.sqrt(error['POWER'].mean())
    scores.append(RMSE)
print(scores)

# Model 2

In [None]:
# Model 2: degree-3 polynomial regression on W10 / W100, the magnitudes of the
# (U, V) component pairs, scored with 2-fold shuffled cross-validation.
X_train = pd.DataFrame()
X_train['W10'] = np.sqrt(train_df['U10']**2 + train_df['V10']**2)
X_train['W100'] = np.sqrt(train_df['U100']**2 + train_df['V100']**2)

y_train = train_df[['POWER']].copy()

X_est = pd.DataFrame()
X_est['W10'] = np.sqrt(pred_df['U10']**2 + pred_df['V10']**2)
X_est['W100'] = np.sqrt(pred_df['U100']**2 + pred_df['V100']**2)

polynomial_features = PolynomialFeatures(degree=3)

# Fit the expansion on the training frame only and reuse the fitted transformer
# for the forecast inputs, so both matrices share one column layout.
X_train = pd.DataFrame(polynomial_features.fit_transform(X_train))
X_est = pd.DataFrame(polynomial_features.transform(X_est))

model = LinearRegression()
scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    error = (y_est - y_train.iloc[test, :])**2
    # RMSE = sqrt(mean of squared errors). The earlier sqrt(sum)/n form divided
    # by n instead of sqrt(n), so it shrank with the fold size.
    RMSE = math.sqrt(error['POWER'].mean())
    scores.append(RMSE)
print(scores)

In [None]:
# Persist the predictions of `model2` for `X_est2` as stage1.csv.
# `to_csv` returns None when it is given a path, so the prediction frame is
# kept in `stage1` and written in a separate step.
# NOTE(review): `model2` and `X_est2` are only defined in cells further down
# (first in the "Model 5" cell), so this cell fails on a fresh
# Restart & Run All until one of those cells has run — consider moving it.
stage1 = pd.DataFrame(model2.predict(X_est2))
stage1.to_csv('stage1.csv', index=False)

# Model 3

In [None]:
# Model 3: linear regression on W10 / W100 (magnitudes of the (U, V) component
# pairs) and their cubes, plus a constant `beta_0` column. The error reported
# at the end is in-sample: it is computed on the rows the model was fitted on.
wind_cols = ['U10', 'V10', 'U100', 'V100']

# Start from a new frame. The earlier `X_train3 = train_df` was only an alias,
# so adding `beta_0` silently added that column to the shared `train_df` too.
X_train3 = train_df.drop(columns=wind_cols + ['POWER'])
X_train3['beta_0'] = 1
X_train3['W10'] = np.sqrt(train_df['U10']**2 + train_df['V10']**2)
X_train3['W100'] = np.sqrt(train_df['U100']**2 + train_df['V100']**2)
X_train3['W10^3'] = X_train3['W10']**3
X_train3['W100^3'] = X_train3['W100']**3

y_train3 = train_df[['POWER']].copy()

# Forecast features come from `pred_df` (they were previously rebuilt from the
# training frame) and mirror the training construction, including `beta_0`.
X_est3 = pred_df.drop(columns=wind_cols)
X_est3['beta_0'] = 1
X_est3['W10'] = np.sqrt(pred_df['U10']**2 + pred_df['V10']**2)
X_est3['W100'] = np.sqrt(pred_df['U100']**2 + pred_df['V100']**2)
X_est3['W10^3'] = X_est3['W10']**3
X_est3['W100^3'] = X_est3['W100']**3
# Same columns in the same order as the training matrix; raises KeyError if
# the forecast frame lacks a column the model was trained on.
X_est3 = X_est3[X_train3.columns]

model3 = LinearRegression()
model3.fit(X_train3, y_train3)

y_est3 = model3.predict(X_train3)
error3 = (y_est3 - y_train3)**2
# RMSE = sqrt(mean of squared errors); the earlier sqrt(sum)/n form divided by
# n instead of sqrt(n).
RMSE3 = math.sqrt(error3['POWER'].mean())
RMSE3

# Model 5

In [None]:
# Model 5: degree-3 polynomial expansion of W10 / W100 (magnitudes of the
# (U, V) component pairs) plus the hour taken from the timestamp index. The
# error reported at the end is in-sample (computed on the fitted rows).
X_train2 = pd.DataFrame()

X_train2['W10'] = np.sqrt(train_df['U10']**2 + train_df['V10']**2)
X_train2['W100'] = np.sqrt(train_df['U100']**2 + train_df['V100']**2)

y_train2 = train_df[['POWER']].copy()

# Forecast features are built from `pred_df`; they were previously a second
# copy of the training features.
X_est2 = pd.DataFrame()
X_est2['W10'] = np.sqrt(pred_df['U10']**2 + pred_df['V10']**2)
X_est2['W100'] = np.sqrt(pred_df['U100']**2 + pred_df['V100']**2)

polynomial_features = PolynomialFeatures(degree=3)

# Fit the expansion once on the training features and reuse it for the
# forecast features. The columns get string labels because recent scikit-learn
# versions reject frames whose column labels mix integers with strings, which
# is what adding 'HOUR' to integer-labelled columns produced.
poly_train = polynomial_features.fit_transform(X_train2)
poly_cols = ['poly_{}'.format(k) for k in range(poly_train.shape[1])]
X_train2 = pd.DataFrame(poly_train, columns=poly_cols)
X_est2 = pd.DataFrame(polynomial_features.transform(X_est2), columns=poly_cols)

# Both frames now have a fresh positional index, so the hour values are
# assigned by position.
X_train2['HOUR'] = train_df.index.hour
X_est2['HOUR'] = pred_df.index.hour

model2 = LinearRegression()
model2.fit(X_train2, y_train2)

y_est2 = model2.predict(X_train2)
error2 = (y_est2 - y_train2)**2
# RMSE = sqrt(mean of squared errors); the earlier sqrt(sum)/n form divided by
# n instead of sqrt(n).
RMSE2 = math.sqrt(error2['POWER'].mean())
RMSE2

# Model 6

In [71]:
# Model 6 (base): linear regression on W10 / W100 (magnitudes of the (U, V)
# component pairs), their cubes and a constant column, scored with 10-fold
# shuffled cross-validation.
X_train, X_pred = pd.DataFrame(), pd.DataFrame()

# Build the training and forecast matrices with the same statements so their
# columns cannot drift apart.
for features, source in ((X_train, train_df), (X_pred, pred_df)):
    features['W10'] = np.sqrt(source['U10']**2 + source['V10']**2)
    features['W100'] = np.sqrt(source['U100']**2 + source['V100']**2)
    features['W10^3'] = features['W10']**3
    features['W100^3'] = features['W100']**3
    # NOTE(review): LinearRegression() fits its own intercept by default, so
    # this constant column looks redundant — confirm before removing.
    features['beta_0'] = 1

y_train = train_df[['POWER']].copy()

model = LinearRegression()
scores = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    # mean_squared_error returns the MSE; no square root is taken, so these
    # scores are mean squared errors (the value was previously stored under
    # the name RMSE). Apply np.sqrt to a score to obtain an RMSE.
    fold_mse = mean_squared_error(y_train.iloc[test, :], y_est)
    scores.append(fold_mse)
print(scores)

[0.020836836998225347, 0.020633158508647786, 0.02083280933139269, 0.021931661595181836, 0.02132264102387183, 0.021655636892589803, 0.021920868865405816, 0.020983410776763346, 0.021760720437012805, 0.02046134487309995]


In [82]:
# Model 6 + hour: W10 / W100 (magnitudes of the (U, V) component pairs), their
# cubes, a constant column and the hour of the timestamp index, scored with
# 10-fold shuffled cross-validation.
X_train, X_pred = pd.DataFrame(), pd.DataFrame()

# Build the training and forecast matrices with the same statements so their
# columns cannot drift apart.
for features, source in ((X_train, train_df), (X_pred, pred_df)):
    features['W10'] = np.sqrt(source['U10']**2 + source['V10']**2)
    features['W100'] = np.sqrt(source['U100']**2 + source['V100']**2)
    features['W10^3'] = features['W10']**3
    features['W100^3'] = features['W100']**3
    features['beta_0'] = 1
    # The feature frame has taken over the source's timestamp index by now.
    features['hour'] = features.index.hour

y_train = train_df[['POWER']].copy()

model = LinearRegression()
scores = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    # mean_squared_error returns the MSE; no square root is taken, so these
    # scores are mean squared errors (the value was previously stored under
    # the name RMSE). Apply np.sqrt to a score to obtain an RMSE.
    fold_mse = mean_squared_error(y_train.iloc[test, :], y_est)
    scores.append(fold_mse)
print(scores)

[0.020841214792652815, 0.02063226113178808, 0.020837013188034614, 0.021930084526445924, 0.021336017347337626, 0.021668113733025223, 0.021916138369874445, 0.020986562503533146, 0.02175628246389216, 0.020457834191082006]


In [86]:
# Model 6 + hour + year: W10 / W100 (magnitudes of the (U, V) component pairs),
# their cubes, a constant column and the hour and year of the timestamp index,
# scored with 10-fold shuffled cross-validation.
X_train, X_pred = pd.DataFrame(), pd.DataFrame()

# Build the training and forecast matrices with the same statements so their
# columns cannot drift apart.
for features, source in ((X_train, train_df), (X_pred, pred_df)):
    features['W10'] = np.sqrt(source['U10']**2 + source['V10']**2)
    features['W100'] = np.sqrt(source['U100']**2 + source['V100']**2)
    features['W10^3'] = features['W10']**3
    features['W100^3'] = features['W100']**3
    features['beta_0'] = 1
    # The feature frame has taken over the source's timestamp index by now.
    features['hour'] = features.index.hour
    # NOTE(review): if the forecast timestamps fall in years absent from the
    # training data, the linear `year` term extrapolates — confirm.
    features['year'] = features.index.year

y_train = train_df[['POWER']].copy()

model = LinearRegression()
scores = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    # mean_squared_error returns the MSE; no square root is taken, so these
    # scores are mean squared errors (the value was previously stored under
    # the name RMSE). Apply np.sqrt to a score to obtain an RMSE.
    fold_mse = mean_squared_error(y_train.iloc[test, :], y_est)
    scores.append(fold_mse)
print(scores)
print(np.mean(scores))

[0.01953892723117234, 0.01915265902939513, 0.019354306655754258, 0.020434850682616842, 0.019938833042845466, 0.020289595394254612, 0.020121658639476608, 0.01941810207873255, 0.02018259464526591, 0.018917943973467025]
0.01973494713729807


In [87]:
# Model 6 + squares: W10 / W100 (magnitudes of the (U, V) component pairs),
# their squares and cubes, a constant column and the hour and year of the
# timestamp index, scored with 10-fold shuffled cross-validation.
X_train, X_pred = pd.DataFrame(), pd.DataFrame()

# Build the training and forecast matrices with the same statements so their
# columns cannot drift apart.
for features, source in ((X_train, train_df), (X_pred, pred_df)):
    features['W10'] = np.sqrt(source['U10']**2 + source['V10']**2)
    features['W100'] = np.sqrt(source['U100']**2 + source['V100']**2)
    features['W10^2'] = features['W10']**2
    features['W100^2'] = features['W100']**2
    features['W10^3'] = features['W10']**3
    features['W100^3'] = features['W100']**3
    features['beta_0'] = 1
    # The feature frame has taken over the source's timestamp index by now.
    features['hour'] = features.index.hour
    features['year'] = features.index.year

y_train = train_df[['POWER']].copy()

model = LinearRegression()
scores = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    # mean_squared_error returns the MSE; no square root is taken, so these
    # scores are mean squared errors (the value was previously stored under
    # the name RMSE). Apply np.sqrt to a score to obtain an RMSE.
    fold_mse = mean_squared_error(y_train.iloc[test, :], y_est)
    scores.append(fold_mse)
print(scores)
print(np.mean(scores))

[0.01895817840902218, 0.018795861684604908, 0.01875308293252113, 0.020152582403420598, 0.01945547590717547, 0.019614128191974514, 0.01954481801220398, 0.018823063186088333, 0.01967651210937354, 0.018886544918937896]
0.019266024775532256


In [92]:
# Model 6 + raw components: the four U/V columns, W10 / W100 (magnitudes of
# the (U, V) component pairs) with their squares and cubes, a constant column
# and the hour and year of the timestamp index, scored with 10-fold shuffled
# cross-validation.
X_train, X_pred = pd.DataFrame(), pd.DataFrame()

# Build the training and forecast matrices with the same statements so their
# columns cannot drift apart.
for features, source in ((X_train, train_df), (X_pred, pred_df)):
    for component in ('U10', 'U100', 'V10', 'V100'):
        features[component] = source[component]
    features['W10'] = np.sqrt(source['U10']**2 + source['V10']**2)
    features['W100'] = np.sqrt(source['U100']**2 + source['V100']**2)
    features['W10^2'] = features['W10']**2
    features['W100^2'] = features['W100']**2
    features['W10^3'] = features['W10']**3
    features['W100^3'] = features['W100']**3
    features['beta_0'] = 1
    # The feature frame has taken over the source's timestamp index by now.
    features['hour'] = features.index.hour
    features['year'] = features.index.year

y_train = train_df[['POWER']].copy()

model = LinearRegression()
scores = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    # mean_squared_error returns the MSE; no square root is taken, so these
    # scores are mean squared errors (the value was previously stored under
    # the name RMSE). Apply np.sqrt to a score to obtain an RMSE.
    fold_mse = mean_squared_error(y_train.iloc[test, :], y_est)
    scores.append(fold_mse)
print(scores)
print(np.mean(scores))

[0.016785343944477383, 0.017217983475412955, 0.016912939678504612, 0.01881772587101483, 0.018119560770157497, 0.017669842910729025, 0.01780357267908842, 0.016773366922352692, 0.018030541572373736, 0.01741574141196116]
0.01755466192360723


In [99]:
# Model 6 + component powers: the four U/V columns, squared U and cubed V
# terms, W10 / W100 (magnitudes of the (U, V) component pairs) with their
# squares and cubes, a constant column and the hour and year of the timestamp
# index, scored with 10-fold shuffled cross-validation.
X_train, X_pred = pd.DataFrame(), pd.DataFrame()

# Build the training and forecast matrices with the same statements so their
# columns cannot drift apart.
for features, source in ((X_train, train_df), (X_pred, pred_df)):
    for component in ('U10', 'U100', 'V10', 'V100'):
        features[component] = source[component]
    # NOTE(review): the U terms are squared while the V terms are cubed —
    # confirm this asymmetry is intended.
    features['U10^2'] = features['U10']**2
    features['U100^2'] = features['U100']**2
    features['V10^3'] = features['V10']**3
    features['V100^3'] = features['V100']**3

    features['W10'] = np.sqrt(source['U10']**2 + source['V10']**2)
    features['W100'] = np.sqrt(source['U100']**2 + source['V100']**2)
    features['W10^2'] = features['W10']**2
    features['W100^2'] = features['W100']**2
    features['W10^3'] = features['W10']**3
    features['W100^3'] = features['W100']**3
    features['beta_0'] = 1
    # The feature frame has taken over the source's timestamp index by now.
    features['hour'] = features.index.hour
    features['year'] = features.index.year

y_train = train_df[['POWER']].copy()

model = LinearRegression()
scores = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    y_est = model.predict(X_train.iloc[test, :])
    # mean_squared_error returns the MSE; no square root is taken, so these
    # scores are mean squared errors (the value was previously stored under
    # the name RMSE). Apply np.sqrt to a score to obtain an RMSE.
    fold_mse = mean_squared_error(y_train.iloc[test, :], y_est)
    scores.append(fold_mse)
print(scores)
print(np.mean(scores))

[0.01653745868141711, 0.01702478424177884, 0.01683979052719368, 0.01863879947053853, 0.017982386259269175, 0.017424827214917147, 0.017763324209702315, 0.01657757677241125, 0.017778321191418534, 0.017239924277617072]
0.017380719284626367


In [None]:
# Compare LinearRegression with KernelRidge on one feature set: the four U/V
# columns, squared U and cubed V terms, W10 / W100 (magnitudes of the (U, V)
# component pairs) with their squares and cubes, a constant column and the
# hour and year of the timestamp index. Both models are scored on the same
# 10 shuffled cross-validation folds.
X_train, X_pred = pd.DataFrame(), pd.DataFrame()

# Build the training and forecast matrices with the same statements so their
# columns cannot drift apart.
for features, source in ((X_train, train_df), (X_pred, pred_df)):
    for component in ('U10', 'U100', 'V10', 'V100'):
        features[component] = source[component]
    # NOTE(review): the U terms are squared while the V terms are cubed —
    # confirm this asymmetry is intended.
    features['U10^2'] = features['U10']**2
    features['U100^2'] = features['U100']**2
    features['V10^3'] = features['V10']**3
    features['V100^3'] = features['V100']**3

    features['W10'] = np.sqrt(source['U10']**2 + source['V10']**2)
    features['W100'] = np.sqrt(source['U100']**2 + source['V100']**2)
    features['W10^2'] = features['W10']**2
    features['W100^2'] = features['W100']**2
    features['W10^3'] = features['W10']**3
    features['W100^3'] = features['W100']**3
    features['beta_0'] = 1
    # The feature frame has taken over the source's timestamp index by now.
    features['hour'] = features.index.hour
    features['year'] = features.index.year

y_train = train_df[['POWER']].copy()

model = LinearRegression()
model2 = KernelRidge(alpha=1.0)
scores = []
scores2 = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# `train` / `test` are positional row indices, hence `.iloc`.
for train, test in kfold.split(X_train, y_train):
    model.fit(X_train.iloc[train, :], y_train.iloc[train, :])
    model2.fit(X_train.iloc[train, :], y_train.iloc[train, :])

    y_est = model.predict(X_train.iloc[test, :])
    y_est2 = model2.predict(X_train.iloc[test, :])

    # mean_squared_error returns the MSE; no square root is taken, so these
    # scores are mean squared errors (the values were previously stored under
    # the names RMSE / RMSE2). Apply np.sqrt to a score to obtain an RMSE.
    fold_mse = mean_squared_error(y_train.iloc[test, :], y_est)
    fold_mse2 = mean_squared_error(y_train.iloc[test, :], y_est2)

    scores.append(fold_mse)
    # Record the KernelRidge score here; the linear-regression score used to
    # be appended instead, which made the two score lists identical.
    scores2.append(fold_mse2)

print(scores)
print(scores2)
print(np.mean(scores), np.mean(scores2))