In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression as OLS
from sklearn.metrics import mean_squared_error

In [4]:
power_data = pd.read_csv('power_plant.csv')
print(power_data.shape)
power_data.head()

(9568, 5)


Unnamed: 0,AT,V,AP,RH,EP
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43


In [13]:
#
# split into train / val / test as 0.8 / 0.1 / 0.1
#
np.random.seed(42)
train_rows = pd.Series(np.random.choice(list(power_data.index), int(0.8 * power_data.shape[0]), replace = False))
val_rows = pd.Series(np.random.choice(list(power_data.drop(train_rows, axis = 0).index), int(0.1 * power_data.shape[0]), replace = False))
test_rows = pd.Series(power_data.drop(pd.concat([train_rows, val_rows]), axis = 0).index)
train_data = power_data.iloc[train_rows, :]
val_data = power_data.iloc[val_rows, :]
test_data = power_data.iloc[test_rows, :]
#
print('train is ', train_data.shape, ' rows, cols\n',
      'val is ', val_data.shape, ' rows, cols\n',
      'test is ', test_data.shape, 'rows, cols')
#


train is  (7654, 5)  rows, cols
 val is  (956, 5)  rows, cols
 test is  (958, 5) rows, cols


In [15]:
scaler = StandardScaler()
scaler.fit(train_data.iloc[:, :-1])
train_X = scaler.transform(train_data.iloc[:, :-1])
train_y = train_data['EP']
val_X = scaler.transform(val_data.iloc[:, :-1])
val_y = val_data['EP']
test_X = scaler.transform(test_data.iloc[:, :-1])
test_y = test_data['EP']

In [16]:
linear_model = OLS()
linear_model.fit(train_X, train_y)

LinearRegression()

In [17]:
print('train score: ', linear_model.score(train_X, train_y),
    '\nvalidation score: ', linear_model.score(val_X,val_y),
    '\ntest score: ', linear_model.score(test_X,test_y))
print('train RMSE: ',mean_squared_error(linear_model.predict(train_X),train_y),
      '\nvalidation RMSE: ',mean_squared_error(linear_model.predict(val_X),val_y),
      '\ntest RMSE: ',mean_squared_error(linear_model.predict(test_X),test_y))

train score:  0.9287072840354756 
validation score:  0.9238845251967255 
test score:  0.9333918854821254
train RMSE:  20.732519659228682 
validation RMSE:  22.82059184376622 
test RMSE:  19.023390952574694
