# Setup

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression as OLS
from sklearn.metrics import mean_squared_error

# 2. Import data

In [3]:
data = pd.read_csv('sample_data/power_plant.csv')
print(data.shape)
data.head()

(9568, 5)


Unnamed: 0,AT,V,AP,RH,EP
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43


# 3. Split Data

In [10]:
np.random.seed(42)
train_rows = pd.Series(np.random.choice(list(data.index), int(0.8 * data.shape[0]), replace=False))
value_rows = pd.Series(np.random.choice(list(data.drop(train_rows, axis=0).index), int(0.1 * data.shape[0]), replace=False))
test_rows = pd.Series(data.drop(pd.concat([train_rows, value_rows]), axis=0).index)

train_data = data.iloc[train_rows, :]
value_data = data.iloc[value_rows, :]
test_data = data.iloc[test_rows, :]

print('train is ', train_data.shape, ' rows, cols')
print('value is ', value_data.shape, ' rows, cols')
print('test is ', test_data.shape, ' rows, cols')

train is  (7654, 5)  rows, cols
value is  (956, 5)  rows, cols
test is  (958, 5)  rows, cols


# 4. Split again

In [11]:
train_data_2, value_data_2 = train_test_split(data, train_size = 0.8, random_state = 42)
value_data_2, test_data_2 = train_test_split(value_data_2, test_size = 0.5, random_state = 42)

print('train is ', train_data_2.shape, ' rows, cols')
print('value is ', value_data_2.shape, ' rows, cols')
print('test is ', test_data_2.shape, ' rows, cols')

train is  (7654, 5)  rows, cols
value is  (957, 5)  rows, cols
test is  (957, 5)  rows, cols


# 5. Ensure row count

# 6. Standard Scaler

In [12]:
scaler = StandardScaler()
scaler.fit(train_data.iloc[:, :-1])

train_x = scaler.transform(train_data.iloc[:, :-1])
train_y = train_data['EP']

value_x = scaler.transform(value_data.iloc[:, :-1])
value_y = value_data['EP']

test_x = scaler.transform(test_data.iloc[:, :-1])
test_y = test_data['EP']

# 7. Linear Regression

In [13]:
linear_model = OLS()
linear_model.fit(train_x, train_y)

# 8. Print Score

In [14]:
print('Train score: ', linear_model.score(train_x, train_y))
print('Validation score: ', linear_model.score(value_x, value_y))
print('Test score: ', linear_model.score(test_x, test_y))

print('Train RSME: ', mean_squared_error(linear_model.predict(train_x), train_y))
print('Validation RSME: ', mean_squared_error(linear_model.predict(value_x), value_y))
print('Test RSME: ', mean_squared_error(linear_model.predict(test_x), test_y))

Train score:  0.9287072840354756
Validation score:  0.9238845251967255
Test score:  0.9333918854821254
Train RSME:  20.732519659228682
Validation RSME:  22.820591843766213
Test RSME:  19.023390952574694
