In [21]:
import pandas as pd

df = pd.read_csv("../data-master/Combined_Cycle_Power_Plant.csv")
df.head(5)

Unnamed: 0,AT,V,AP,RH,EP
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [26]:
from sklearn import model_selection, preprocessing, linear_model, metrics,pipeline
import numpy as np

# dependent variable/target varible/label
label = "EP"

# independepent variables/features/predictors
X = df.drop(columns=[label])

# vector for target variable
y = df[label]

# Creating trainnig and test sets
# test_size is 30% of the whole
# random_state: to reprduce the same combination of training and test records
X_train, X_test, y_train, y_test = model_selection.train_test_split(X.values, 
                                                                    
                                                                    y, test_size = 0.3
                                                        , random_state = 1)

print("size of training:", len(X_train)/len(X))

# We want to calcualate z score for each column 
scaler = preprocessing.StandardScaler()

# We calcualte mean and std dev for each column
scaler.fit(X_train)

# Calculating the z scores
# purpose of z scoring is make mean = 0 and std = 1 for each column
X_train_std = scaler.transform(X_train)

# Displaying the mean and standard deviation of the stadandarized features
pd.DataFrame(X_train_std).describe()

# Applying the same transformation on the test data
X_test_std = scaler.transform(X_test)

# building a regression model
est = linear_model.LinearRegression()
est.fit(X_train_std, y_train)

# prediction on training and test data
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

print("R2 training:", metrics.r2_score(y_train, y_train_pred))
print("R2 test:", metrics.r2_score(y_test, y_test_pred))

print("Rmse training:", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Rmse: testing", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

size of training: 0.7
R2 training: 0.928778403256508
R2 test: 0.9284502782189371
Rmse training: 4.552608777072398
Rmse: testing 4.5690840307569704


In [27]:
target = "EP"
X = df.drop(columns=[target])
X = pd.get_dummies(X, drop_first=True).values.astype("float")
y = np.log(df[target])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                           test_size = 0.3, random_state = 1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression())
])


pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("training r2", metrics.r2_score(y_train, y_train_pred))
print("test r2", metrics.r2_score(y_test, y_test_pred))

print("training rmse:", np.sqrt(metrics.mean_squared_error(y_train
                                                , y_train_pred)))
print("test rmse:", np.sqrt(metrics.mean_squared_error(y_test
                                            , y_test_pred)))


training r2 0.9371394353763669
test r2 0.9369188951192259
training rmse: 0.009367963301816207
test rmse: 0.009399920743046849
