# Regression Mega-Script

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

In [2]:
# Read the dataset from "Data.csv"; the path is relative, so the file must sit in
# the kernel's working directory.
df = pd.read_csv("Data.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,16.65,49.69,1014.01,91.00,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74


In [3]:
# Sanity check: True if at least one cell of df is NaN, False otherwise.
# (df.isnull().sum() would instead give the NaN count per column.)
has_missing = df.isnull().values.any()
has_missing

False

In [4]:
# Features are every column except the last; the target is the last column.
# Both are pulled out as plain NumPy arrays.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [5]:
# Display the feature matrix (all columns of df except the last).
X

array([[  14.96,   41.76, 1024.07,   73.17],
       [  25.18,   62.96, 1020.04,   59.08],
       [   5.11,   39.4 , 1012.16,   92.14],
       ...,
       [  31.32,   74.33, 1012.92,   36.48],
       [  24.48,   69.45, 1013.86,   62.39],
       [  21.6 ,   62.52, 1017.23,   67.87]])

In [6]:
# Display the target vector (last column of df).
y

array([463.26, 444.37, 488.56, ..., 429.57, 435.74, 453.28])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### Multiple Linear Regression

In [8]:
# Ordinary least-squares fit on the training split; fit() returns the estimator
# itself, so it can be chained with construction.
multi_r = LinearRegression().fit(X_train, y_train)
multi_r

LinearRegression()

In [9]:
# Predict on the held-out rows and print predictions (left column) next to the
# true targets (right column).
y_pred = multi_r.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[431.42761597 431.23      ]
 [458.56124622 460.01      ]
 [462.75264705 461.14      ]
 ...
 [469.51835895 473.26      ]
 [442.41759454 438.        ]
 [461.88279939 463.28      ]]


In [10]:
# Spell out the fitted model: the intercept followed by one " + (coef)*x<i>"
# term per feature, in column order.
terms = [" + (" + str(coef) + ")*x" + str(idx) for idx, coef in enumerate(multi_r.coef_)]
s = "The Least-Squared Multiple Linear Regression is y = " + str(multi_r.intercept_) + "".join(terms)
print(s)

The Least-Squared Multiple Linear Regression is y = 452.8410371616388 + (-1.973130989101957)*x0 + (-0.23649992698007033)*x1 + (0.06387890764108489)*x2 + (-0.1580701858027475)*x3


In [11]:
# Root-mean-squared error of the linear model on the test split.
mse = mean_squared_error(y_test, y_pred)
rms = sqrt(mse)
print(rms)

4.44226285844249


In [12]:
# Coefficient of determination (R^2) of the linear model on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9325315554761303

### Polynomial Regression (degree 4)

In [13]:
# Expand the training features into all polynomial terms up to degree 4, then
# fit a plain linear regression on the expanded matrix.
poly_f_4 = PolynomialFeatures(degree=4)
X_poly_4 = poly_f_4.fit_transform(X_train)
poly_r_4 = LinearRegression().fit(X_poly_4, y_train)
poly_r_4

LinearRegression()

In [14]:
# Apply the already-fitted polynomial expansion to the test rows (transform only,
# no refit), predict, and print predictions beside the true targets.
X_test_poly_4 = poly_f_4.transform(X_test)
y_pred = poly_r_4.predict(X_test_poly_4)
print(np.column_stack((y_pred, y_test)))

[[433.94393014 431.23      ]
 [457.90442554 460.01      ]
 [460.52454365 461.14      ]
 ...
 [469.52872254 473.26      ]
 [438.26560574 438.        ]
 [461.66502572 463.28      ]]


In [15]:
# Root-mean-squared error of the degree-4 polynomial model on the test split.
rms = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(rms)

3.9808514634682273


In [16]:
# R^2 of the degree-4 polynomial model on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9458193591299994

### Support Vector Regression (SVR)

In [17]:
# Features and target get separate scalers, each fitted on the training split only.
sc_X, sc_y = StandardScaler(), StandardScaler()
X_svr = sc_X.fit_transform(X_train)
# The scaler works on 2-D input, so the 1-D target is reshaped to a single column.
y_svr = sc_y.fit_transform(y_train.reshape(-1, 1))

In [18]:
# RBF-kernel support vector regression on the standardized training data.
# SVR.fit expects a 1-D target of shape (n_samples,); y_svr comes out of the
# scaler as an (n_samples, 1) column, so it is flattened here to avoid
# scikit-learn's DataConversionWarning.
svr_r = SVR(kernel = 'rbf')
svr_r.fit(X_svr, y_svr.ravel())

  return f(**kwargs)


SVR()

In [19]:
# Predict in standardized space, then map back to the original target units.
# predict() returns a 1-D array, while StandardScaler.inverse_transform is
# documented for 2-D (n_samples, n_features) input, so the predictions are
# reshaped to a single column before un-scaling and flattened back afterwards.
y_pred_scaled = svr_r.predict(sc_X.transform(X_test))
y_pred = sc_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
# Predictions (left column) next to the true targets (right column).
print(np.column_stack((y_pred, y_test)))

[[434.05242921 431.23      ]
 [457.93810186 460.01      ]
 [461.03113894 461.14      ]
 ...
 [470.60268461 473.26      ]
 [439.41653548 438.        ]
 [460.91757115 463.28      ]]


In [20]:
# Root-mean-squared error of the SVR predictions, measured in original target units.
mse = mean_squared_error(y_test, y_pred)
rms = sqrt(mse)
print(rms)

3.8969776670879264


In [21]:
# R^2 of the SVR model on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9480784049986257

### Regression Tree

In [22]:
# Single decision tree on the unscaled training data; random_state pins the
# tie-breaking so the fitted tree is reproducible.
tree_r = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
tree_r

DecisionTreeRegressor(random_state=0)

In [23]:
# Tree predictions for the held-out rows, printed beside the true targets.
y_pred = tree_r.predict(X_test)
side_by_side = np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1)
print(side_by_side)

[[431.28 431.23]
 [459.59 460.01]
 [460.06 461.14]
 ...
 [471.46 473.26]
 [437.76 438.  ]
 [462.74 463.28]]


In [24]:
# Root-mean-squared error of the decision tree on the test split.
rms = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(rms)

4.748588631545224


In [25]:
# R^2 of the decision tree on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.922905874177941

### Regression Forest

In [26]:
# Random forest of 10 trees on the unscaled training data; random_state makes
# the bootstrap sampling reproducible.
forest_r = RandomForestRegressor(n_estimators=10, random_state=0)
forest_r.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [27]:
# Forest predictions for the held-out rows, printed beside the true targets.
y_pred = forest_r.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[434.049 431.23 ]
 [458.785 460.01 ]
 [463.02  461.14 ]
 ...
 [469.479 473.26 ]
 [439.566 438.   ]
 [460.385 463.28 ]]


In [28]:
# Root-mean-squared error of the random forest on the test split.
mse = mean_squared_error(y_test, y_pred)
rms = sqrt(mse)
print(rms)

3.3517479009675766


In [29]:
# R^2 of the random forest on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9615908334363876