## Multiple Linear Regression

In [5]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
# Read csv
file_path = '../data/final/wo_na.csv'
df = pd.read_csv(file_path)

In [7]:
# Define x and y variables
X = df[['Iron Ore (CFR, $/t)', 'HCC (Aus FOB, $/t)',
         'Domestic Scrap (DDP Jiangsu incl. VAT $/t)',
         'Monthly Export of Semis & Finished Steel as % of Production',
         'FAI in urban real estate development (y-o-y) Growth',
         'Automobile Production (y-o-y)', 'Civil Metal-Vessels/Steel Ships (y-o-y)',
         'Household Fridges (y-o-y)', 'Air Conditioner (y-o-y)']]
y = df['HRC (FOB, $/t)']

In [51]:
# Scale x variables since they are of different magnitudes
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[-1.21162485e+00 -9.06443942e-01 -1.35172198e+00 ... -1.29702093e-01
   1.85324573e+00 -3.72634321e-01]
 [-1.13077569e+00 -1.24208298e+00 -1.27679578e+00 ...  8.01746133e-02
   1.67753934e+00 -2.07643759e-01]
 [-1.13077569e+00 -1.25291005e+00 -1.25806423e+00 ... -4.03101625e-02
   1.46522744e+00 -1.78181158e-01]
 ...
 [-1.33636003e-01  6.79919871e-02 -4.71339093e-01 ... -3.20145771e-01
  -1.16130105e-01 -1.42826038e-01]
 [-2.68384609e-01 -1.37722265e-01 -5.55631072e-01 ... -3.20145771e-01
  -1.01487906e-01 -1.31040997e-01]
 [ 1.11260317e-03  4.63378553e-02 -3.96412890e-01 ... -3.12372560e-01
  -2.82769083e-02 -1.19255957e-01]]


In [52]:
# Split into training and testing sets with a ratio of 80:20
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [53]:
# Model fitting
model = LinearRegression()
model.fit(X_train, y_train)

In [54]:
# Obtain regression model coefficients
print('Intercept: ', model.intercept_)
list(zip(X, model.coef_))

Intercept:  561.2766137542528


[('Iron Ore (CFR, $/t)', -9.317858722131529),
 ('HCC (Aus FOB, $/t)', 48.46583370910989),
 ('Domestic Scrap (DDP Jiangsu incl. VAT $/t)', 94.30050218404277),
 ('Monthly Export of Semis & Finished Steel as % of Production',
  23.97714239539002),
 ('FAI in urban real estate development (y-o-y) Growth', 24.09638822586932),
 ('Automobile Production (y-o-y)', 5.894560942831938),
 ('Civil Metal-Vessels/Steel Ships (y-o-y)', 4.137037760670328),
 ('Household Fridges (y-o-y)', 4.8634876396297955),
 ('Air Conditioner (y-o-y)', -13.68563478392782)]

In [55]:
# Model predictions
y_pred = model.predict(X_test)

In [56]:
# Error metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print('Mean Absolute Error: ', mae)
print('Mean Squared Error: ', mse)
print('Root Mean Squared Error: ', rmse)
print('R-squared: ', r2)

Mean Absolute Error:  48.743183460959365
Mean Squared Error:  6224.921352379985
Root Mean Squared Error:  78.89817077968276
R-squared:  0.7344433105743483
