In [14]:
from ucimlrepo import fetch_ucirepo

# Download UCI dataset 851 (Steel Industry Energy Consumption).
steel_industry_energy_consumption = fetch_ucirepo(id=851)

# Feature and target tables, provided by ucimlrepo as pandas dataframes.
dataset_tables = steel_industry_energy_consumption.data
X = dataset_tables.features
y = dataset_tables.targets

# Print the dataset-level metadata, then the per-variable information.
for summary in (
    steel_industry_energy_consumption.metadata,
    steel_industry_energy_consumption.variables,
):
    print(summary)

{'uci_id': 851, 'name': 'Steel Industry Energy Consumption', 'repository_url': 'https://archive.ics.uci.edu/dataset/851/steel+industry+energy+consumption', 'data_url': 'https://archive.ics.uci.edu/static/public/851/data.csv', 'abstract': 'The data is collected from a smart small-scale steel industry in South Korea.', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 35040, 'num_features': 9, 'feature_types': ['Real', 'Categorical'], 'demographics': [], 'target_col': ['Load_Type'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2021, 'last_updated': 'Mon Aug 14 2023', 'dataset_doi': '10.24432/C52G8C', 'creators': ['Sathishkumar V E', 'Changsun Shin', 'Yongyun Cho'], 'intro_paper': {'title': 'Efficient energy consumption prediction model for a data analytic-enabled industry building in a smart city', 'authors': 'Sathishkumar V E, Changsun Shin, Yongyun Cho', 'published

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn import preprocessing 

In [16]:
# Persist the complete original table (features + target) to disk without
# the index, then reload it so `df` is built from the CSV copy.
original_table = steel_industry_energy_consumption.data.original
original_table.to_csv("lab3.csv", index=False)
df = pd.read_csv("lab3.csv")

In [17]:
# Preview the first rows of the frame reloaded from lab3.csv.
df.head()

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,01/01/2018 00:15,3.17,2.95,0.0,0.0,73.21,100.0,900,Weekday,Monday,Light_Load
1,01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,Light_Load
2,01/01/2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,Light_Load
3,01/01/2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,Light_Load
4,01/01/2018 01:15,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,Light_Load


In [18]:
# Display the feature table obtained from fetch_ucirepo (no date / Load_Type columns).
X

Unnamed: 0,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week
0,3.17,2.95,0.00,0.0,73.21,100.00,900,Weekday,Monday
1,4.00,4.46,0.00,0.0,66.77,100.00,1800,Weekday,Monday
2,3.24,3.28,0.00,0.0,70.28,100.00,2700,Weekday,Monday
3,3.31,3.56,0.00,0.0,68.09,100.00,3600,Weekday,Monday
4,3.82,4.50,0.00,0.0,64.72,100.00,4500,Weekday,Monday
...,...,...,...,...,...,...,...,...,...
35035,3.85,4.86,0.00,0.0,62.10,100.00,82800,Weekday,Monday
35036,3.74,3.74,0.00,0.0,70.71,100.00,83700,Weekday,Monday
35037,3.78,3.17,0.07,0.0,76.62,99.98,84600,Weekday,Monday
35038,3.78,3.06,0.11,0.0,77.72,99.96,85500,Weekday,Monday


In [19]:
# Lagged label: each row receives the Load_Type of the row directly above it.
df["Prev_Load_Type"] = df["Load_Type"].shift(periods=1)
# The first row has no predecessor (its lag is NaN), so it is removed.
df = df.drop(index=df.index[0])

# One-hot encode the categorical columns. With an empty prefix the new
# columns are named "_<category>" (e.g. "_Monday", "_Weekday", "_Light_Load").
categorical_columns = ["Day_of_week", "WeekStatus", "Prev_Load_Type"]
one_hot_features = pd.get_dummies(
    df[categorical_columns],
    prefix="",
    dtype=np.int64,
)
df = pd.concat([df, one_hot_features], axis=1)

In [20]:
# Preview the frame after dropping the first row and appending the one-hot columns.
df.head()

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,...,_Saturday,_Sunday,_Thursday,_Tuesday,_Wednesday,_Weekday,_Weekend,_Light_Load,_Maximum_Load,_Medium_Load
1,01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,...,0,0,0,0,0,1,0,1,0,0
2,01/01/2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,...,0,0,0,0,0,1,0,1,0,0
3,01/01/2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,...,0,0,0,0,0,1,0,1,0,0
4,01/01/2018 01:15,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,...,0,0,0,0,0,1,0,1,0,0
5,01/01/2018 01:30,3.28,3.56,0.0,0.0,67.76,100.0,5400,Weekday,Monday,...,0,0,0,0,0,1,0,1,0,0


In [21]:
# The string-valued lag column is no longer needed once its one-hot
# columns have been appended to the frame.
df = df.drop(columns=["Prev_Load_Type"])

In [22]:
# Numeric columns that are passed through `normalize`.
cols_to_scale = [
    "Usage_kWh",
    "Leading_Current_Reactive_Power_kVarh",
    "Lagging_Current_Reactive.Power_kVarh",
    "CO2(tCO2)",
    "Lagging_Current_Power_Factor",
    "Leading_Current_Power_Factor",
    "NSM",
]


def normalize(data: "np.ndarray | pd.DataFrame"):
    """Standardize ``data`` column-wise to zero mean and unit standard deviation.

    Statistics are computed along axis 0. The standard deviation comes from
    ``np.std`` with its default ``ddof=0`` (population standard deviation).

    Args:
        data: Numeric NumPy array or pandas DataFrame (anything supporting
            ``.mean(axis=0)`` and ``np.std(..., axis=0)``).

    Returns:
        ``(data - mean) / std`` with the same shape and container type as
        ``data``. Columns with zero standard deviation (constant columns)
        come back as all zeros rather than NaN/inf.
    """
    mean = data.mean(axis=0)
    std = np.std(data, axis=0)
    # A constant column has zero spread; dividing by it would yield NaN/inf.
    # Dividing by 1 instead leaves the centered values (all zeros) untouched.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

# Replace the selected numeric columns with their standardized values.
scaled_columns = normalize(df[cols_to_scale])
df[cols_to_scale] = scaled_columns

In [23]:
# Build the model inputs: everything in `df` except the raw string columns
# (timestamp, categoricals, target label) and the three "_*_Load" columns.
# NOTE(review): "_Light_Load" / "_Medium_Load" / "_Maximum_Load" are the
# one-hot columns produced from Prev_Load_Type, so the lagged load type is
# not part of X — confirm this exclusion is intentional.
raw_string_columns = ["date", "Day_of_week", "WeekStatus", "Load_Type"]
prev_load_dummy_columns = ["_Light_Load", "_Medium_Load", "_Maximum_Load"]
X = df.drop(columns=raw_string_columns + prev_load_dummy_columns)

In [24]:
# Target labels: the raw Load_Type column of the processed frame.
Y = df["Load_Type"]

In [25]:
# One-hot encode the target labels as int64 indicator columns; the empty
# prefix yields column names of the form "_<label>".
Y = pd.get_dummies(
    Y,
    prefix="",
    dtype=np.int64,
)

In [26]:
# Write the feature matrix and the one-hot targets to CSV, omitting the index.
for frame, csv_name in ((X, "lab3_train.csv"), (Y, "lab3_target.csv")):
    frame.to_csv(csv_name, index=False)

In [27]:
# Class balance check: rows with (1) and without (0) the Light_Load label.
Y["_Light_Load"].value_counts()

_Light_Load
1    18071
0    16968
Name: count, dtype: int64

In [28]:
# Class balance check: rows with (1) and without (0) the Maximum_Load label.
Y["_Maximum_Load"].value_counts()

_Maximum_Load
0    27767
1     7272
Name: count, dtype: int64

In [29]:
# Class balance check: rows with (1) and without (0) the Medium_Load label.
Y["_Medium_Load"].value_counts()

_Medium_Load
0    25343
1     9696
Name: count, dtype: int64