In [None]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import sklearn

In [None]:
# Load the 2018 public-use data; the feature matrix that gets predicted on later is built from it.
df_2018 = pd.read_csv('2018_public_use_data.csv')
print(df_2018.shape)

In [None]:
# Work on a copy so the raw 2018 frame (df_2018) is not touched by the cleaning cells.
df_filled = df_2018.copy()

In [None]:
# Spot-check five random rows of the working copy.
df_filled.sample(5)

In [None]:
# 2018 codebook; the cells below read its col_name, category, col_type and codes_dict columns.
df_codebook = pd.read_csv('2018cbecs_codebook_wDicts.csv')
df_codebook.head()

In [None]:
# Distinct codebook categories; the same values are used as the keys of category_map.
print(df_codebook.category.unique())

In [None]:
# 2012 codebook; its col_name column is used to find the variables shared with 2018.
df_codebook_2012 = pd.read_csv('2012cbecs_codebook_wDicts.csv')
df_codebook_2012.head()

In [None]:
# Variable names documented in each survey year's codebook, as plain Python lists.
col_name_2012 = df_codebook_2012['col_name'].tolist()
col_name_2018 = df_codebook['col_name'].tolist()

In [None]:
# Keep only the 2018 codebook rows whose variable name also appears in the 2012 codebook.
keep_var = df_codebook[df_codebook['col_name'].isin(col_name_2012)]
keep_var.shape

## 330 overlapping variables 

In [None]:
# Names of the variables present in both codebooks.
keep_var_list = keep_var['col_name'].tolist()

In [None]:
## Read 2012 Data
# df_2012 feeds the training features; df_2012_cons holds the *BTU columns that are
# summed into the training target further down.

df_2012 = pd.read_csv('2012cbecs_train-data.csv')
df_2012_cons = pd.read_csv('2012cbecs_consumption-data.csv')

In [None]:
# Spot-check five random rows of the 2012 feature file.
df_2012.sample(5)

In [None]:
# Restrict the 2012 features to the variables found in both codebooks (keep_var_list).
df_2012 = df_2012[df_2012.columns.intersection(keep_var_list)]
df_2012.shape

In [None]:
# Spot-check five random rows of the 2012 consumption file.
df_2012_cons.sample(5)

In [None]:
# Map every codebook category to the variable names filed under it.
# Entries of col_name that are not plain strings (e.g. NaN from blank cells) are left out.
category_map = {}
for cat in df_codebook.category.unique():
    names_in_cat = df_codebook.loc[df_codebook['category'] == cat, 'col_name']
    category_map[cat] = [name for name in names_in_cat.values if type(name) is str]

In [None]:
# Flatten category_map into one list holding every categorised column name.
keep_cols = [col for wanted_cols in category_map.values() for col in wanted_cols]
print(len(keep_cols))

In [None]:
def find_num_cat_cols(list_of_dfs, category_num, list_of_numcols, list_of_catcols):
    """Split one frame's columns into numeric and categorical groups.

    Args:
        list_of_dfs: Collection of DataFrames that supports ``list_of_dfs[category_num]``.
        category_num: Index or key selecting the frame to inspect.
        list_of_numcols: Column names to treat as numeric.
        list_of_catcols: Column names to treat as categorical. A name listed in
            both collections is counted as numeric.

    Returns:
        A tuple ``(df_category, numeric_cols, categorical_cols)``. ``df_category``
        is taken from a copy of the selected frame and holds the numeric columns
        followed by the categorical ones; columns in neither list are dropped.
    """
    source = list_of_dfs[category_num]
    numeric_cols = [name for name in source.columns if name in list_of_numcols]
    categorical_cols = [
        name
        for name in source.columns
        if name not in list_of_numcols and name in list_of_catcols
    ]
    ordered = numeric_cols + categorical_cols
    df_category = source.copy()[ordered]
    return df_category, numeric_cols, categorical_cols

In [None]:
def create_new_codebook(codebook, cols):
    """Return ``{col_name: codes_dict}`` for the requested columns.

    Args:
        codebook: DataFrame with at least ``col_name`` and ``codes_dict`` columns.
        cols: Column names to look up; a name missing from ``col_name`` makes the
            ``.loc`` lookup raise ``KeyError``.

    Returns:
        A plain dict keyed by column name whose values are the raw ``codes_dict`` entries.
    """
    by_name = codebook.set_index('col_name')
    selected_codes = by_name.loc[cols]['codes_dict']
    return dict(selected_codes)

In [None]:
def _parse_codes_dict(raw_codes):
    """Turn one ``codes_dict`` string into a dict.

    Returns:
        ``(codes, first_error)``. ``codes`` is ``None`` when no parsing strategy
        yields a dict. ``first_error`` is the ``ValueError`` raised by the plain
        JSON attempt, or ``None`` when that attempt succeeded.
    """
    import ast  # local import: only the last-resort parser needs it

    try:
        parsed = json.loads(raw_codes.replace("'", '"'))
    except ValueError as exc:
        first_error = exc
    else:
        return (parsed if isinstance(parsed, dict) else None), None

    retries = (
        # Original fallback: drop embedded double quotes, then swap the quote style.
        lambda text: json.loads(text.replace('"', '').replace("'", '"')),
        # Last resort: read the text as a Python literal, which copes with
        # apostrophes inside values such as "Don't know".
        ast.literal_eval,
    )
    for parse in retries:
        try:
            parsed = parse(raw_codes)
        except (ValueError, SyntaxError, TypeError):
            continue
        if isinstance(parsed, dict):
            return parsed, first_error
    return None, first_error


def create_replace_map(new_codebook, category_df, return_errors=False):
    """Find the columns whose codebook maps code ``'2'`` to ``'No'``.

    The result is meant to be passed to ``DataFrame.replace(to_replace=..., value=...)``
    so that code 2 can be recoded in exactly those columns.

    Args:
        new_codebook: Mapping ``col_name -> codes_dict`` where each value is the
            text form of a dict (single-quoted, as produced by ``str(dict)``).
            Values that are not strings (e.g. NaN) are skipped.
        category_df: DataFrame being cleaned; only columns present in it are mapped.
        return_errors: When truthy, also return the parse-error map.

    Returns:
        ``replace_map`` (``{col_name: 2}``), or ``(replace_map, error_map)`` when
        ``return_errors`` is truthy. ``error_map`` maps each column whose first
        JSON parse failed to ``(col_name, exception)``; such a column can still
        end up in ``replace_map`` if a fallback parser succeeds.
    """
    replace_map = {}
    error_map = {}
    for col_name, raw_codes in new_codebook.items():
        if not isinstance(raw_codes, str):
            # No textual code dictionary for this column: nothing to inspect.
            continue

        codes, first_error = _parse_codes_dict(raw_codes)
        if first_error is not None:
            print(f'JSONDecodeError on column {col_name}')
            error_map[col_name] = (col_name, first_error)

        if codes is None or col_name not in category_df.columns:
            continue
        if codes.get('2') == 'No':
            replace_map[col_name] = 2

    if return_errors:
        return replace_map, error_map
    return replace_map

In [None]:
def build_hist(column_list, data=None):
    """Draw one histogram with a KDE overlay per column, each in its own figure.

    Args:
        column_list: Names of the columns to plot. Figures are numbered 0, 1, 2, ...
            in this order.
        data: DataFrame holding those columns. When omitted, the notebook-level
            variable ``df`` is used, as the earlier version of this function did;
            a ``NameError`` is raised if no such variable exists.
    """
    for figname, column in enumerate(column_list):
        # Resolve the frame lazily so an empty column_list never touches `df`.
        frame = df if data is None else data
        # Set the size before creating the figure so the first plot gets it too.
        sns.set(rc={'figure.figsize': (2, 2)})
        plt.figure(figname)
        sns.histplot(data=frame, x=column, kde=True)

In [None]:
# Select the categorised columns from the raw 2018 frame rather than from df_filled
# itself: the result is the same on a fresh run, and re-running this cell after
# df_filled has been narrowed further no longer raises a KeyError.
df_filled = df_2018[keep_cols]
print(df_filled.shape)
df_filled.sample(5)

In [None]:
# One sub-frame per codebook category, keyed by the category value itself.
category_dfs = {
    cat_name: df_filled[cat_list] for cat_name, cat_list in category_map.items()
}
# NOTE(review): this lookup only works if one of the category keys equals 0 — confirm.
category_dfs[0].sample(10)

In [None]:
# Split the codebook's variable names by declared type: rows whose col_type is 'Num'
# are numeric, everything else is treated as categorical.
# A single boolean mask replaces the per-label .loc lookup, which was slow and raised
# an ambiguous-truth ValueError whenever a col_name label occurred more than once.
codebook_check = df_codebook.set_index('col_name')
is_numeric = (codebook_check['col_type'] == 'Num').to_numpy()
num_cols = codebook_check.index[is_numeric].tolist()
cat_cols = codebook_check.index[~is_numeric].tolist()
print(len(num_cols))
print(len(cat_cols))

In [None]:
# Restrict the 2018 features to the variables found in both codebooks (keep_var_list).
df_filled = df_filled[df_filled.columns.intersection(keep_var_list)]
df_filled.shape

In [None]:
# Spot-check five random rows after narrowing to the shared variables.
df_filled.sample(5)

In [None]:
# Keep the per-column code dictionaries under their own name instead of overwriting
# df_codebook: the codebook DataFrame stays available and this cell can be re-run.
codes_by_col = create_new_codebook(df_codebook, list(df_filled.columns))
# replace_map lists the columns where code 2 means 'No'; error_map lists the columns
# whose codes_dict text could not be parsed as JSON on the first attempt.
replace_map, error_map = create_replace_map(codes_by_col, df_filled, return_errors=True)
print(error_map)

In [None]:
# In every column listed in replace_map, turn code 2 (the 'No' code) into 0, then fill
# missing values with 0.
df_filled = df_filled.replace(to_replace=replace_map, value=0).fillna(value=0)
# Any remaining literal '.' cells are also set to 0.
df_filled = df_filled.replace(to_replace='.', value=0)
# Show the first 20 rows to check the result.
df_filled[0:20]

In [None]:
# Columns still stored with object dtype after the recoding above.
objects = list(df_filled.select_dtypes(['object']).columns)
objects

In [None]:
# Cast each object-typed column to int, going through str first.
# NOTE(review): this raises ValueError if any value is not an integer literal — confirm
# the data never contains decimals or other text here.
for col in objects:
    df_filled[col] = df_filled[col].astype(str).astype(int)

In [None]:
# Re-check for object-typed columns after the cast above.
list(df_filled.select_dtypes(['object']).columns)

In [None]:
# Print the dtype of each column that has specific codes recoded in the following cell.
for checked_col in (
    'NFLOOR', 'BASEMNT', 'FLCEILHT', 'NELVTR', 'NESLTR', 'RWSEAT',
    'PBSEAT', 'HCBED', 'NRSBED', 'LODGRM', 'NOCC', 'NOCCAT',
    'XRAYN', 'TVVIDEON', 'RFGCOMPN', 'RFGRSN', 'SERVERN', 'RFGCOMP',
):
    print(df_filled[checked_col].dtype)

In [None]:
## Adjusting for weird values

# Per-column recodes, written as {column: {old_value: new_value}}.
# NOTE(review): the 99x / 999x codes look like top-coded or not-applicable markers —
# confirm each replacement value against the codebook.
sentinel_recodes = {
    'NFLOOR': {994: 10, 995: 15},
    'BASEMNT': {995: 5},
    'FLCEILHT': {995: 51},
    'NELVTR': {995: 31},
    'NESLTR': {995: 11},
    'RWSEAT': {99995: 2001},
    'PBSEAT': {999995: 15001},
    'HCBED': {9995: 251},
    'NRSBED': {9995: 251},
    'LODGRM': {99995: 1001},
    'NOCC': {996: 0},
    'NOCCAT': {996: 0},
    'XRAYN': {995: 21},
    'TVVIDEON': {995: 201},
    'RFGCOMPN': {9995: 1001},
    'RFGRSN': {99995: 1001},
    'SERVERN': {9995: 501},
    'RFGCOMP': {99995: 0},
}
# One DataFrame-level replace instead of `df_filled.COL.replace(..., inplace=True)`:
# the in-place call on an attribute-accessed column is chained assignment, which warns
# in recent pandas and does not modify df_filled at all once Copy-on-Write is active.
df_filled = df_filled.replace(sentinel_recodes)

Training Target 

In [None]:
# Build the consumption column names as <prefix><suffix>BTU. The MF and EL prefixes use
# all ten suffixes; NG, FK and DH use only five of them. Order matches the original list.
full_suffixes = ['HT', 'CL', 'VN', 'WT', 'LT', 'CK', 'RF', 'OF', 'PC', 'OT']
short_suffixes = ['HT', 'CL', 'WT', 'CK', 'OT']
suffixes_by_prefix = {
    'MF': full_suffixes,
    'EL': full_suffixes,
    'NG': short_suffixes,
    'FK': short_suffixes,
    'DH': short_suffixes,
}
col_list = [
    f'{prefix}{suffix}BTU'
    for prefix, suffixes in suffixes_by_prefix.items()
    for suffix in suffixes
]

# Training target: row-wise sum of all listed consumption columns.
y_train = df_2012_cons[col_list].sum(axis=1).to_frame('energy_consumption')
y_train.sample(5)

In [None]:
# Summary statistics of the summed consumption target.
y_train.describe()

In [None]:
# Attach the target to the 2012 features; DataFrame.join matches rows on the index.
train_total = df_2012.join(y_train)
train_total.shape

In [None]:
# Drop rows whose summed consumption is exactly 0; the target is log-transformed later
# and np.log(0) would give -inf.
train_total = train_total[train_total.energy_consumption != 0]
train_total.shape

In [None]:
# Split the filtered training frame back into features (X_train1) and a one-column target frame.
X_train1 = train_total.drop(columns=['energy_consumption'])
y_train = pd.DataFrame(train_total['energy_consumption'])

In [None]:
# (rows, feature columns) of the training features.
X_train1.shape

In [None]:
# Should have the same number of rows as X_train1 and a single column.
y_train.shape

In [None]:
# Take the 2018 features in exactly the training column order: the scaler and PCA
# further down operate on bare NumPy arrays, so columns are matched by position, not
# by name. Raises KeyError if the 2018 frame lacks a column the training frame has.
X_test = df_filled[X_train1.columns].copy()

# PCA

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# NOTE(review): `lr` is not used by any later cell shown in this notebook.
lr = LinearRegression()
scaler_train = StandardScaler()
# No n_components given: this first PCA keeps every component so the explained
# variance can be inspected before choosing how many to keep.
pca_train = PCA()

# Standardise the training features, then project them onto the principal components.
# scaler_train is fitted here and reused later to transform the 2018 data.
X_train = X_train1.to_numpy(dtype=float, copy=True)
X_train_scaled = scaler_train.fit_transform(X_train)
X_train_pca = pca_train.fit_transform(X_train_scaled)

In [None]:
# Share of total variance explained by each principal component, shown as one wide row.
pca_exp_var = pd.DataFrame(pca_train.explained_variance_ratio_,)
pca_exp_var.T

In [None]:
# plt.figure(figsize=(5,5))
# plt.plot(np.cumsum(pca_train.explained_variance_ratio_))
# plt.xlabel('num components')
# plt.ylabel('cumulative explained variance')

In [None]:
# components_ has one row per principal component; after the transpose, columns 0 and 1
# hold the first two components, so each point in the scatter is one input feature.
pca_train_df = pd.DataFrame(pca_train.components_)
sns.set(rc = {'figure.figsize':(5,5)})
sns.scatterplot(x=pca_train_df.T[0], y=pca_train_df.T[1])

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
y = np.cumsum(pca_train.explained_variance_ratio_)
# One x position per fitted component (1-based), derived from the PCA itself so the
# plot keeps working when the number of features is not exactly 330.
xi = np.arange(1, len(y) + 1)

ax.set_ylim(0.0, 1.1)
ax.plot(xi, y, marker='o', linestyle='--', color='b')

ax.set_xlabel('Number of Components')
# Tick every 50 components, extending just past the last component.
ax.set_xticks(np.arange(0, len(y) + 50, step=50))
# The plotted values are fractions between 0 and 1, not percentages.
ax.set_ylabel('Cumulative explained variance (fraction)')
ax.set_title('The number of components needed to explain variance')

ax.axhline(y=0.80, color='r', linestyle='-')
ax.text(0.5, 0.85, '80% cut-off threshold', color='red', fontsize=16)

# Label colour matches the blue 95% line.
ax.axhline(y=0.95, color='b', linestyle='-')
ax.text(0.5, 0.97, '95% cut-off threshold', color='blue', fontsize=16)


ax.grid(axis='x')
plt.show()

# https://www.kaggle.com/code/kushal1506/deciding-n-components-in-pca/notebook

In [None]:
# Refit PCA keeping only the first 200 components; X_train_pca is overwritten with the
# reduced projection and pca_final is reused later to transform the 2018 data.
pca_final = PCA(n_components = 200) # INSERT NUMBER OF COMPONENTS
X_train_pca = pca_final.fit_transform(X_train_scaled)

In [None]:
# (training rows, retained components).
X_train_pca.shape

In [None]:
# pca_exp_var = pd.DataFrame(pca_final.explained_variance_ratio_,)
# pca_exp_var.T

In [None]:
## PCA on 2018 data

# Convert to a float array, then apply the scaler and PCA that were fitted on the
# training data (transform only, no refit).
# np.array accepts both a DataFrame and an ndarray, so re-running this cell after X_test
# has already been converted no longer fails on a missing .to_numpy attribute.
X_test = np.array(X_test, dtype=float)
X_test_scaled = scaler_train.transform(X_test)
X_test_pca = pca_final.transform(X_test_scaled)
X_test_pca.shape

SVM

In [None]:
# Model the natural log of consumption; ravel() flattens the one-column frame into the
# 1-D array passed to the regressor.
y_train_final = np.log(y_train).to_numpy().ravel()
y_train_final

In [None]:
from sklearn import svm
# Linear-kernel support vector regression on the PCA-reduced training features,
# fitted against the log-transformed target.
regr = svm.SVR(kernel='linear')
model_svm = regr.fit(X_train_pca, y_train_final)

In [None]:
# Predict for the 2018 rows; values are on the log scale the model was trained on.
y_pred = model_svm.predict(X_test_pca)
y_pred.shape

In [None]:
# Wrap the 2018 predictions in a one-column frame named 'predicted' and summarise them.
predicted = pd.DataFrame({'predicted': y_pred})
predicted.describe()

In [None]:
# predicted[predicted['predicted'] >= 500].shape
# predicted[predicted['predicted'] <= -0].shape
# df_outliers.to_csv('df_outliers.csv')

Testing with Residuals

In [None]:
# In-sample predictions: run the fitted model on the training projection itself.
testing = pd.DataFrame(model_svm.predict(X_train_pca))
testing

In [None]:
# Put the log target and the in-sample predictions side by side. Both frames are built
# from NumPy arrays and so carry fresh 0..n-1 indexes, which lines them up row by row.
y_train_final = pd.DataFrame(y_train_final).rename(columns={0:'energy_consumption'})
df1=pd.concat([y_train_final,testing],axis=1,join='inner').rename(columns={'energy_consumption': 'original', 0:'predicted'})
df1

In [None]:
# Residual = actual minus predicted, both on the log scale.
df1['residual']= df1['original'] - df1['predicted']
df1

In [None]:
# Residuals against predictions for the training rows.
sns.scatterplot(data=df1, x="predicted", y="residual")

Rerunning SVM without outliers

In [None]:
# Pair each 2018 row with its prediction. The features are taken in training column
# order because they are later converted to a bare array for the scaler and PCA, which
# match columns by position. Raises KeyError if a training column is missing.
df_outliers = pd.concat([df_filled[X_train1.columns], predicted], axis=1, join='inner')
# Keep only rows whose prediction lies strictly between -500 and 500.
# NOTE(review): predictions are on the log scale, so these bounds are extremely wide — confirm.
X_test2 = df_outliers.drop(df_outliers[df_outliers.predicted <= -500].index)
X_test_final = X_test2.drop(X_test2[X_test2.predicted >= 500].index)

In [None]:
# Shape after dropping rows with predictions <= -500.
X_test2.shape

In [None]:
# Shape after additionally dropping rows with predictions >= 500.
X_test_final.shape

In [None]:
# NOTE(review): X_test_final is already a DataFrame at this point, so this wrap looks
# redundant; the bare name on the last line displays the frame.
X_test_final = pd.DataFrame(X_test_final)
X_test_final

In [None]:
# Remove the helper 'predicted' column so only model features remain.
X_test_final = X_test_final.drop(columns=['predicted'])

In [None]:
# Convert the filtered rows to a float array and apply the already-fitted scaler and PCA.
# np.array accepts both a DataFrame and an ndarray, so re-running this cell after
# X_test_final has been converted no longer fails on a missing .to_numpy attribute.
X_test_final = np.array(X_test_final, dtype=float)
X_test_scaled_final = scaler_train.transform(X_test_final)
X_test_pca_final = pca_final.transform(X_test_scaled_final)

In [None]:
# Predict again with the same fitted model, now only for the rows kept by the filter.
y_pred_final = model_svm.predict(X_test_pca_final)
y_pred_final.shape

In [None]:
# One-column frame of the predictions for the filtered 2018 rows.
predicted_final = pd.DataFrame({'predicted': y_pred_final})

In [None]:
# Summary statistics of the filtered predictions (log scale).
predicted_final.describe()