|<h2>Substack post:</h2>|<h1><a href=" " target="_blank">Least squares part 3: real-data examples</a></h1>|
|-|:-:|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns

# new: for running regression models
import statsmodels.api as sm

In [None]:
### Run this cell only if you're using "dark mode"

# render inline figures as svg (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# the theme uses two colors: one for backgrounds, one for everything drawn on top
darkBackground  = '#383838'
lightForeground = '#DDE2F4'

# start with the settings that are not colors: hide top/right spines, bold titles and labels
darkTheme = {
    'axes.spines.right': False,
    'axes.spines.top':   False,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
}

# figure and axes backgrounds
for rcKey in ('figure.facecolor','figure.edgecolor','axes.facecolor'):
    darkTheme[rcKey] = darkBackground

# axes outlines, axis labels, tick marks, and text
for rcKey in ('axes.edgecolor','axes.labelcolor','xtick.color','ytick.color','text.color'):
    darkTheme[rcKey] = lightForeground

plt.rcParams.update(darkTheme)

# Import and visualize the data

In [None]:
# dataset website ref
# https://archive.ics.uci.edu/dataset/437/residential+building+data+set


# download the zip file
!wget https://archive.ics.uci.edu/static/public/437/residential+building+data+set.zip -O z.zip

# unpack it locally
import zipfile
with zipfile.ZipFile('z.zip','r') as zz:
  zz.extractall('./')

# import into pandas
data = pd.read_excel('/content/Residential-Building-Data-Set.xlsx',skiprows=1,usecols='F,V,AA,DD')
data.columns = ['FloorArea','Interest','CPI','Price']
data

In [None]:
# pairplot of the four imported variables

# colors for the off-diagonal and the diagonal panels
offDiagStyle = {'color':[.7,.7,.9,.7]}
diagStyle    = {'color':[.7,.9,.7]}

sns.pairplot(
    data,
    vars=['FloorArea','CPI','Price','Interest'],
    height=2,
    aspect=1.5,
    plot_kws=offDiagStyle,
    diag_kws=diagStyle,
)

plt.tight_layout()
plt.show()

# Transform and clean the data

In [None]:
# transformations

# natural log of price and floor area, stored as new 'log-' columns
for rawName in ('Price','FloorArea'):
  data['log-' + rawName] = np.log(data[rawName])

# binarize the interest rate: 1 where it exceeds 14.5, otherwise 0
data['bin-Interest'] = (data['Interest'] > 14.5).astype('int64')

data

In [None]:
# redo the pairplot with the new (log-transformed) variables

# colors for the off-diagonal and the diagonal panels
offDiagStyle = {'color':[.7,.7,.9,.7]}
diagStyle    = {'color':[.7,.9,.7]}

sns.pairplot(
    data,
    vars=['log-FloorArea','CPI','log-Price'],
    height=2,
    aspect=1.5,
    plot_kws=offDiagStyle,
    diag_kws=diagStyle,
)

plt.tight_layout()
plt.show()

In [None]:
# z-score magnitude used as the outlier threshold
zThresh = 3

# z-transform the three variables of interest into a separate dataframe
# (each column: subtract its mean, divide by its sample standard deviation)
zColumns = ['log-Price','log-FloorArea','CPI']
data_z = data[zColumns].apply(lambda column: (column - column.mean()) / column.std(ddof=1))

# box plots of z-scored data, with white lines drawn on top (zorder=10)
whiteOnTop = {'color':'w','zorder':10}

plt.figure(figsize=(4,5))
boxAxes = sns.boxplot(data=data_z,
                      medianprops=dict(whiteOnTop),
                      whiskerprops=dict(whiteOnTop),
                      capprops=dict(whiteOnTop),
                      boxprops={'edgecolor':'w'})
bh = boxAxes.set(xlabel='Data feature',ylabel='Data value (z)')

# overlay the individual data points
sns.stripplot(data=data_z,jitter=True,alpha=.5,edgecolor='w',linewidth=.5)

# dashed lines at the positive and negative z-thresholds
for threshLine in (zThresh,-zThresh):
  plt.axhline(y=threshLine,color=[.9,.7,.7],linestyle='--')

plt.tight_layout()
plt.show()

In [None]:
# remove the outliers from the original data

nRowsBefore = len(data)
print(f'Pre-cleaned dataset has {nRowsBefore} rows.')

zThresh = 3

# keep a row only if every z-scored variable is within the threshold
withinThresh = data_z.abs().le(zThresh).all(axis=1)
data = data[withinThresh].copy()

nRowsAfter = len(data)
print(f'Post-cleaned dataset has {nRowsAfter} rows.')

# tip: try re-running the previous cell to recreate the boxplot

# Run the analysis

In [None]:
# 1) add an intercept term (a column of ones)
data['Intercept'] = np.ones(len(data))

# 2) add an interaction between the binarized interest rate and CPI
data['Int X CPI'] = data['bin-Interest']*data['CPI']

# 3) fit the model
# The regressors are selected by name instead of dropping the unwanted columns.
# If this cell is re-run after 'Predicted' and 'Residuals' have been added to `data`,
# a drop-based design matrix would silently include them as regressors.
# The column order matches what the drop-based version gives on a first run.
designColumns = ['CPI','log-FloorArea','bin-Interest','Intercept','Int X CPI']
desmat = data[designColumns]
model = sm.OLS(data['log-Price'],desmat).fit()



# show the regression summary
print(model.summary())

In [None]:
# plot with predicted data and residuals

# color mapping for visualization; the keys are the two values of 'bin-Interest'
colorPalette = {0:[.3,.3,.9,.7],1:[.9,.3,.3,.7]}

# readable legend names for the two 'bin-Interest' levels
# (seaborn is expected to label them '0' and '1'; any other entry is left unchanged)
legendNames = {'0':'Low int.','1':'High int.'}

# generate predicted log-price and residuals
data['Predicted'] = model.predict(desmat)
# residual = observed minus predicted (the usual convention, same sign as model.resid)
data['Residuals'] = data['log-Price'] - data['Predicted']



### now for the visualizations
fig,axs = plt.subplots(2,2,figsize=(12,8))

# scatter plot of observed data
sns.scatterplot(x='CPI',y='log-Price',hue='bin-Interest',data=data,linewidth=.5,
                style='bin-Interest',palette=colorPalette,ax=axs[0,0],s=80)

# model predictions, drawn as small dots on top of the observed data
sns.scatterplot(x='CPI',y='Predicted',data=data,color='k',
                ax=axs[0,0],s=20,edgecolor='m',linewidth=.5)
axs[0,0].set_title('1)  Data and predictions')

# predicted by observed
sns.scatterplot(x='log-Price',y='Predicted',hue='bin-Interest',data=data,linewidth=.5,
                style='bin-Interest',palette=colorPalette,ax=axs[0,1],s=80)
axs[0,1].set_title('2)  Data by predictions')

# residuals plot
sns.scatterplot(x='Predicted',y='Residuals',hue='bin-Interest',data=data,linewidth=.5,
                ax=axs[1,0],s=80,style='bin-Interest',palette=colorPalette)
axs[1,0].set_title('3)  Residuals Plot')


# histograms of residuals separated by category
sns.histplot(data=data,x='Residuals',hue='bin-Interest',
             palette=colorPalette,ax=axs[1,1])
axs[1,1].set(xlabel='Residuals',ylabel='Count')
axs[1,1].set_title('4)  Residuals histograms')

# rename the legend entries
# The text of the legend seaborn already built is edited in place, so each name stays
# attached to its own marker or bar. Calling a.legend([...]) with labels only would
# pair names with artists by drawing order, which can mislabel them.
for a in axs.flatten():
  panelLegend = a.get_legend()
  if panelLegend is None:
    continue
  panelLegend.set_title('') # an empty title hides the legend title
  for entry in panelLegend.get_texts():
    entry.set_text(legendNames.get(entry.get_text(),entry.get_text()))


plt.tight_layout()
plt.show()

In [None]:
# Stand-alone version of panel 1 whose legend also lists the model predictions.
# It is drawn on a new figure: the 2x2 figure from the previous cell has already been
# shown, so drawing on axs[0, 0] again would stack duplicate points on the old axes
# and leave plt.show() without a new figure to display.
figPred, axPred = plt.subplots(figsize=(6, 4))

# Scatter plot with the original data
sns.scatterplot(x='CPI', y='log-Price', hue='bin-Interest', data=data, linewidth=0.5,
                style='bin-Interest', palette=colorPalette, ax=axPred, s=80)

# Model predictions (small dots), labelled so they appear in the legend
sns.scatterplot(x='CPI', y='Predicted', data=data, color='k', label='Predicted',
                ax=axPred, s=20, edgecolor='m', linewidth=0.5)

# Collect the legend entries created so far
handles, labels = axPred.get_legend_handles_labels()

# Add the 'Predicted' entry explicitly if it's missing
if 'Predicted' not in labels:
    # scatter plots are stored as collections (not lines); the predictions were drawn last
    handles.append(axPred.collections[-1])
    labels.append('Predicted')

# Use the same readable group names as the 2x2 figure
# (entries other than '0' and '1' are left unchanged)
groupNames = {'0': 'Low int.', '1': 'High int.'}
labels = [groupNames.get(label, label) for label in labels]

# Set the legend with all desired labels
axPred.legend(handles=handles, labels=labels)

# Title for the plot
axPred.set_title('1) Data and predictions')

# Show the plot
plt.tight_layout()
plt.show()
