In [109]:
# Importing necessary libraries for plotting, numerical operations, and statistical modeling
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

In [None]:
from ucimlrepo import fetch_ucirepo  # Import the fetch_ucirepo function from the ucimlrepo module
  
# Download UCI dataset 242 (energy efficiency) through ucimlrepo.
energy_efficiency = fetch_ucirepo(id=242)

# Keep the feature table and the target table as separate objects.
X, y = energy_efficiency.data.features, energy_efficiency.data.targets

# Preview the first rows of each table (features first, then targets).
for frame in (X, y):
    print(frame.head())

In [111]:
# Build the numpy inputs from the fetched dataset rather than from `X` itself.
# The previous `X = X.values` failed on a second run of this cell because `X`
# had already been replaced by an ndarray, which has no `.values` attribute.
X = energy_efficiency.data.features.to_numpy()  # feature matrix, one row per sample
Y = y.iloc[:, 0].to_numpy()  # first target column only, as a 1-D array

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.2, random_state=42)

# Report the shape of each of the four resulting arrays.
for label, arr in (
    ("Training features shape:", Xtr),
    ("Testing features shape:", Xte),
    ("Training target shape:", Ytr),
    ("Testing target shape:", Yte),
):
    print(label, arr.shape)

In [113]:
from sklearn.preprocessing import StandardScaler  # Import the StandardScaler class from sklearn.preprocessing
# Learn the scaling statistics from the training features only, so that no
# information from the test set leaks into the scaler.
normalizer = StandardScaler().fit(Xtr)

# Apply the same fitted transformation to both splits.
Xtr, Xte = normalizer.transform(Xtr), normalizer.transform(Xte)

In [114]:
# Prepend a column of ones so the regression can estimate an intercept.
#
# The guard makes this cell safe to re-run: a design matrix only receives the
# intercept column while it still has exactly as many columns as the raw
# feature matrix `X`. Without it, every re-execution stacked one more column
# of ones, producing perfectly collinear regressors and a coefficient vector
# that no longer lines up with the feature labels.
n_raw_features = X.shape[1]

if Xtr.shape[1] == n_raw_features:
    Xtr = np.hstack([np.ones((Xtr.shape[0], 1)), Xtr])  # training design matrix

if Xte.shape[1] == n_raw_features:
    Xte = np.hstack([np.ones((Xte.shape[0], 1)), Xte])  # testing design matrix

In [None]:
# Labels are read from the "description" column of the dataset's variable
# table: the first eight entries are used as feature names and the ninth as
# the target name.
descriptions = energy_efficiency.variables["description"]
features_names = descriptions.iloc[:8].tolist()
target_name = descriptions.iloc[8]

print(features_names)  # list of feature labels
print(target_name)  # label of the target variable

In [116]:
# Label the intercept column that is prepended to the design matrix.
# Only add the label when it is not already in front: re-running this cell
# used to prepend 'Constant' again, leaving more labels than coefficients and
# breaking the bar plots that pair the two.
if not features_names or features_names[0] != 'Constant':
    features_names = ['Constant'] + features_names

In [None]:
# Ordinary least squares on the training split; the intercept is carried by
# the column of ones already present in Xtr.
model = sm.OLS(Ytr, Xtr).fit()

# Pull out the quantities used later: coefficients, p-values and R-squared.
aa, p_vals, rr = model.params, model.pvalues, model.rsquared

# Print each quantity with its label.
for label, value in (
    ("Regression Coefficients (aa):", aa),
    ("P-values (p_vals):", p_vals),
    ("R-squared (rr):", rr),
):
    print(label, value)

In [None]:
# Predictions of the fitted model on the training split.
Ytr_pred = model.predict(Xtr)

# Actual vs. predicted scatter for the training data on a 4x4 inch figure.
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(Ytr, Ytr_pred, alpha=0.7)  # 70% opacity so overlapping points stay visible

# Red dashed y = x reference: points on this line are predicted exactly.
ideal = [Ytr.min(), Ytr.max()]
ax.plot(ideal, ideal, 'r--')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted for Training Data')
plt.show()

In [None]:
# Predictions of the fitted model on the held-out test split.
Yte_pred = model.predict(Xte)

# Actual vs. predicted scatter for the testing data on a 4x4 inch figure.
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(Yte, Yte_pred, alpha=0.7)  # 70% opacity so overlapping points stay visible

# Red dashed y = x reference: points on this line are predicted exactly.
ideal = [Yte.min(), Yte.max()]
ax.plot(ideal, ideal, 'r--')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted for Testing Data')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the fitted coefficients, one bar per entry of features_names,
# drawn on a wide, short (10x2 inch) figure.
fig, ax = plt.subplots(figsize=(10, 2))
ax.bar(features_names, aa)
ax.set_xlabel('Features')
ax.set_ylabel('Regression Coefficients')
ax.set_title('Regression Coefficients for Each Feature')
ax.tick_params(axis='x', labelrotation=90)  # vertical labels so long names do not overlap
plt.show()

In [None]:
# Bar chart of the coefficient p-values, one bar per entry of features_names,
# drawn on a wide, short (10x2 inch) figure.
fig, ax = plt.subplots(figsize=(10, 2))
ax.bar(features_names, p_vals)
ax.set_xlabel('Features')
ax.set_ylabel('P-values')
ax.set_title('P-values for Each Feature')
ax.tick_params(axis='x', labelrotation=90)  # vertical labels so long names do not overlap
plt.show()