# Supervised Learning: Regression & Classification

Student: Phabel Antonio López Delgado

## Linear Regression

### Load Data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load Data
# The first 22 lines of the file are skipped and the rest is parsed as whitespace-separated
# values without a header row. Each record spans two physical lines, so the odd rows of
# raw_df come out with only their first three columns populated (the rest are NaN).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is not a valid Python escape sequence and triggers a SyntaxWarning otherwise
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
raw_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [3]:
# Describe Data
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the raw table
raw_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,1012.0,1012.0,1012.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,180.143778,12.00835,16.834792,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534
std,188.132839,17.250728,9.912616,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6
25%,0.25783,0.0,8.375,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4
50%,24.021,7.24,18.1,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05
75%,391.435,16.78,21.89,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2
max,396.9,100.0,50.0,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0


In [4]:
# General Info
# Dtype, non-null count per column and memory usage of the raw table
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012 entries, 0 to 1011
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1012 non-null   float64
 1   1       1012 non-null   float64
 2   2       1012 non-null   float64
 3   3       506 non-null    float64
 4   4       506 non-null    float64
 5   5       506 non-null    float64
 6   6       506 non-null    float64
 7   7       506 non-null    float64
 8   8       506 non-null    float64
 9   9       506 non-null    float64
 10  10      506 non-null    float64
dtypes: float64(11)
memory usage: 87.1 KB


In [5]:
# Processing and filtering
# Every record occupies two consecutive rows of raw_df, so split them by parity first
raw_values = raw_df.values
first_halves = raw_values[::2, :]    # even rows: all 11 columns
second_halves = raw_values[1::2, :]  # odd rows: the continuation of each record
# Feature matrix: the 11 even-row columns followed by column 1 of the odd rows.
# Column 0 of the odd rows is not included in the features.
data = np.hstack([first_halves, second_halves[:, 1:2]])
# Target Y values: column 2 of the odd rows
target = second_halves[:, 2]

In [6]:
# Feature matrix (X) as a DataFrame; columns keep pandas' default integer labels
frame = pd.DataFrame(data)
frame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,6.48


In [7]:
# Summary statistics for every column of the feature matrix
frame.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97


In [8]:
# General info for the feature matrix: dtype, non-null count per column and memory usage
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       506 non-null    float64
 1   1       506 non-null    float64
 2   2       506 non-null    float64
 3   3       506 non-null    float64
 4   4       506 non-null    float64
 5   5       506 non-null    float64
 6   6       506 non-null    float64
 7   7       506 non-null    float64
 8   8       506 non-null    float64
 9   9       506 non-null    float64
 10  10      506 non-null    float64
 11  11      506 non-null    float64
dtypes: float64(12)
memory usage: 47.6 KB


### Multiple Linear Regression Model

In [9]:
# Split
# Hold out 20% of the rows for evaluation. The fixed random_state makes the split, and
# therefore every coefficient, metric and plot derived from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    frame, target, test_size=0.2, random_state=42
)
# Model
# Ordinary least squares fit on the training rows only
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
# Predictions on the held-out rows, used for the metrics computed later
predicted = model.predict(X_test)

In [11]:
# Fitted parameters: the per-feature coefficient array first, then the intercept on its own line
print(model.coef_, model.intercept_, sep="\n")

[-1.25164871e-01  3.95796312e-02  3.51524933e-02  2.97249423e+00
 -2.05483754e+01  3.81268948e+00  1.09877090e-02 -1.37806580e+00
  2.64009839e-01 -1.03375629e-02 -9.70440288e-01 -5.85797903e-01]
40.76483910961779


In [12]:
# Results: Coefficients
# Tag every coefficient by sign so the bars can be coloured by category
colors = ['Positive' if c > 0 else 'Negative' for c in model.coef_]

# Regression Metrics (evaluated on the held-out test rows)
R2 = r2_score(y_test, predicted)
mae = mean_absolute_error(y_test, predicted)
rmse = root_mean_squared_error(y_test, predicted)

# Barplot
# Each sign is pinned to a fixed colour. A bare colour sequence is handed out in order of
# first appearance, so the palette would flip whenever the first coefficient changes sign.
fig = px.bar(
    x=frame.columns, y=model.coef_, color=colors,
    color_discrete_map={'Negative': 'red', 'Positive': 'blue'},
    labels=dict(x='Feature', y='Linear Coefficient'),
    title='Coefficients of each feature'
)

# Axes, centred title and a metrics box anchored in paper coordinates (lower right)
fig.update_layout(
    xaxis_title='Feature',
    yaxis_title='Linear Coefficient',
    title_x=0.5,
    font=dict(size=20),
    annotations=[
        dict(
            text=f"Model Metrics:<br>RMSE: {rmse:.3f}<br>MAE: {mae:.3f}<br>R2: {R2:.3f}<br><br>Intercept: {model.intercept_:.2f}",
            x=0.97,
            y=0.1,
            xref='paper',
            yref='paper',
            showarrow=False,
            align='left',
            bgcolor='white',
            bordercolor='black',
            borderwidth=1,
            font=dict(size=17)
        )
    ]
)

# Show
fig.show()

### Regression Plots

In [13]:
#========== PLOT
# One subplot per feature: scatter of the feature against the target, plus the line
# implied by the fitted intercept and that feature's coefficient.

# Number of features to plot
num_cols = len(frame.columns)

# Grid layout: fixed number of columns, as many rows as needed (ceiling division)
n_cols = 3
n_rows = (num_cols + n_cols - 1) // n_cols

# Create the subplot figure
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    # Titles per subplot
    subplot_titles=[f" Partial Regression for Feature {i}" for i in frame.columns],
    # Spacing between rows / columns
    vertical_spacing=0.1,
    horizontal_spacing=0.1,
)

# Create a RegPlot per each variable, i.e. per column.
# The grid cell and the coefficient are looked up by POSITION (pos), not by column label,
# so the loop keeps working if the columns are ever renamed.
for pos, feature in enumerate(frame.columns):
    # 1-based grid coordinates of the current subplot
    row = pos // n_cols + 1
    col = pos % n_cols + 1
    coef = model.coef_[pos]

    # Scatter of the raw feature values against the target
    scatter = go.Scatter(
        x=frame[feature],
        y=target,
        mode='markers'
    )

    # Regression line sampled over the observed range of the feature.
    # NOTE(review): this is intercept + coef * x, i.e. the model's prediction with every
    # other feature held at zero, so the line will generally not pass through the point
    # cloud. Consider adding the other features' mean contribution if a partial-dependence
    # style line is what is wanted.
    x_range = np.linspace(frame[feature].min(), frame[feature].max(), 100)
    y_range = model.intercept_ + x_range * coef
    reg_line = go.Scatter(
        x=x_range,
        y=y_range,
        mode='lines',
        name='Regression Fit'
    )

    # Add both traces to the subplot
    fig.add_trace(scatter, row=row, col=col)
    fig.add_trace(reg_line, row=row, col=col)

    # Formula annotation inside the subplot.
    # Raw f-string so "\hat" reaches MathJax untouched; the doubled braces emit literal
    # LaTeX braces so multi-digit indices (10, 11) are subscripted as a whole.
    formula_str = rf"$\hat y_{{{feature}}} = {model.intercept_:.2f} + {coef:.2f} X_{{{feature}}}$"
    fig.add_annotation(
        text=formula_str,
        # "domain" refs make x/y fractions of this subplot's plotting area (0-1) instead
        # of data coordinates; row/col select which subplot's axes they refer to.
        xref="x domain",
        yref="y domain",
        x=0.05,
        y=0.95,
        xanchor='left',
        yanchor='top',
        showarrow=False,
        font=dict(size=12, color='black'),
        row=row,
        col=col
    )

    # Axis titles for this subplot
    fig.update_yaxes(title_text="Target", row=row, col=col)
    fig.update_xaxes(title_text=f"Feature {feature}", row=row, col=col)

# Update the overall layout of the figure
fig.update_layout(
    # 350 px per grid row / column
    height=n_rows * 350,
    width=n_cols * 350,
    # Legend is not required
    showlegend=False,
    # Overall title, centred at the top
    title=dict(
        text="Regressions for Dataset Features",
        x=0.5,
        y=0.99,
        xanchor='center',
        yanchor='top',
        font=dict(size=40)
    )
)

# Show the Final Figure
fig.show()

Interpretación: Es un modelo aceptable, aunque un poco simple. Cumple su cometido de estimar un rango razonable de precios, es decir, ofrece una buena aproximación. Seguramente ayudará a los vendedores en la búsqueda de precios adecuados. La visualización de cada variable podría ayudar a determinar si hay relaciones lineales, y un análisis más exhaustivo podría hallar variables correlacionadas.