### Implementing linear regression in numpy.
Linear regression implemented using:
- closed form solution
- gradient descent
#### First, importing the data.

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('data/StudentPerformanceFactors.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

#### Separating features from target.

In [2]:
X = data.drop('Exam_Score', axis=1)
Y = data['Exam_Score']

#### Defining pipelines.

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

#### Fitting the model and predicting data.

In [4]:
from linear_regression_models import LinearRegressionClosedForm, LinearRegressionGradientDescent
from sklearn.linear_model import LinearRegression

sklearn_LR = LinearRegression()
closed_form_LR = LinearRegressionClosedForm()
gradient_descent_LR = LinearRegressionGradientDescent()

sklearn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', sklearn_LR)
])

closed_form_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', closed_form_LR)
])

gradient_descent_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', gradient_descent_LR)
])

sklearn_pipeline.fit(X_train, Y_train)
closed_form_pipeline.fit(X_train, Y_train)
gradient_descent_pipeline.fit(X_train, Y_train)

y_skl = sklearn_pipeline.predict(X_test)
y_cf = closed_form_pipeline.predict(X_test)
y_gd = gradient_descent_pipeline.predict(X_test)



#### Evaluating and comparing the models.

In [None]:
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error

numerical_features = list(X.select_dtypes(['int64', 'float64']).columns)

categorical_features = list(X.select_dtypes(['object', 'category']).columns)

# creating feature names
onehotencoded_features = []
for feat in categorical_features:
    values = data[feat].dropna().unique()
    onehotencoded_features.extend([f"{feat} - {val}" for val in values])

all_features = []
all_features.extend(numerical_features)
all_features.extend(onehotencoded_features)

metrics = {
    'R2': [r2_score(Y_test, y_skl), r2_score(Y_test, y_cf), r2_score(Y_test, y_gd)],
    'RMSE': [root_mean_squared_error(Y_test, y_skl), root_mean_squared_error(Y_test, y_cf), root_mean_squared_error(Y_test, y_gd)],
    'MAE': [mean_absolute_error(Y_test, y_skl), mean_absolute_error(Y_test, y_cf), mean_absolute_error(Y_test, y_gd)]
}

df_coefficients = pd.DataFrame({
    'Feature': all_features,
    'Sklearn': sklearn_LR.coef_,
    'Closed Form': closed_form_LR.coef_,
    'Gradient Descent': gradient_descent_LR.weights
})

df_intercepts = pd.DataFrame({
    'Feature': ['Intercept'],
    'Sklearn': [sklearn_LR.intercept_],
    'Closed Form': [closed_form_LR.intercept_],
    'Gradient Descent': [gradient_descent_LR.bias]
})

df_results = pd.concat([df_coefficients, df_intercepts], ignore_index=True)
df_metrics = pd.DataFrame(metrics, index=['Sklearn', 'Closed Form', 'Gradient Descent'])

df_results = df_results.round(6)
df_metrics = df_metrics.round(6)

styled_df = (
    df_results.style
    .set_properties(**{
        'text-align': 'left',
        'white-space': 'normal',
        'min-width': '200px'
    })
    .set_table_styles([{
        'selector': 'th',
        'props': [('text-align', 'left')]
    }])
)

In [20]:
print("Comparing coefficients:")
display(styled_df)

print("Evaluating models:")
display(df_metrics)

Comparing coefficients:


Unnamed: 0,Feature,Sklearn,Closed Form,Gradient Descent
0,Hours_Studied,1.779275,1.779275,1.781642
1,Attendance,2.287862,2.287862,2.286426
2,Sleep_Hours,0.010191,0.010191,0.00098
3,Previous_Scores,0.739097,0.739097,0.743177
4,Tutoring_Sessions,0.617862,0.617862,0.613288
5,Physical_Activity,0.214333,0.214333,0.21414
6,Parental_Involvement - Low,1.000491,4.580362,4.578402
7,Parental_Involvement - Medium,-0.930803,2.649069,2.647051
8,Parental_Involvement - High,-0.069688,3.510183,3.507537
9,Access_to_Resources - High,0.996111,4.575982,4.574594


Evaluating models:


Unnamed: 0,R2,RMSE,MAE
Sklearn,0.820601,1.531815,0.427189
Closed Form,0.820601,1.531815,0.427189
Gradient Descent,0.821107,1.529655,0.412691
