### Implementing linear regression in numpy.
Linear regression is implemented in two ways:
- closed form solution
- gradient descent
#### First, importing the data.

In [1]:
import pandas as pd

# Read the raw dataset into a DataFrame; the path is relative to the
# directory the notebook kernel is started from.
data = pd.read_csv(filepath_or_buffer='data/StudentPerformanceFactors.csv')

# Print the schema summary (column names, non-null counts, dtypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

#### Separating features from target.

In [2]:
# Feature matrix: every column except the regression target.
X = data.drop(columns=['Exam_Score'])
# Target vector: the exam score to be predicted.
Y = data.loc[:, 'Exam_Score']

#### Defining pipelines.

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Fixed seed so the train/test split -- and therefore every coefficient and
# metric computed from it -- is reproducible across kernel restarts.
SPLIT_SEED = 42

# Hold out 25% of the rows for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.75, random_state=SPLIT_SEED
)

# Route columns to the numeric or categorical branch based on their dtype.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros
# instead of raising at predict time.
# NOTE(review): no level is dropped, so each feature's dummy columns sum to 1
# and are collinear with a model intercept (if the regressors fit one). The
# per-level coefficients are then not uniquely determined and may differ
# between solvers by a constant offset even when predictions agree.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each branch to its own column subset and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

#### Fitting the model and predicting data.
Model implementations are imported from the `src.linear_regression.models` module.

In [10]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

from src.linear_regression.models import LinearRegressionClosedForm, LinearRegressionGradientDescent

# One regressor per approach. The references are kept so the fitted
# estimators can be inspected after the pipelines have been trained.
sklearn_LR = LinearRegression()
closed_form_LR = LinearRegressionClosedForm()
gradient_descent_LR = LinearRegressionGradientDescent()


def _build_pipeline(regressor):
    """Return a preprocessing + regression pipeline for ``regressor``.

    The preprocessor is cloned so that every pipeline owns an independent,
    unfitted copy. Pipeline does not copy its steps, so sharing one
    ColumnTransformer instance would let each ``fit`` call overwrite the
    fitted state that previously trained pipelines still rely on.
    The regressor itself is NOT cloned: it is fitted in place.
    """
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', regressor)
    ])


sklearn_pipeline = _build_pipeline(sklearn_LR)
closed_form_pipeline = _build_pipeline(closed_form_LR)
gradient_descent_pipeline = _build_pipeline(gradient_descent_LR)

# Train every pipeline on the same training split.
sklearn_pipeline.fit(X_train, Y_train)
closed_form_pipeline.fit(X_train, Y_train)
gradient_descent_pipeline.fit(X_train, Y_train)

# Predict on the held-out split for the comparison below.
y_skl = sklearn_pipeline.predict(X_test)
y_cf = closed_form_pipeline.predict(X_test)
y_gd = gradient_descent_pipeline.predict(X_test)

#### Evaluating and comparing the models.

In [11]:
from src.linear_regression.model_evaluation import metrics_table, weights_table

# Column/row labels shared by both comparison tables, in the same order as
# the estimators and prediction arrays passed alongside them.
model_labels = ['Sklearn', 'Closed Form', 'Gradient Descent']

# Per-feature coefficients of each fitted regressor, side by side.
# NOTE(review): the rendered table lists Parental_Involvement levels as
# Low, Medium, High, whereas OneHotEncoder emits categories in sorted order
# (High, Low, Medium) by default -- confirm that weights_table pairs each
# label with the right coefficient.
df_weights = weights_table(
    data,
    X,
    [sklearn_LR, closed_form_LR, gradient_descent_LR],
    model_labels,
)

# Regression metrics of each model's predictions against the held-out targets.
df_metrics = metrics_table(
    Y_test,
    [y_skl, y_cf, y_gd],
    model_labels,
)

print("Comparing coefficients:")
display(df_weights)

print("Evaluating models:")
display(df_metrics)

Comparing coefficients:


Unnamed: 0,Feature,Sklearn,Closed Form,Gradient Descent
0,Hours_Studied,1.754141,1.754141,1.752173
1,Attendance,2.320366,2.320366,2.314581
2,Sleep_Hours,-0.021305,-0.021305,-0.02303
3,Previous_Scores,0.680723,0.680723,0.676496
4,Tutoring_Sessions,0.590992,0.590992,0.590988
5,Physical_Activity,0.181076,0.181076,0.183625
6,Parental_Involvement - Low,0.984506,4.556374,4.557426
7,Parental_Involvement - Medium,-0.937754,2.634115,2.633818
8,Parental_Involvement - High,-0.046752,3.525116,3.527264
9,Access_to_Resources - High,1.025728,4.597596,4.600822


Evaluating models:


Unnamed: 0,R²,MSE,RMSE,MAE
Sklearn,0.653645,5.597457,2.365895,0.525349
Closed Form,0.653645,5.597457,2.365895,0.525349
Gradient Descent,0.653763,5.595542,2.36549,0.531324
