# Multiple Regression Analysis Notebook

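This notebook fits a multiple linear regression that predicts a student's GPA from daily time-use fields (study, extracurricular, sleep, social, and physical-activity hours) in the student lifestyle dataset. The model is fitted twice: once with statsmodels, to obtain the full inferential summary, and once with scikit-learn, to evaluate predictions on a held-out test set.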

In [1]:
# Import necessary libraries
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Step 1: Load the Dataset

In [2]:
# Path to the student lifestyle CSV; it is a relative path, so it is resolved
# against the notebook's working directory. Point it at another file to
# analyse a different dataset.
file_path = 'datasets/student_lifestyle_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
display(data.head())

Unnamed: 0,Student_ID,Study_Hours_Per_Day,Extracurricular_Hours_Per_Day,Sleep_Hours_Per_Day,Social_Hours_Per_Day,Physical_Activity_Hours_Per_Day,GPA,Stress_Level
0,1,6.9,3.8,8.7,2.8,1.8,2.99,Moderate
1,2,5.3,3.5,8.0,4.2,3.0,2.75,Low
2,3,5.1,3.9,9.2,1.2,4.6,2.67,Low
3,4,6.5,2.1,7.2,1.7,6.5,2.88,Moderate
4,5,8.1,0.6,6.5,2.2,6.6,3.51,High


## Step 2: Specify Target and Predictor Variables

In [13]:
# Define the target and predictors
target = 'GPA'  # Replace with your target column
predictors = [
    'Study_Hours_Per_Day',
    'Extracurricular_Hours_Per_Day',
    'Sleep_Hours_Per_Day',
    'Social_Hours_Per_Day',
    'Physical_Activity_Hours_Per_Day',
]  # Replace with your predictor columns

# Ensure selected fields are in the dataset, and name the ones that are not so
# a typo in a column name is easy to spot.
missing_columns = [column for column in [target, *predictors] if column not in data.columns]
if missing_columns:
    raise ValueError(
        'Ensure the target and predictor fields are correctly specified and exist in the dataset. '
        f'Missing columns: {missing_columns}'
    )

# Prepare data
X = data[predictors]
y = data[target]

# Check that the predictors plus an intercept column are linearly independent.
# In the rows previewed above the five daily-hour columns add up to 24.0, which
# makes them collinear with the intercept; predictions and R² are unaffected,
# but the individual coefficients are then not uniquely determined.
# The check is skipped for empty or non-finite data, where the rank is not
# meaningful.
design_matrix = np.column_stack([np.ones(len(X)), X.to_numpy(dtype=float)])
is_rank_deficient = (
    len(design_matrix) > 0
    and np.isfinite(design_matrix).all()
    and np.linalg.matrix_rank(design_matrix) < design_matrix.shape[1]
)
if is_rank_deficient:
    warnings.warn(
        'The predictors are perfectly collinear once an intercept is included '
        '(the design matrix is rank-deficient). Predictions and R² are unaffected, '
        'but individual coefficients are not uniquely determined; drop one '
        'predictor to obtain interpretable coefficients.'
    )

## Step 3: Split the Data into Training and Testing Sets

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Step 4: Perform Multiple Regression Using Statsmodels

In [15]:
# Append a constant column to the training predictors so the fit includes an
# intercept term.
X_train_sm = sm.add_constant(X_train)

# Fit ordinary least squares on the training split and print the summary table.
model = sm.OLS(endog=y_train, exog=X_train_sm).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    GPA   R-squared:                       0.538
Model:                            OLS   Adj. R-squared:                  0.537
Method:                 Least Squares   F-statistic:                     465.2
Date:                Mon, 20 Jan 2025   Prob (F-statistic):          6.77e-266
Time:                        00:34:04   Log-Likelihood:                 291.65
No. Observations:                1600   AIC:                            -573.3
Df Residuals:                    1595   BIC:                            -546.4
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

## Step 5: Perform Regression Using Scikit-learn

In [16]:
# Fit a linear regression on the training split (fit() returns the estimator).
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = lr_model.predict(X_test)

# Score the predictions against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R² Score: {r2:.2f}')

Mean Squared Error (MSE): 0.04
R² Score: 0.55


## Step 6: Display Coefficients

In [17]:
# Display coefficients, with the intercept as the first row.
# The rows are listed in their final order up front, so there is no need to
# enlarge the frame with .loc[-1], shift the index and sort it in place just to
# move the intercept to the top.
coefficients = pd.DataFrame({
    'Feature': ['Intercept', *predictors],
    'Coefficient': [lr_model.intercept_, *lr_model.coef_],
})
print(coefficients)

                           Feature  Coefficient
0                        Intercept     2.685954
1              Study_Hours_Per_Day     0.125606
2    Extracurricular_Hours_Per_Day    -0.038285
3              Sleep_Hours_Per_Day    -0.031624
4             Social_Hours_Per_Day    -0.027592
5  Physical_Activity_Hours_Per_Day    -0.028104
