# Multiple Regression Analysis Notebook

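This notebook fits a multiple linear regression of a chosen target column on a set of predictor columns, first with statsmodels (for the full OLS summary) and then with scikit-learn (for held-out test metrics and the fitted coefficients).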

In [1]:
# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Step 1: Load the Dataset

In [2]:
# Path to the Excel workbook holding the dataset; replace with the path to your own file
file_path = 'datasets/Student_Performance_Data(SPD24).xlsx'
data = pd.read_excel(file_path)

# Display the first few rows of the dataset
display(data.head())

Unnamed: 0,Student ID,Gender,Age,Grade Level,Attendance Rate,Study Hours,Parental Education Level,Parental Involvement,Extracurricular Activities,Socioeconomic Status,...,Bullying Incidents,Special Education Services,Counseling Services,Learning Disabilities,Behavioral Issues,Attendance of Tutoring Sessions,School Climate,Parental Employment Status,Household Size,Performance Score
0,1,Male,15,12,80.4878,2.764496,Bachelor's,High,Yes,High,...,1,No,Yes,Yes,Yes,No,Negative,Unemployed,3,Low
1,2,Female,17,12,96.242678,4.534785,Bachelor's,High,Yes,Low,...,3,Yes,Yes,No,Yes,No,Negative,Employed,3,Medium
2,3,Male,14,9,84.649681,2.008148,Bachelor's,Medium,Yes,Low,...,0,Yes,No,Yes,No,No,Neutral,Employed,3,High
3,4,Male,14,10,86.158599,3.698293,High School,High,No,Low,...,3,Yes,No,No,Yes,No,Positive,Employed,4,Medium
4,5,Male,15,10,88.487638,3.408604,Associate,Low,No,Middle,...,1,Yes,Yes,No,No,No,Positive,Employed,6,Low


## Step 2: Specify Target and Predictor Variables

In [3]:
# Define the regression target and predictor columns (edit these to match your dataset)
target = 'Reading Proficiency'
predictors = [
    'Attendance Rate',
    'Study Hours',
    'Previous Academic Performance',
    'Hours of Sleep',
    'Homework Completion Rate',
]

# Fail fast and name the absent columns, so a typo in a column name is easy to spot
missing_columns = [col for col in [target, *predictors] if col not in data.columns]
if missing_columns:
    raise ValueError(
        'Ensure the target and predictor fields are correctly specified and exist in the dataset. '
        f'Missing columns: {missing_columns}'
    )

# Prepare data: X holds the predictor columns, y holds the target column
X = data[predictors]
y = data[target]

## Step 3: Split the Data into Training and Testing Sets

In [4]:
# Hold out a fixed fraction of the rows for testing; the integer seed keeps the split repeatable
test_fraction = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=split_seed
)

## Step 4: Perform Multiple Regression Using Statsmodels

In [5]:
# Append a constant column to the training predictors so the fit includes an intercept term
X_train_sm = sm.add_constant(X_train)

# Specify the OLS model on the training split, then fit it
ols_spec = sm.OLS(y_train, X_train_sm)
model = ols_spec.fit()

# Print the full regression summary table
print(model.summary())

                             OLS Regression Results                            
Dep. Variable:     Reading Proficiency   R-squared:                       0.000
Model:                             OLS   Adj. R-squared:                 -0.000
Method:                  Least Squares   F-statistic:                    0.9524
Date:                 Sun, 19 Jan 2025   Prob (F-statistic):              0.446
Time:                         23:51:05   Log-Likelihood:            -3.0288e+05
No. Observations:                78400   AIC:                         6.058e+05
Df Residuals:                    78394   BIC:                         6.058e+05
Df Model:                            5                                         
Covariance Type:             nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
cons

## Step 5: Perform Multiple Regression Using Scikit-learn

In [6]:
# Fit a linear regression on the training split (fit() returns the fitted estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows
y_pred = lr_model.predict(X_test)

# Score the predictions against the held-out targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line
print(
    f'Mean Squared Error (MSE): {mse:.2f}',
    f'R² Score: {r2:.2f}',
    sep='\n',
)

Mean Squared Error (MSE): 134.71
R² Score: -0.00


## Step 6: Display Coefficients

In [7]:
# Tabulate the fitted scikit-learn parameters: the intercept first, then one row per
# predictor in the order the predictors were passed to the model.
# The frame is built in its final order in one step, so no index shifting, re-sorting
# or in-place mutation is needed.
coefficients = pd.DataFrame({
    'Feature': ['Intercept', *predictors],
    'Coefficient': [lr_model.intercept_, *lr_model.coef_],
})
print(coefficients)

                         Feature  Coefficient
0                      Intercept    79.475150
1                Attendance Rate     0.008235
2                    Study Hours     0.041909
3  Previous Academic Performance    -0.003118
4                 Hours of Sleep    -0.024636
5       Homework Completion Rate     0.000639
