# Multiple Regression Analysis Notebook

This notebook performs multiple regression analysis on fields in a dataset.

In [3]:
# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Step 1: Load the Datasets and join based on Student ID

In [4]:
# Replace 'your_dataset.csv' with the path to your dataset
file1_path = 'datasets/Student Engagement Level-Binary.csv'
file2_path = 'datasets/Student Performance Prediction-Multi.csv'
data1 = pd.read_csv(file1_path)
data2 = pd.read_csv(file2_path)

# Display the first few rows of the dataset
print("Dataset 1:")
display(data1.head())
print("\nDataset 2:")
display(data2.head())

join_field = 'Student ID'  # Specify the column used for joining

# Perform the join
merged_data = pd.merge(data1, data2, on=join_field, how='inner')

print("\nMerged Dataset:")
display(merged_data.head())

Dataset 1:


Unnamed: 0,Student ID,# Logins,# Content Reads,# Forum Reads,# Forum Posts,# Quiz Reviews before submission,Assignment 1 lateness indicator,Assignment 2 lateness indicator,Assignment 3 lateness indicator,Assignment 1 duration to submit (in hours),Assignment 2 duration to submit (in hours),Assignment 3 duration to submit (in hours),Average time to submit assignment (in hours),Engagement Level
0,student000000,143,344,58,0,3,0,0,0,178.166667,92.716667,116.166667,129.016667,H
1,student000001,70,342,0,0,4,0,0,0,294.033333,196.083333,217.75,235.955556,L
2,student000002,42,219,0,0,3,0,0,0,169.6,235.733333,260.333333,221.888889,L
3,student000003,92,271,2,0,6,0,0,0,341.15,245.9,271.216667,286.088889,L
4,student000004,116,379,0,0,1,0,0,0,325.5,236.283333,260.733333,274.172222,L



Dataset 2:


Unnamed: 0,Student ID,Quiz01 [10],Assignment01 [8],Midterm Exam [20],Assignment02 [12],Assignment03 [25],Final Exam [35],Course Grade,Total [100],Class
0,student000000,95,91,70,90,84,64,85,85,G
1,student000001,85,76,65,61,73,64,76,76,G
2,student000002,85,41,73,61,73,61,73,73,G
3,student000003,80,78,80,79,79,57,80,79,G
4,student000004,85,91,78,80,84,67,85,85,G



Merged Dataset:


Unnamed: 0,Student ID,# Logins,# Content Reads,# Forum Reads,# Forum Posts,# Quiz Reviews before submission,Assignment 1 lateness indicator,Assignment 2 lateness indicator,Assignment 3 lateness indicator,Assignment 1 duration to submit (in hours),...,Engagement Level,Quiz01 [10],Assignment01 [8],Midterm Exam [20],Assignment02 [12],Assignment03 [25],Final Exam [35],Course Grade,Total [100],Class
0,student000000,143,344,58,0,3,0,0,0,178.166667,...,H,95,91,70,90,84,64,85,85,G
1,student000001,70,342,0,0,4,0,0,0,294.033333,...,L,85,76,65,61,73,64,76,76,G
2,student000002,42,219,0,0,3,0,0,0,169.6,...,L,85,41,73,61,73,61,73,73,G
3,student000003,92,271,2,0,6,0,0,0,341.15,...,L,80,78,80,79,79,57,80,79,G
4,student000004,116,379,0,0,1,0,0,0,325.5,...,L,85,91,78,80,84,67,85,85,G


## Step 2: Specify Target and Predictor Variables

In [10]:
# Define the target and predictors
target = 'Course Grade'  # Replace with your target column
predictors = ["# Logins", "# Content Reads", "# Forum Reads", "# Forum Posts", "# Quiz Reviews before submission",
              "Assignment 1 duration to submit (in hours)", "Assignment 2 duration to submit (in hours)", "Assignment 3 duration to submit (in hours)", 
              "Average time to submit assignment (in hours)"]  # Replace with your predictor columns

# Ensure selected fields are in the dataset
if target not in merged_data.columns or not set(predictors).issubset(merged_data.columns):
    raise ValueError('Ensure the target and predictor fields are correctly specified and exist in the dataset.')

# Prepare data
X = merged_data[predictors]
y = merged_data[target]

## Step 3: Split the Data into Training and Testing Sets

In [11]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Step 4: Perform Multiple Regression Using Statsmodels

In [12]:
# Add constant for intercept
X_train_sm = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_sm).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           Course Grade   R-squared:                       0.285
Model:                            OLS   Adj. R-squared:                  0.268
Method:                 Least Squares   F-statistic:                     16.72
Date:                Sun, 19 Jan 2025   Prob (F-statistic):           3.37e-23
Time:                        22:56:10   Log-Likelihood:                -1431.6
No. Observations:                 388   AIC:                             2883.
Df Residuals:                     378   BIC:                             2923.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------

## Step 5: Perform Regression Using Scikit-learn

In [13]:
# Train the regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R² Score: {r2:.2f}')

Mean Squared Error (MSE): 97.15
R² Score: 0.08


## Step 6: Display Coefficients

In [14]:
# Display coefficients
coefficients = pd.DataFrame({
    'Feature': predictors,
    'Coefficient': lr_model.coef_
})
coefficients.loc[-1] = ['Intercept', lr_model.intercept_]  # Add intercept
coefficients.index = coefficients.index + 1
coefficients.sort_index(inplace=True)
print(coefficients)

                                        Feature   Coefficient
0                                     Intercept  8.043924e+01
1                                      # Logins  4.474957e-02
2                               # Content Reads  1.240662e-02
3                                 # Forum Reads -6.186903e-02
4                                 # Forum Posts  1.452056e+00
5              # Quiz Reviews before submission  5.624687e-01
6    Assignment 1 duration to submit (in hours) -5.129948e+06
7    Assignment 2 duration to submit (in hours) -5.129948e+06
8    Assignment 3 duration to submit (in hours) -5.129948e+06
9  Average time to submit assignment (in hours)  1.538984e+07
