# Multiple Regression Analysis Notebook

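This notebook fits a multiple linear regression that predicts `grade` from the activity fields `nevents`, `ndays_act`, `nplay_video`, `nchapters`, and `nforum_posts` in `datasets/Courses.csv`. The model is fitted twice, once with statsmodels (for the full statistical summary) and once with scikit-learn (for held-out evaluation and the coefficient table).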

In [1]:
# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Step 1: Load the file and clean the data

In [21]:
# Path to the CSV that holds the course records; point this at another file
# to analyse a different dataset.
file_path = 'datasets/Courses.csv'
data = pd.read_csv(file_path)

# Keep only the regression target, the candidate predictors, and the flag
# used for row filtering.
selected_fields = ['grade', 'nevents', 'ndays_act', 'nplay_video', 'nchapters', 'nforum_posts', 'incomplete_flag']
selected_data = data[selected_fields]

# Coerce to numeric *before* filling: errors='coerce' turns unparseable
# entries into NaN, and filling afterwards guarantees that no NaN (original
# or produced by coercion) is left to reach the regression models.
ft_data = selected_data.apply(pd.to_numeric, errors='coerce').fillna(0)

# Both conditions are evaluated on ft_data itself, so the boolean mask always
# shares the index of the frame it filters. Building the mask from the
# unfiltered selected_data makes pandas reindex the mask and emit a
# "Boolean Series key will be reindexed" UserWarning.
ft_data = ft_data[(ft_data['grade'] > 0) & (ft_data['incomplete_flag'] != 1)]
ft_data.head()

  ft_data = ft_data[selected_data['incomplete_flag'] != 1]


Unnamed: 0,grade,nevents,ndays_act,nplay_video,nchapters,nforum_posts,incomplete_flag
12,0.07,175.0,9.0,0.0,7.0,0,0.0
29,0.05,285.0,8.0,0.0,4.0,0,0.0
90,0.13,2796.0,23.0,891.0,6.0,0,0.0
108,0.35,1068.0,14.0,103.0,3.0,0,0.0
118,1.0,439.0,47.0,0.0,11.0,0,0.0


## Step 2: Specify Target and Predictor Variables

In [22]:
# Column to predict (replace with your target column)
target = 'grade'
# Columns used to explain the target (replace with your predictor columns)
predictors = [
    'nevents',
    'ndays_act',
    'nplay_video',
    'nchapters',
    'nforum_posts',
]

# Fail early if the target or any predictor is absent from the cleaned frame
if not {target, *predictors}.issubset(ft_data.columns):
    raise ValueError('Ensure the target and predictor fields are correctly specified and exist in the dataset.')

# Design matrix (X) and response vector (y)
X, y = ft_data[predictors], ft_data[target]

## Step 3: Split the Data into Training and Testing Sets

In [23]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Step 4: Perform Multiple Regression Using Statsmodels

In [24]:
# Prepend the constant column that acts as the intercept term
X_train_sm = sm.add_constant(X_train)

# Fit ordinary least squares on the training split and print the full report
model = sm.OLS(endog=y_train, exog=X_train_sm).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  grade   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                 1.814e+04
Date:                Sat, 01 Feb 2025   Prob (F-statistic):               0.00
Time:                        01:44:45   Log-Likelihood:                 6754.6
No. Observations:               59154   AIC:                        -1.350e+04
Df Residuals:                   59148   BIC:                        -1.344e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.0467      0.001    -33.204   

## Step 5: Perform Regression Using Scikit-learn

In [25]:
# Fit the linear model on the training split (fit() returns the estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = lr_model.predict(X_test)

# Compare the predictions with the held-out target values
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R² Score: {r2:.2f}')

Mean Squared Error (MSE): 0.05
R² Score: 0.59


## Step 6: Display Coefficients

In [26]:
# Build the coefficient table in one step, with the intercept as the first
# row followed by one row per predictor (index runs 0..len(predictors)).
coefficients = pd.DataFrame({
    'Feature': ['Intercept', *predictors],
    'Coefficient': [lr_model.intercept_, *lr_model.coef_],
})
print(coefficients)

        Feature  Coefficient
0     Intercept    -0.046681
1       nevents     0.000015
2     ndays_act     0.004737
3   nplay_video    -0.000032
4     nchapters     0.022546
5  nforum_posts    -0.019956
