In [1]:
import parse_data as ps
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd
# Load the dataset through the project's parse_data helper.
df = ps.get_data()

# Columns used later for modelling; tally nulls in each before going further.
checked_columns = ['average_sleep', 'academic_pressure', 'anxiety', 'isolation', 'depression']
null_counts = df[checked_columns].isnull().sum()

print("--- Missing Values Check ---")
print(null_counts)
print(f"\nFinal dataset shape: {df.shape}")

--- Missing Values Check ---
average_sleep        0
academic_pressure    0
anxiety              0
isolation            0
depression           0
dtype: int64

Final dataset shape: (87, 21)


## Data Preparation
We select **Academic Pressure, Average Sleep, Anxiety, and Isolation** as independent variables ($X$) to predict the Depression score ($y$). The data is split into a training set (80%) and a testing set (20%) to evaluate the model's generalization performance.

In [2]:
# Predictor columns (X) and the regression target (y).
features = ['academic_pressure', 'average_sleep', 'anxiety', 'isolation']
X, y = df[features], df['depression']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 69
Testing set size: 18


In [3]:
# Fit a linear regression on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# score() gives R^2 on whichever split it is handed; comparing the two
# shows how well the fit carries over to unseen rows.
train_score, test_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print("--- Model Performance (R^2 Score) ---")
print(f"Train R^2: {train_score:.3f}")
print(f"Test R^2: {test_score:.3f}")

--- Model Performance (R^2 Score) ---
Train R^2: 0.791
Test R^2: 0.547


## Interpretation of Results
The model achieved a Train $R^2$ of 0.791 and a Test $R^2$ of 0.547.

Insight: A Test $R^2$ that is significantly lower than the Train $R^2$ suggests overfitting, although with only 18 held-out samples the test estimate is itself noisy. In our improved analysis, we aim to bridge this gap by using techniques like PCA and Cross-Validation to ensure the model captures more generalized psychological patterns rather than noise in the small dataset ($N=87$).