In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('student.csv')
print(df.head())
print(df.shape)

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        6   5   6   6  
1      5        3      3     1     1      3        4   5   5   6  
2      4        3      2     2     3      3       10   7   8  10  
3      3        2      2     1     1      5        2  15  14  15  
4      4        3      2     1     2      5        4   6  10  10  

[5 rows x 33 columns]
(395, 33)


In [18]:
# CELL 2 — ADAPTED FOR YOUR DOWNLOADED DATASET (Portuguese Student Performance)
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('student.csv')
print("Dataset loaded! Shape:", df.shape)

# Drop missing (none here, but safe)
df = df.dropna()

# Create target: GPA from G1, G2, G3 (0-20 scale) → scale to 0-4.0
df['GPA'] = (df['G1'] + df['G2'] + df['G3']) / 3 / 5  # /5 to make 0-4.0

# Create features from your exact columns
df['hours_studied'] = df['studytime']  # 1=low, 4=high → treat as hours proxy
df['attendance'] = 100 - (df['absences'] / 20 * 100)  # Inverse absences (20 max) to %
df['parent_education'] = (df['Medu'] + df['Fedu']) / 2  # Avg parent edu (0-4 → 0-2 scale)
df['extracurricular'] = df['activities'].map({'yes': 1, 'no': 0})  # Binary
df['sleep_hours'] = 10 - df['goout']  # Less going out = more sleep (proxy, 1-5 → hours)
df['motivation_level'] = df['health'].map({1:0, 2:0, 3:1, 4:2, 5:2})  # Health as motivation proxy
df['parental_involvement'] = (5 - df['famrel']) * -1 + 2  # Family rel (1-5) → involvement (higher=better)

# Fill any NaNs from mapping
df[['extracurricular', 'motivation_level', 'parental_involvement']] = df[['extracurricular', 'motivation_level', 'parental_involvement']].fillna(0)

# Final features (all exist now)
features = ['hours_studied', 'attendance', 'parent_education', 'extracurricular',
            'sleep_hours', 'motivation_level', 'parental_involvement']

X = df[features].copy()
y = df['GPA']

print("\nSUCCESS — DATA READY WITH YOUR COLUMNS!")
print("X shape:", X.shape)
print("GPA range:", y.min().round(2), "to", y.max().round(2))
print("\nSample features:")
print(X.head())

Dataset loaded! Shape: (395, 33)

SUCCESS — DATA READY WITH YOUR COLUMNS!
X shape: (395, 7)
GPA range: 0.27 to 3.87

Sample features:
   hours_studied  attendance  parent_education  extracurricular  sleep_hours  \
0              2        70.0               4.0                0            6   
1              2        80.0               1.0                0            7   
2              2        50.0               1.0                0            8   
3              3        90.0               3.0                1            8   
4              2        80.0               3.0                0            8   

   motivation_level  parental_involvement  
0                 1                     1  
1                 1                     2  
2                 1                     1  
3                 2                     0  
4                 2                     1  


In [22]:
# CELL 3 – 100% WORKING FOR ALL SCikit-LEARN VERSIONS (2025)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# RMSE using np.sqrt → works on EVERY version
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

lr_r2 = r2_score(y_test, lr_pred)
rf_r2 = r2_score(y_test, rf_pred)

# Print results
print("=== LINEAR REGRESSION ===")
print(f"RMSE: {lr_rmse:.3f} | R²: {lr_r2:.3f}")

print("\n=== RANDOM FOREST (BEST MODEL) ===")
print(f"RMSE: {rf_rmse:.3f} | R²: {rf_r2:.3f}")

# Save model
joblib.dump(rf, 'rf_model.pkl')
print("\nModel saved → rf_model.pkl")

=== LINEAR REGRESSION ===
RMSE: 0.739 | R²: 0.079

=== RANDOM FOREST (BEST MODEL) ===
RMSE: 0.789 | R²: -0.051

Model saved → rf_model.pkl


In [23]:
# Save Random Forest (best one)
joblib.dump(rf, 'rf_model.pkl')
print("Model saved as rf_model.pkl")

Model saved as rf_model.pkl
