In [2]:
import pandas as pd
import numpy as np

# 1. Read the raw table from the Excel workbook
df = pd.read_excel("Portuguese.xlsx")

# 2. Separate the prediction target from the predictors
target_col = "G3"
# Every column whose label differs from the target is used as a feature.
is_feature = df.columns != target_col
feature_cols = df.columns[is_feature].tolist()

X, y = df[feature_cols], df[target_col]

# 3. Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Partition the feature columns by dtype: numeric columns are scaled,
# all remaining columns are one-hot encoded.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

# Standardise the numeric columns with StandardScaler's default settings.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# One-hot encode the non-numeric columns; categories that were not seen
# during fit are ignored at transform time rather than raising an error.
onehot = OneHotEncoder(handle_unknown="ignore")
categorical_transformer = Pipeline(steps=[("onehot", onehot)])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# 4. Train both models
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Pipeline does not copy the steps it is given, so handing the same
# ColumnTransformer object to both pipelines would make them share a single
# fitted transformer: refitting one pipeline on different data would silently
# change the preprocessing applied by the other.
# rf_pipeline keeps the original object, so `preprocessor` is still fitted on
# the training split once this cell has run; gb_pipeline gets an independent,
# unfitted copy with identical parameters.
rf_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(random_state=42)),
])

gb_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", GradientBoostingRegressor(random_state=42)),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each pipeline fits its own preprocessing step and model on the training split only.
rf_pipeline.fit(X_train, y_train)
gb_pipeline.fit(X_train, y_train)

# 5. Evaluate
rf_preds = rf_pipeline.predict(X_test)
gb_preds = gb_pipeline.predict(X_test)

# RMSE on the held-out split (square root of the mean squared error),
# expressed in the same units as the target column.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_preds))

print("RandomForest RMSE:", rf_rmse)
print("GradientBoosting RMSE:", gb_rmse)


RandomForest RMSE: 1.3677160992853286
GradientBoosting RMSE: 1.3925334488877275
