In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor  
import joblib
import boto3
import os


In [5]:
!pip install openpyxl

df = pd.read_excel("Portuguese.xlsx")  
print(df.shape)
print(df.head())
print(df.columns)

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
(651, 33)
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3     

In [6]:
# Fail fast if the loaded frame does not contain the target column.
if "G3" not in df.columns:
    raise AssertionError("Dataset must have G3 as target")

In [8]:
# Train and evaluate a Gradient Boosting regressor that predicts the target
# column (G3) from every other column of `df`.
#
# Every sklearn / numpy name used below is imported in the notebook's first
# cell, so nothing is re-imported here.

RANDOM_STATE = 42  # single seed shared by the train/test split and the model
TEST_SIZE = 0.2    # fraction of rows held out for evaluation

target_col = "G3"
feature_cols = [c for c in df.columns if c != target_col]

X = df[feature_cols]
y = df[target_col]

# Route columns by dtype: numeric columns are standardized, all remaining
# columns are one-hot encoded.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# handle_unknown="ignore": a category that was not present when the encoder
# was fitted does not raise at transform time.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

gb_model = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Preprocessing lives inside the pipeline, so it is fitted on the training
# rows only and travels with the model when the pipeline is serialized.
gb_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", gb_model),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

gb_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows; RMSE is reported in the same units as G3.
gb_preds = gb_pipeline.predict(X_test)

gb_mse = mean_squared_error(y_test, gb_preds)
gb_rmse = np.sqrt(gb_mse)

print(f"Gradient Boosting RMSE on G3: {gb_rmse:.3f}")


Gradient Boosting RMSE on G3: 1.393


In [9]:
# Serialize the fitted pipeline (preprocessing + Gradient Boosting model)
# to a file in the current working directory.
local_model_filename = "student_g3_gb_predict.pkl"

joblib.dump(value=gb_pipeline, filename=local_model_filename)

print("Saved model to:", local_model_filename)


Saved model to: student_g3_gb_predict.pkl


In [11]:
# Upload the serialized pipeline file to S3.
# Destination bucket and object key for the model artifact.
bucket_name = "cloudprojectmodel"
s3_key_gb = "model/student_g3_gb_predict.pkl"

# No credentials are passed explicitly to the client here.
s3 = boto3.client("s3")
s3.upload_file(
    Filename=local_model_filename,
    Bucket=bucket_name,
    Key=s3_key_gb,
)

print(f"Uploaded to s3://{bucket_name}/{s3_key_gb}")


Uploaded to s3://cloudprojectmodel/model/student_g3_gb_predict.pkl
