In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

from preprocess import preprocessor, feature_selector  # your pre-built transformer

# 1. Load data
df = pd.read_csv("train.csv")
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

# 2. Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [1]:
!pip install --upgrade scikit-learn --quiet

In [4]:
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Example: Linear Regression Pipeline
linreg_pipeline = Pipeline([
    ('feature_select', feature_selector),
    ('preprocess', preprocessor),
    ('model', LinearRegression())
])

# Fit and evaluate
linreg_pipeline.fit(X_train, y_train)
print(f"Train R^2: {linreg_pipeline.score(X_train, y_train)}")

Train R^2: 0.768323532388255


In [5]:
# Example: Linear Regression Pipeline
xgb_pipeline = Pipeline([
    ('feature_select', feature_selector),
    ('preprocess', preprocessor),
    ('model', XGBRegressor())
])

# Fit and evaluate
xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)
print(f"Train R^2: {xgb_pipeline.score(X_train, y_train)}")

Train R^2: 0.9361377954483032


In [6]:
print("Test R²:", xgb_pipeline.score(X_test, y_test))

Test R²: 0.8149953484535217


In [7]:
from sklearn.model_selection import GridSearchCV

param_grid_xgb = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2]
}

grid_search_xgb = GridSearchCV(
    xgb_pipeline,             # your full pipeline with preprocessing
    param_grid_xgb,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

grid_search_xgb.fit(X_train, y_train)

# Best pipeline
best_pipeline = grid_search_xgb.best_estimator_
y_pred = best_pipeline.predict(X_test)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [8]:
print("Best parameters:", grid_search_xgb.best_params_)

Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}


In [9]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

print("Best parameters:", grid_search_xgb.best_params_)

print("Test R²:", r2_score(y_test, y_pred))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}
Test R²: 0.8387042880058289
Test RMSE: 35173.72473878762


In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Best CV score (already neg RMSE)
print("Best CV neg RMSE:", grid_search_xgb.best_score_)

# Predict using best model
y_pred = grid_search_xgb.predict(X_test)

# Calculate RMSE
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE:", test_rmse)

Best CV neg RMSE: -35037.546484375
Test RMSE: 35173.72473878762


In [11]:
# Best cross-validated score (negative RMSE in this case)
print("Best CV neg RMSE:", grid_search_xgb.best_score_)

# Test R² using the best pipeline
print("Test R²:", grid_search_xgb.score(X_test, y_test))

Best CV neg RMSE: -35037.546484375
Test R²: -35173.7265625


In [12]:
import joblib
best_pipeline = grid_search_xgb.best_estimator_
joblib.dump(best_pipeline, "xgb_pipeline.joblib")

['xgb_pipeline.joblib']

In [13]:
import boto3

s3 = boto3.client('s3')
s3.upload_file("xgb_pipeline.joblib", "kaggle-housing-pipeline-data", "models/xgb_pipeline.joblib")


In [16]:
import sagemaker

# Get compatible Scikit-learn image URI
image_uri = sagemaker.image_uris.retrieve(
    framework='sklearn',
    region=sagemaker.Session().boto_region_name,
    version='1.2-1'  # <-- MUST match your scikit-learn version
)

print(image_uri)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3
