# Gradient boosting using sklearn (GradientBoostingRegressor, not the XGBoost library)

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd

In [4]:
# Load the training data; the path is relative to the working directory.
df_train = pd.read_csv(filepath_or_buffer="data/train.csv")

In [5]:
# Columns retained from the raw data: the model inputs plus the
# SalePrice column that is later split off as the target.
cols_to_keep = [
    "ExterQual",
    "LotArea",
    "YrSold",
    "SalePrice",
]

In [6]:
# Restrict the frame to the selected columns (all rows are kept).
df = df_train.loc[:, cols_to_keep]

In [7]:
# Separate the feature columns (X) from the SalePrice target (y).
X = df.drop(columns=["SalePrice"])
y = df["SalePrice"]

# Split data into train and test

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns that are one-hot encoded versus columns that are scaled.
categorical_cols = ['YrSold', 'ExterQual']
numeric_cols = ['LotArea']

# Per-column-type preprocessing.
preprocessor = ColumnTransformer(
    transformers=[
        # Rescale numeric columns using the range seen during fit.
        ('num', MinMaxScaler(), numeric_cols),
        # handle_unknown='ignore': a category that appears only in the test
        # split is encoded as all zeros instead of raising at transform time
        # (the default, 'error', would make score() fail on such a split).
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Full pipeline: preprocessing followed by the regressor. Fitting the
# pipeline fits the preprocessing on the training split only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # scikit-learn's gradient boosting regressor (this is not the xgboost
    # library). Seeded so the fitted model, and therefore the reported
    # score, is reproducible like the split above.
    ('model', GradientBoostingRegressor(random_state=42)),
])

# Fit preprocessing and model on the training data.
pipeline.fit(X_train, y_train)

# Evaluate on the held-out test set; for a regressor, score() is R^2.
score = pipeline.score(X_test, y_test)
print("Pipeline score:", score)

Pipeline score: 0.6329993775542335
