### 1. Imports & Setup 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

### 2. Load Cleaned Data

In [2]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/processed/cleaned_data.csv')

In [3]:
# Discard any row that is missing either 'price' or 'brand_popularity'.
df = df.dropna(axis=0, how='any', subset=['price', 'brand_popularity'])

### 3. Feature Engineering

In [6]:
# Character length of each description. Missing descriptions count as empty
# text (length 0): casting NaN straight to str would produce the literal 'nan'
# and a bogus length of 3. `assign` returns a new frame instead of writing into
# the existing one, so re-running this cell is idempotent.
df = df.assign(
    desc_length=df['description'].fillna('').astype(str).str.len()
)

# One-hot encode the categorical columns; drop_first omits the first level of
# each column so the indicator columns are not redundant with one another.
categorical_cols = ['brand', 'type']
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)

# Final feature matrix (numeric features followed by the indicator columns)
# and the regression target.
X = pd.concat([df[['brand_popularity', 'desc_length']], df_encoded], axis=1)
Y = df['price']

### 4. Train/Test Split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### 5. Models

In [10]:
# Linear regression baseline: fit on the training split, then predict the test split.
# `fit` returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression().fit(X_train, Y_train)
lr_predictions = lr_model.predict(X_test)

# Gradient boosting via scikit-learn's GradientBoostingRegressor (this is not
# the XGBoost library). A fixed random_state seeds the tree construction so the
# fitted model, and the metrics computed from it, are repeatable across runs.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, Y_train)
gbr_predictions = gbr.predict(X_test)

### 6. Evaluation

In [11]:
def evaluate(name, y_true, y_pred):
    """Print the MAE and RMSE of ``y_pred`` measured against ``y_true``.

    Args:
        name: Label prefixed to each printed line.
        y_true: Ground-truth target values.
        y_pred: Predicted values to score against ``y_true``.

    Returns:
        None. Both metrics are printed, formatted to two decimal places.
    """
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{name} MAE: {mae:.2f}")
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name} RMSE: {rmse:.2f}")

# Score both models against the same held-out targets.
evaluate(name="Linear Regression", y_true=Y_test, y_pred=lr_predictions)
evaluate(name="Gradient Boosting", y_true=Y_test, y_pred=gbr_predictions)

Linear Regression MAE: 699.76
Linear Regression RMSE: 925.06
Gradient Boosting MAE: 627.88
Gradient Boosting RMSE: 851.07
