# Problem Identification

We are building an analytics application in which the user can explore the provided dataset through an interactive UI. One page of the application will let the user choose different features of a PC, laptop, or partially built PC and will return a predicted price based on those specifications.

# Dataset Creation

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [65]:
# Load the engineered dataset produced by the EDA notebook.
# The location can be overridden with the PCPP_DATA_PATH environment variable so the
# notebook can run on other machines; the original absolute path remains the default,
# so behavior is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    'PCPP_DATA_PATH',
    '/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered.csv',
)
df = pd.read_csv(DATA_PATH)

## Dataset Preparation

In [66]:
# Preview the first five rows of the engineered dataset (rich display as the cell's last expression)
df.head(n=5)

Unnamed: 0,titulo,tipo_de_producto,serie,pantalla_tecnologia,procesador,disco_duro_tipo_de_disco_duro,grafica_salida_de_video,conectividad_lector_de_tarjetas,comunicaciones_version_bluetooth,camara_webcam,...,disco_duro_capacidad_de_memoria_ssd_gb,procesador_frecuencia_turbo_max_ghz,procesador_potencia_maxima_de_diseno_termico_tdp_watts,medidas_y_peso_alto_cm,alimentacion_wattage_binned,volumen_cm3,custom_category,equip_altavoces_estéreo,equip_refrigeración_líquida,equip_usb_c
0,"Apple MacBook Air 13"" 2024 M3",Portátil multimedia,Apple MacBook Air,Retina,Apple M3,SSD,Thunderbolt 4,ninguno,Bluetooth 5.3,integrada,...,512.0,4.7,28.0,,,718.96,Laptop,1,0,1
1,"Apple MacBook Air 15"" 2024 M3",Portátil multimedia,Apple MacBook Air,Retina,Apple M3,SSD,Thunderbolt 4,ninguno,Bluetooth 5.3,integrada,...,512.0,4.7,28.0,,,971.04,Laptop,1,0,1
2,"Apple MacBook Pro 14"" M4 (2024)",Portátil multimedia,Apple MacBook Pro,Retina,Unknown,SSD,HDMI,integrado,Bluetooth 5.3,12.0 Megapixel,...,512.0,4.7,28.0,,,1106.768,Laptop,1,0,1
3,ASUS TUF Gaming A15 FA506 (2024),Portátil gaming,ASUS TUF Gaming,Full HD,Unknown,SSD,HDMI,integrado,Bluetooth 5.2,integrada,...,512.0,4.7,28.0,,,2113.792,Laptop,1,0,1
4,GigaByte Aero 16 OLED BKF,Portátil gaming,Gigabyte Aero,UHD+,Intel Core i7-13700H,SSD,HDMI,integrado,Bluetooth 5.2,integrada,...,1000.0,5.0,45.0,,,1985.94,Laptop,1,0,1


In [70]:
# TODO: drop high-cardinality columns that act as unique identifiers (e.g. `titulo`) — not yet implemented


### Categorical Encoding

# Model Selection

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
# If you have categorical features, you'll need an encoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model Training

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
# If you have categorical features, you'll need an encoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler # <--- Added StandardScaler here
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb # <--- Added LightGBM import

# Train and evaluate a LightGBM price regressor, then report the top feature importances.
# NOTE(review): `df_desktop_pc` is not defined in any cell shown above this one; it must
# already exist in the kernel (presumably a filtered subset of `df` — confirm) and should
# be defined explicitly so the notebook survives Restart Kernel -> Run All.

# Fail fast with a clear message if the target column is missing
if 'precio_mean' not in df_desktop_pc.columns:
    raise ValueError("Target column 'precio_mean' not found in DataFrame")

# Features: everything except the target, the other price columns and 'tipo'
# (the price columns are presumably excluded to avoid leaking the target — confirm)
X = df_desktop_pc.drop(['precio_mean', 'precio_max', 'precio_min', 'tipo'], axis=1)
y = df_desktop_pc['precio_mean']

# Split the columns by dtype so each group gets the appropriate preprocessing
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['number']).columns

# Numerical columns are standardized
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns are one-hot encoded; categories unseen during fit are ignored
# at predict time instead of raising
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each transformer to its column group. The output column order is
# numerical first, then one-hot columns — the feature-name list below relies on this.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Hold out 20% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM regressor with default hyperparameters and a fixed seed
lgb_model = lgb.LGBMRegressor(random_state=42)

# Preprocessing and model in a single pipeline so the transformers are fit on training data only
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', lgb_model)])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the held-out set
y_pred = pipeline.predict(X_test)

# Evaluate on the held-out set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Rebuild the post-encoding feature names in the same order the ColumnTransformer emits them:
# the numerical column names first, followed by the one-hot encoded category names
feature_names = list(pipeline.named_steps['preprocessor'].transformers_[0][2])  # Numerical features
cat_feature_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
feature_names.extend(cat_feature_names)

importances = pipeline.named_steps['regressor'].feature_importances_
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

# Fixed: the original used "\\n", which printed a literal backslash-n instead of a blank line
print("\nFeature Importances:")
print(feature_importance_df.head(10))  # Display top 10 features

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002688 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 766
[LightGBM] [Info] Number of data points in the train set: 3148, number of used features: 192
[LightGBM] [Info] Start training from score 1399.350882
Mean Squared Error: 716107.0640794821
R-squared: 0.6633239275835138
\nFeature Importances:
                                     feature  importance
2              medidas_y_peso_profundidad_cm         291
3                    medidas_y_peso_ancho_cm         198
5                             ram_memoria_gb         165
9                     medidas_y_peso_alto_cm         156
8        procesador_frecuencia_turbo_max_ghz         132
4                        procesador_cache_mb         128
3164            tipo_de_producto_Workstation          98
3632      grafica_memoria_memoria compartida 

In [69]:
# Compare training-set and test-set error to gauge how much the model overfits.
# This cell relies on `pipeline`, `X_train`, `X_test`, `y_train` and `y_test`
# already being defined by the training cell.

# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)

# Predictions for both splits (training split first, then test split)
y_train_pred, y_test_pred = (pipeline.predict(part) for part in (X_train, X_test))

# Mean squared error per split
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# Root mean squared error per split
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Coefficient of determination per split
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Report both splits in the same three-line format
for heading, mse_value, rmse_value, r2_value in (
    ("Training Set Metrics:", train_mse, train_rmse, train_r2),
    ("\nTest Set Metrics:", test_mse, test_rmse, test_r2),
):
    print(heading)
    print(f"Mean Squared Error (MSE): {mse_value}")
    print(f"Root Mean Squared Error (RMSE): {rmse_value}")
    print(f"R-squared: {r2_value}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 766
[LightGBM] [Info] Number of data points in the train set: 3148, number of used features: 192
[LightGBM] [Info] Start training from score 1399.350882
Training Set Metrics:
Mean Squared Error (MSE): 163048.28539812498
Root Mean Squared Error (RMSE): 403.79237907385647
R-squared: 0.8802917236764781

Test Set Metrics:
Mean Squared Error (MSE): 716107.0640794821
Root Mean Squared Error (RMSE): 846.2310937796378
R-squared: 0.6633239275835138


# Model Assessment

# Model Optimization

# Model Deployment