# Problem Identification

We are creating an analytics type application where the user can explore the dataset provided with an interactive UI. One page in our application will allow the user to choose different features of a PC, Laptop, Or Partially built PC in order to provide the predicted price based on certain specifications.

## LightGBM

In [217]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor # Not used for LGBM in this script, but kept from original
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.impute import SimpleImputer
import numpy as np

# --- 1. DATA LOADING ---
# Ensure pandas (pd) is imported
# Replace with the correct paths to your datasets
print("Loading data...")
try:
    df = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered.csv')
    df_laptop = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered_laptop.csv')
    # !!! IMPORTANT: Verify and correct this path for your desktop PC data !!!
    df_desktop_pc = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered_desktop_pc.csv') # Or your actual desktop data file
    df_partial_pc = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered_partial_pc.csv')
    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please ensure the file paths are correct, especially for df_desktop_pc.")
    # Stop execution if data isn't loaded, or handle appropriately
    raise

# --- 2. INITIAL DATA CLEANING (Example from your snippets) ---
columns_to_drop = ['titulo', 'precio_min', 'precio_max', 'tipo'] # Adjust if needed
# Check if columns exist before dropping to avoid errors
existing_cols_df = [col for col in columns_to_drop if col in df.columns]
existing_cols_laptop = [col for col in columns_to_drop if col in df_laptop.columns]
existing_cols_desktop = [col for col in columns_to_drop if col in df_desktop_pc.columns]
existing_cols_partial = [col for col in columns_to_drop if col in df_partial_pc.columns]

if existing_cols_df:
    df = df.drop(columns=existing_cols_df)
if existing_cols_laptop:
    df_laptop = df_laptop.drop(columns=existing_cols_laptop)
if existing_cols_desktop:
    df_desktop_pc = df_desktop_pc.drop(columns=existing_cols_desktop)
if existing_cols_partial:
    df_partial_pc = df_partial_pc.drop(columns=existing_cols_partial)
print(f"Dropped columns (if they existed): {columns_to_drop}")


# --- 3. SANITY CHECKS FOR df_desktop_pc ---
if 'precio_mean' not in df_desktop_pc.columns:
    raise ValueError("Target column 'precio_mean' not found in df_desktop_pc after initial processing.")
if df_desktop_pc.empty:
    raise ValueError("The DataFrame df_desktop_pc is empty. Cannot train the model.")

# --- 4. DEFINE INITIAL FEATURES FOR SELECTION ---
# These are all columns in df_desktop_pc EXCEPT the target variable 'precio_mean'
initial_columns_to_consider = [col for col in df_desktop_pc.columns if col != 'precio_mean']
if not initial_columns_to_consider:
    raise ValueError("No feature columns found in df_desktop_pc after excluding 'precio_mean'.")
print(f"Initial columns for feature selection consideration: {initial_columns_to_consider}")

# --- 5. FEATURE SELECTION BLOCK ---
print("\n--- Starting Feature Selection Process ---")

X_initial = df_desktop_pc[initial_columns_to_consider]
y_initial = df_desktop_pc['precio_mean']

categorical_features_initial = X_initial.select_dtypes(include=['object', 'category']).columns
numerical_features_initial = X_initial.select_dtypes(include=['number']).columns

print(f"Initial categorical features for FS: {list(categorical_features_initial)}")
print(f"Initial numerical features for FS: {list(numerical_features_initial)}")


numerical_transformer_fs = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer_fs = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor_fs = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_fs, numerical_features_initial),
        ('cat', categorical_transformer_fs, categorical_features_initial)
    ],
    remainder='passthrough'
)

# We only need a training set to get feature importances for selection
X_train_fs, _, y_train_fs, _ = train_test_split(X_initial, y_initial, test_size=0.2, random_state=42)

lgbm_fs = lgb.LGBMRegressor(random_state=42, verbose=-1)
pipeline_fs = Pipeline(steps=[('preprocessor', preprocessor_fs),
                              ('regressor', lgbm_fs)])

print("Fitting pipeline for feature selection...")
pipeline_fs.fit(X_train_fs, y_train_fs)
print("Feature selection pipeline fitting complete.")

importances_fs = pipeline_fs.named_steps['regressor'].feature_importances_
transformed_feature_names_fs = pipeline_fs.named_steps['preprocessor'].get_feature_names_out()

original_feature_importances = {col: 0.0 for col in initial_columns_to_consider}

for i, full_transformed_name in enumerate(transformed_feature_names_fs):
    importance_value = importances_fs[i]
    name_parts = full_transformed_name.split('__', 1)
    if len(name_parts) < 2:
        print(f"Warning: Could not parse transformed feature name for FS: {full_transformed_name}")
        continue
    transformer_prefix = name_parts[0]
    internal_name = name_parts[1]

    mapped_to_original = False
    if transformer_prefix == 'num':
        if internal_name in original_feature_importances:
            original_feature_importances[internal_name] += importance_value
            mapped_to_original = True
    elif transformer_prefix == 'cat':
        for original_cat_col in categorical_features_initial:
            if internal_name == original_cat_col or internal_name.startswith(original_cat_col + "_"):
                original_feature_importances[original_cat_col] += importance_value
                mapped_to_original = True
                break
    elif transformer_prefix == 'remainder':
         if internal_name in original_feature_importances: 
             original_feature_importances[internal_name] += importance_value
             mapped_to_original = True

# Sort features by aggregated importance
sorted_original_features = sorted(original_feature_importances.items(), key=lambda item: item[1], reverse=True)

print("\nFeature importances (aggregated from initial set):")
for feature, score in sorted_original_features:
    print(f"{feature}: {score:.4f}")

# MODIFIED PART: Select features with importance score > 0
columns_to_keep = [feature for feature, score in sorted_original_features if score > 0]

if not columns_to_keep:
    print("\nWarning: No features found with importance score > 0. The main model might not train correctly or perform well.")
else:
    print(f"\nSelected {len(columns_to_keep)} features with importance > 0 for main model: {columns_to_keep}")
print("--- Feature Selection Process Complete ---")

# --- 6. MAIN MODEL TRAINING AND EVALUATION (Using selected features) ---
if not columns_to_keep:
    # If no features are selected, we cannot proceed with model training as X will be empty or cause errors.
    print("\nHalting script: No features selected for the main model based on importance scores.")
else:
    print("\n--- Starting Main Model Training and Evaluation with Hyperparameter Tuning ---")
    X = df_desktop_pc[columns_to_keep]
    y = df_desktop_pc['precio_mean']
    
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=['number']).columns
    
    print(f"Categorical features for main model: {list(categorical_features)}")
    print(f"Numerical features for main model: {list(numerical_features)}")
    
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define the LightGBM model within the pipeline for GridSearchCV
    # Original Param Frig done for full testing
    # param_grid = {
    #     'regressor__n_estimators': [100, 200, 300],
    #     # 'regressor__learning_rate': [0.01, 0.05, 0.1],
    #     # 'regressor__num_leaves': [20, 31, 40],
    #     # 'regressor__max_depth': [-1, 10, 20], # -1 means no limit
    #     # 'regressor__min_child_samples': [10, 20, 30],
    #     # 'regressor__reg_alpha': [0, 0.1, 0.5],
    #     # 'regressor__reg_lambda': [0, 0.1, 0.5]
    # }

    # Param Grid for LightGBM tested and best results
    param_grid = {
        'regressor__n_estimators': [500],
        'regressor__learning_rate': [0.05],
        'regressor__num_leaves': [70],
        'regressor__max_depth': [10],
        'regressor__min_child_samples': [20],
        'regressor__colsample_bytree': [0.8],
        'regressor__subsample': [0.8],
        'regressor__reg_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1],
        'regressor__reg_lambda': [0, 0.001, 0.01, 0.1, 0.5, 1]
    }
    
    # Create the pipeline (regressor is just a placeholder here, GridSearchCV will set it)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', lgb.LGBMRegressor(random_state=42))])
    
    # Note: 'neg_mean_squared_error' is commonly used for regression.
    # R-squared can also be used: scoring='r2'
    # For RMSE, you'd make a custom scorer: make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)), greater_is_better=False)
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
    
    print("Fitting GridSearchCV...")
    grid_search.fit(X_train, y_train)
    
    print("\nBest parameters found by GridSearchCV:")
    print(grid_search.best_params_)
    
    best_model = grid_search.best_estimator_
    print("\nMain model pipeline fitting complete using best estimator from GridSearchCV.")
    
    # Evaluate on the Test Set using the best model from GridSearchCV
    y_pred_test = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_pred_test)
    
    print("\nMetrics on the TEST Set (using best model from GridSearchCV):")
    print(f"Mean Squared Error (MSE): {test_mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {test_rmse:.4f}")
    print(f"R-squared: {test_r2:.4f}")
    
    # Evaluate on the Training Set
    y_pred_train = best_model.predict(X_train)
    train_mse = mean_squared_error(y_train, y_pred_train)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train, y_pred_train)
    
    print("\nMetrics on the TRAINING Set (using best model from GridSearchCV):")
    print(f"Mean Squared Error (MSE): {train_mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {train_rmse:.4f}")
    print(f"R-squared: {train_r2:.4f}")
    
    print("\n--- Script Execution Complete ---")

Loading data...
Data loaded successfully.
Dropped columns (if they existed): ['titulo', 'precio_min', 'precio_max', 'tipo']
Initial columns for feature selection consideration: ['tipo_de_producto', 'serie', 'procesador', 'disco_duro_tipo_de_disco_duro', 'grafica_salida_de_video', 'comunicaciones_estandar_wifi', 'procesador_fabricante', 'procesador_tipo', 'disco_duro_numero_de_discos_duros_instalados', 'grafica_memoria', 'sistema_operativo_sistema_operativo', 'procesador_nombre_clave', 'ram_tipo', 'procesador_numero_nucleos', 'grafica_tipo_memoria', 'grafica_tarjeta', 'comunicaciones_estandar_lan', 'propiedades_de_la_carcasa_tipo_de_caja', 'adecuado_para', 'procesador_zocalo_de_cpu', 'medidas_y_peso_profundidad_cm', 'medidas_y_peso_ancho_cm', 'procesador_cache_mb', 'ram_memoria_gb', 'ram_frecuencia_de_la_memoria_mhz', 'disco_duro_capacidad_de_memoria_ssd_gb', 'procesador_frecuencia_turbo_max_ghz', 'medidas_y_peso_alto_cm', 'alimentacion_wattage_binned', 'custom_category', 'equip_altavoc

KeyboardInterrupt: 

## XGBoost

In [215]:
import pandas as pd
import numpy as np
import xgboost as xgb # <--- Added XGBoost import
from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.ensemble import GradientBoostingRegressor # Kept if you might switch back
from sklearn.metrics import mean_squared_error, r2_score # make_scorer is available if needed
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import lightgbm as lgb # Kept for the feature selection part, can be removed if FS model also changes

# --- 1. DATA LOADING ---
print("Loading data...")
try:
    # !!! IMPORTANT: Verify and correct this path for your desktop PC data !!!
    df_desktop_pc = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered_desktop_pc.csv') # Or your actual desktop data file
    print("Desktop PC data loaded successfully.")
    # Load other dataframes if they are used elsewhere in your notebook, otherwise they can be removed.
    # df = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered.csv')
    # df_laptop = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered_laptop.csv')
    # df_partial_pc = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered_partial_pc.csv')
except FileNotFoundError as e:
    print(f"Error loading desktop PC data: {e}")
    print("Please ensure the file path for df_desktop_pc is correct.")
    raise

# --- 2. INITIAL DATA CLEANING (Example from your snippets) ---
# Adjust if these columns are not relevant or if other cleaning is needed for df_desktop_pc
columns_to_drop = ['titulo', 'precio_min', 'precio_max', 'tipo']
existing_cols_desktop = [col for col in columns_to_drop if col in df_desktop_pc.columns]
if existing_cols_desktop:
    df_desktop_pc = df_desktop_pc.drop(columns=existing_cols_desktop)
    print(f"Dropped columns from df_desktop_pc (if they existed): {existing_cols_desktop}")


# --- 3. SANITY CHECKS FOR df_desktop_pc ---
if 'precio_mean' not in df_desktop_pc.columns:
    raise ValueError("Target column 'precio_mean' not found in df_desktop_pc after initial processing.")
if df_desktop_pc.empty:
    raise ValueError("The DataFrame df_desktop_pc is empty. Cannot train the model.")

# --- 4. DEFINE INITIAL FEATURES FOR SELECTION ---
initial_columns_to_consider = [col for col in df_desktop_pc.columns if col != 'precio_mean']
if not initial_columns_to_consider:
    raise ValueError("No feature columns found in df_desktop_pc after excluding 'precio_mean'.")
print(f"Initial columns for feature selection consideration: {initial_columns_to_consider}")

# --- 5. FEATURE SELECTION BLOCK (using LightGBM for importances, can be changed) ---
# This block remains the same, using LightGBM to determine feature importances.
# If you want to use XGBoost for feature selection as well, this part would need modification.
print("\n--- Starting Feature Selection Process (using LightGBM for importances) ---")

X_initial = df_desktop_pc[initial_columns_to_consider]
y_initial = df_desktop_pc['precio_mean']

categorical_features_initial = X_initial.select_dtypes(include=['object', 'category']).columns
numerical_features_initial = X_initial.select_dtypes(include=['number']).columns

print(f"Initial categorical features for FS: {list(categorical_features_initial)}")
print(f"Initial numerical features for FS: {list(numerical_features_initial)}")

numerical_transformer_fs = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer_fs = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor_fs = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_fs, numerical_features_initial),
        ('cat', categorical_transformer_fs, categorical_features_initial)
    ], remainder='passthrough')

X_train_fs, _, y_train_fs, _ = train_test_split(X_initial, y_initial, test_size=0.2, random_state=42)

lgbm_fs = lgb.LGBMRegressor(random_state=42, verbose=-1) # Using LightGBM for feature selection
pipeline_fs = Pipeline(steps=[('preprocessor', preprocessor_fs), ('regressor', lgbm_fs)])

print("Fitting pipeline for feature selection (LightGBM)...")
pipeline_fs.fit(X_train_fs, y_train_fs)
print("Feature selection pipeline fitting complete.")

importances_fs = pipeline_fs.named_steps['regressor'].feature_importances_
transformed_feature_names_fs = pipeline_fs.named_steps['preprocessor'].get_feature_names_out()
original_feature_importances = {col: 0.0 for col in initial_columns_to_consider}

for i, full_transformed_name in enumerate(transformed_feature_names_fs):
    importance_value = importances_fs[i]
    name_parts = full_transformed_name.split('__', 1)
    if len(name_parts) < 2: continue
    transformer_prefix, internal_name = name_parts
    if transformer_prefix == 'num' and internal_name in original_feature_importances:
        original_feature_importances[internal_name] += importance_value
    elif transformer_prefix == 'cat':
        for original_cat_col in categorical_features_initial:
            if internal_name == original_cat_col or internal_name.startswith(original_cat_col + "_"):
                original_feature_importances[original_cat_col] += importance_value
                break
    elif transformer_prefix == 'remainder' and internal_name in original_feature_importances:
        original_feature_importances[internal_name] += importance_value

sorted_original_features = sorted(original_feature_importances.items(), key=lambda item: item[1], reverse=True)
print("\nFeature importances (aggregated from initial set using LightGBM):")
for feature, score in sorted_original_features: print(f"{feature}: {score:.4f}")

columns_to_keep = [feature for feature, score in sorted_original_features if score > 0]
if not columns_to_keep:
    print("\nWarning: No features found with importance score > 0 from LightGBM FS.")
else:
    print(f"\nSelected {len(columns_to_keep)} features (importance > 0 from LightGBM FS) for main model: {columns_to_keep}")
print("--- Feature Selection Process Complete ---")


# --- 6. MAIN MODEL TRAINING AND EVALUATION with XGBoost ---
if not columns_to_keep:
    print("\nHalting script: No features selected for the main model.")
else:
    print("\n--- Starting Main Model Training and Evaluation with XGBoost ---")
    X = df_desktop_pc[columns_to_keep]
    y = df_desktop_pc['precio_mean']

    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=['number']).columns

    print(f"Categorical features for XGBoost model: {list(categorical_features)}")
    print(f"Numerical features for XGBoost model: {list(numerical_features)}")

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ], remainder='passthrough')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # XGBoost Parameter Grid for GridSearchCV
    # Note: XGBoost can handle categorical features with its 'enable_categorical=True' experimental feature,
    # but it's often more robust to one-hot encode them as we are doing.
    param_grid_xgb = {
        'regressor__n_estimators': [100, 200, 300],
        # 'regressor__learning_rate': [0.01, 0.05, 0.1],
        # 'regressor__max_depth': [3, 5, 7], # Typical values for XGBoost
        # 'regressor__subsample': [0.7, 0.8, 1.0],
        # 'regressor__colsample_bytree': [0.7, 0.8, 1.0],
        # 'regressor__gamma': [0, 0.1, 0.2], # Minimum loss reduction required to make a further partition
        # 'regressor__reg_alpha': [0, 0.01, 0.1], # L1 regularization
        # 'regressor__reg_lambda': [1, 0.1, 0.01]  # L2 regularization (XGBoost default is 1)
    }

    # Create the pipeline with XGBoost Regressor
    # XGBoost may issue warnings about unsupported 'verbose' if passed from LGBM feature selector;
    # we define a new regressor instance here.
    pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', xgb.XGBRegressor(random_state=42, objective='reg:squarederror'))])
                                   # objective='reg:squarederror' suppresses a warning in newer XGBoost versions.

    grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

    print("Fitting GridSearchCV with XGBoost...")
    grid_search_xgb.fit(X_train, y_train)

    print("\nBest parameters found by GridSearchCV for XGBoost:")
    print(grid_search_xgb.best_params_)

    best_model_xgb = grid_search_xgb.best_estimator_
    print("\nXGBoost model pipeline fitting complete using best estimator from GridSearchCV.")

    # Evaluate on the Test Set
    y_pred_test_xgb = best_model_xgb.predict(X_test)
    test_mse_xgb = mean_squared_error(y_test, y_pred_test_xgb)
    test_rmse_xgb = np.sqrt(test_mse_xgb)
    test_r2_xgb = r2_score(y_test, y_pred_test_xgb)

    print("\nMetrics on the TEST Set (using best XGBoost model):")
    print(f"Mean Squared Error (MSE): {test_mse_xgb:.4f}")
    print(f"Root Mean Squared Error (RMSE): {test_rmse_xgb:.4f}")
    print(f"R-squared: {test_r2_xgb:.4f}")

    # Evaluate on the Training Set
    y_pred_train_xgb = best_model_xgb.predict(X_train)
    train_mse_xgb = mean_squared_error(y_train, y_pred_train_xgb)
    train_rmse_xgb = np.sqrt(train_mse_xgb)
    train_r2_xgb = r2_score(y_train, y_pred_train_xgb)

    print("\nMetrics on the TRAINING Set (using best XGBoost model):")
    print(f"Mean Squared Error (MSE): {train_mse_xgb:.4f}")
    print(f"Root Mean Squared Error (RMSE): {train_rmse_xgb:.4f}")
    print(f"R-squared: {train_r2_xgb:.4f}")

print("\n--- Script Execution Complete ---")

Loading data...
Desktop PC data loaded successfully.
Dropped columns from df_desktop_pc (if they existed): ['titulo', 'precio_min', 'precio_max', 'tipo']
Initial columns for feature selection consideration: ['tipo_de_producto', 'serie', 'procesador', 'disco_duro_tipo_de_disco_duro', 'grafica_salida_de_video', 'comunicaciones_estandar_wifi', 'procesador_fabricante', 'procesador_tipo', 'disco_duro_numero_de_discos_duros_instalados', 'grafica_memoria', 'sistema_operativo_sistema_operativo', 'procesador_nombre_clave', 'ram_tipo', 'procesador_numero_nucleos', 'grafica_tipo_memoria', 'grafica_tarjeta', 'comunicaciones_estandar_lan', 'propiedades_de_la_carcasa_tipo_de_caja', 'adecuado_para', 'procesador_zocalo_de_cpu', 'medidas_y_peso_profundidad_cm', 'medidas_y_peso_ancho_cm', 'procesador_cache_mb', 'ram_memoria_gb', 'ram_frecuencia_de_la_memoria_mhz', 'disco_duro_capacidad_de_memoria_ssd_gb', 'procesador_frecuencia_turbo_max_ghz', 'medidas_y_peso_alto_cm', 'alimentacion_wattage_binned', 'cu

## Nueral Net

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor # <--- Added MLPRegressor import
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import lightgbm as lgb # Kept for the feature selection part

# --- 1. DATA LOADING ---
print("Loading data...")
try:
    # !!! IMPORTANT: Verify and correct this path for your desktop PC data !!!
    df_desktop_pc = pd.read_csv('/Users/oliverholmes/Documents/BCSAI/SecondYear/Machine Learning/Assignments/PcPartPicker3000/assignment/df_engineered_desktop_pc.csv') # Or your actual desktop data file
    print("Desktop PC data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading desktop PC data: {e}")
    print("Please ensure the file path for df_desktop_pc is correct.")
    raise

# --- 2. INITIAL DATA CLEANING ---
columns_to_drop = ['titulo', 'precio_min', 'precio_max', 'tipo']
existing_cols_desktop = [col for col in columns_to_drop if col in df_desktop_pc.columns]
if existing_cols_desktop:
    df_desktop_pc = df_desktop_pc.drop(columns=existing_cols_desktop)
    print(f"Dropped columns from df_desktop_pc (if they existed): {existing_cols_desktop}")

# --- 3. SANITY CHECKS FOR df_desktop_pc ---
if 'precio_mean' not in df_desktop_pc.columns:
    raise ValueError("Target column 'precio_mean' not found in df_desktop_pc.")
if df_desktop_pc.empty:
    raise ValueError("The DataFrame df_desktop_pc is empty.")

# --- 4. DEFINE INITIAL FEATURES FOR SELECTION ---
initial_columns_to_consider = [col for col in df_desktop_pc.columns if col != 'precio_mean']
if not initial_columns_to_consider:
    raise ValueError("No feature columns found in df_desktop_pc after excluding 'precio_mean'.")
print(f"Initial columns for feature selection: {initial_columns_to_consider}")

# --- 5. FEATURE SELECTION BLOCK (using LightGBM for importances) ---
print("\n--- Starting Feature Selection Process (using LightGBM for importances) ---")
X_initial = df_desktop_pc[initial_columns_to_consider]
y_initial = df_desktop_pc['precio_mean']

categorical_features_initial = X_initial.select_dtypes(include=['object', 'category']).columns
numerical_features_initial = X_initial.select_dtypes(include=['number']).columns

numerical_transformer_fs = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
categorical_transformer_fs = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor_fs = ColumnTransformer(transformers=[('num', numerical_transformer_fs, numerical_features_initial), ('cat', categorical_transformer_fs, categorical_features_initial)], remainder='passthrough')

X_train_fs, _, y_train_fs, _ = train_test_split(X_initial, y_initial, test_size=0.2, random_state=42)
lgbm_fs = lgb.LGBMRegressor(random_state=42, verbose=-1)
pipeline_fs = Pipeline(steps=[('preprocessor', preprocessor_fs), ('regressor', lgbm_fs)])

print("Fitting pipeline for feature selection (LightGBM)...")
pipeline_fs.fit(X_train_fs, y_train_fs)
print("Feature selection pipeline fitting complete.")

importances_fs = pipeline_fs.named_steps['regressor'].feature_importances_
transformed_feature_names_fs = pipeline_fs.named_steps['preprocessor'].get_feature_names_out()
original_feature_importances = {col: 0.0 for col in initial_columns_to_consider}

for i, full_transformed_name in enumerate(transformed_feature_names_fs):
    importance_value = importances_fs[i]
    name_parts = full_transformed_name.split('__', 1)
    if len(name_parts) < 2: continue
    transformer_prefix, internal_name = name_parts
    if transformer_prefix == 'num' and internal_name in original_feature_importances:
        original_feature_importances[internal_name] += importance_value
    elif transformer_prefix == 'cat':
        for original_cat_col in categorical_features_initial:
            if internal_name == original_cat_col or internal_name.startswith(original_cat_col + "_"):
                original_feature_importances[original_cat_col] += importance_value
                break
    elif transformer_prefix == 'remainder' and internal_name in original_feature_importances:
        original_feature_importances[internal_name] += importance_value

sorted_original_features = sorted(original_feature_importances.items(), key=lambda item: item[1], reverse=True)
print("\nFeature importances (aggregated from LightGBM FS):")
for feature, score in sorted_original_features: print(f"{feature}: {score:.4f}")

columns_to_keep = [feature for feature, score in sorted_original_features if score > 0]
if not columns_to_keep:
    print("\nWarning: No features found with importance > 0 from LightGBM FS.")
else:
    print(f"\nSelected {len(columns_to_keep)} features (importance > 0 from LightGBM FS) for main model: {columns_to_keep}")
print("--- Feature Selection Process Complete ---")

# --- 6. MAIN MODEL TRAINING AND EVALUATION with MLPRegressor ---
if not columns_to_keep:
    print("\nHalting script: No features selected for the main model.")
else:
    print("\n--- Starting Main Model Training and Evaluation with MLPRegressor ---")
    X = df_desktop_pc[columns_to_keep]
    y = df_desktop_pc['precio_mean']

    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=['number']).columns

    print(f"Categorical features for MLP model: {list(categorical_features)}")
    print(f"Numerical features for MLP model: {list(numerical_features)}")

    numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]) # Scaling is crucial for NNs
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
    preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features), ('cat', categorical_transformer, categorical_features)], remainder='passthrough')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # MLPRegressor Parameter Grid for GridSearchCV
    param_grid_mlp = {
        'regressor__hidden_layer_sizes': [(50,), (100,), (50,25)], # One or two hidden layers
        # 'regressor__activation': ['relu', 'tanh'],
        # 'regressor__solver': ['adam'], # Adam is often a good default
        'regressor__alpha': [0.0001, 0.001, 0.01], # L2 regularization
        # 'regressor__learning_rate_init': [0.001, 0.01],
        # 'regressor__max_iter': [300, 500] # Allow more iterations for convergence
        'regressor__early_stopping': [True], # Can help prevent overfitting and speed up grid search
        # 'regressor__n_iter_no_change': [10]   # Used with early_stopping
    }

    pipeline_mlp = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', MLPRegressor(random_state=42))])

    grid_search_mlp = GridSearchCV(pipeline_mlp, param_grid_mlp, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

    print("Fitting GridSearchCV with MLPRegressor...")
    grid_search_mlp.fit(X_train, y_train) # NNs can take longer to train, especially with GridSearchCV

    print("\nBest parameters found by GridSearchCV for MLPRegressor:")
    print(grid_search_mlp.best_params_)

    best_model_mlp = grid_search_mlp.best_estimator_
    print("\nMLPRegressor model pipeline fitting complete using best estimator.")

    # Evaluate on the Test Set
    y_pred_test_mlp = best_model_mlp.predict(X_test)
    test_mse_mlp = mean_squared_error(y_test, y_pred_test_mlp)
    test_rmse_mlp = np.sqrt(test_mse_mlp)
    test_r2_mlp = r2_score(y_test, y_pred_test_mlp)

    print("\nMetrics on the TEST Set (using best MLPRegressor model):")
    print(f"Mean Squared Error (MSE): {test_mse_mlp:.4f}")
    print(f"Root Mean Squared Error (RMSE): {test_rmse_mlp:.4f}")
    print(f"R-squared: {test_r2_mlp:.4f}")

    # Evaluate on the Training Set
    y_pred_train_mlp = best_model_mlp.predict(X_train)
    train_mse_mlp = mean_squared_error(y_train, y_pred_train_mlp)
    train_rmse_mlp = np.sqrt(train_mse_mlp)
    train_r2_mlp = r2_score(y_train, y_pred_train_mlp)

    print("\nMetrics on the TRAINING Set (using best MLPRegressor model):")
    print(f"Mean Squared Error (MSE): {train_mse_mlp:.4f}")
    print(f"Root Mean Squared Error (RMSE): {train_rmse_mlp:.4f}")
    print(f"R-squared: {train_r2_mlp:.4f}")

print("\n--- Script Execution Complete ---")

Loading data...
Desktop PC data loaded successfully.
Dropped columns from df_desktop_pc (if they existed): ['titulo', 'precio_min', 'precio_max', 'tipo']
Initial columns for feature selection: ['tipo_de_producto', 'serie', 'procesador', 'disco_duro_tipo_de_disco_duro', 'grafica_salida_de_video', 'comunicaciones_estandar_wifi', 'procesador_fabricante', 'procesador_tipo', 'disco_duro_numero_de_discos_duros_instalados', 'grafica_memoria', 'sistema_operativo_sistema_operativo', 'procesador_nombre_clave', 'ram_tipo', 'procesador_numero_nucleos', 'grafica_tipo_memoria', 'grafica_tarjeta', 'comunicaciones_estandar_lan', 'propiedades_de_la_carcasa_tipo_de_caja', 'adecuado_para', 'procesador_zocalo_de_cpu', 'medidas_y_peso_profundidad_cm', 'medidas_y_peso_ancho_cm', 'procesador_cache_mb', 'ram_memoria_gb', 'ram_frecuencia_de_la_memoria_mhz', 'disco_duro_capacidad_de_memoria_ssd_gb', 'procesador_frecuencia_turbo_max_ghz', 'medidas_y_peso_alto_cm', 'alimentacion_wattage_binned', 'custom_category'




Best parameters found by GridSearchCV for MLPRegressor:
{'regressor__alpha': 0.0001, 'regressor__early_stopping': True, 'regressor__hidden_layer_sizes': (50, 25)}

MLPRegressor model pipeline fitting complete using best estimator.

Metrics on the TEST Set (using best MLPRegressor model):
Mean Squared Error (MSE): 1114243.4764
Root Mean Squared Error (RMSE): 1055.5773
R-squared: 0.4761

Metrics on the TRAINING Set (using best MLPRegressor model):
Mean Squared Error (MSE): 350534.4911
Root Mean Squared Error (RMSE): 592.0595
R-squared: 0.7426

--- Script Execution Complete ---


# Dataset Creation

## Dataset Preperation

### Categorical Encoding

# Model Selection

After 230 minutes of running, this was the result of the cross validation:

GridSearchCV Complete.
Best parameters found: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 10, 'regressor__min_child_samples': 20, 'regressor__n_estimators': 500, 'regressor__num_leaves': 70, 'regressor__subsample': 0.8}

Best cross-validation score (negative MSE): -375894.4673686046

Best cross-validation RMSE: 613.1023302586646

Metrics on the Test Set using the Best Model from GridSearchCV:

Mean Squared Error (MSE): 347437.5443584199

Root Mean Squared Error (RMSE): 589.4383295633394

R-squared: 0.7957941987120332
Metrics on the Training Set using the Best Model from GridSearchCV:

Mean Squared Error (MSE): 73745.4619638589

Root Mean Squared Error (RMSE): 271.5611569496987

R-squared: 0.9516823528897075

# Model Training

# Model Assessment

# Model Optimization

# Model Deployment