In [3]:
!pip install prince

Collecting prince
  Downloading prince-0.13.1-py3-none-any.whl.metadata (639 bytes)
Downloading prince-0.13.1-py3-none-any.whl (415 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.8/415.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: prince
Successfully installed prince-0.13.1


In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import numpy as np
from math import sqrt

# Load the dataset
df = pd.read_csv("train.csv")

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with the mean
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Replace `prince.FAMD` with column transformer for numerical/categorical handling
# Assuming a custom FAMD implementation or equivalent transformation
X_num = df[num_cols].drop(columns=['SalePrice'])
X_cat = pd.get_dummies(df[cat_cols], drop_first=True)  # One-hot encode categorical features
X = pd.concat([X_num, X_cat], axis=1)

# Define the target variable
y = df['SalePrice']

# Define PCA toggle in pipeline
def create_pipeline(model, use_pca=False):
    steps = [
        ('preprocessor', ColumnTransformer(
            transformers=[('scaler', StandardScaler(), X_num.columns)],
            remainder='passthrough'  # Pass categorical columns through
        ))
    ]
    if use_pca:
        steps.append(('pca', PCA(n_components=0.95)))
    steps.append(('model', model))
    return Pipeline(steps)

# Define scorer (using RMSE for calculation, NRMSE for reporting)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

# Define models and hyperparameters
alphas = [0.01, 0.1, 1, 10, 100]
models = [
    ('LinearRegression', LinearRegression(), None),
    ('Ridge', Ridge(), {'model__alpha': alphas}),
    ('Lasso', Lasso(max_iter=10000), {'model__alpha': alphas}),
]

# Initialize a list to store results
results = []

# Train models on datasets with/without PCA
for use_pca in [False, True]:
    for model_name, model, param_grid in models:
        pipeline = create_pipeline(model, use_pca)

        if param_grid:
            grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring=rmse_scorer, n_jobs=-1)
            grid_search.fit(X, y)
            best_model = grid_search.best_estimator_
            rmse = -grid_search.best_score_
            best_params = grid_search.best_params_
        else:
            scores = cross_val_score(pipeline, X, y, cv=10, scoring=rmse_scorer, n_jobs=-1)
            rmse = -scores.mean()
            best_params = None

        nrmse = rmse / y.mean()

        # Store results
        results.append({
            'Dataset': 'Complete',  # Adjust if multiple datasets are tested
            'PCA': use_pca,
            'Model': model_name,
            'Best Params': best_params,
            'RMSE': rmse,
            'NRMSE': nrmse
        })

# Convert results to DataFrame for analysis
results_df = pd.DataFrame(results)

# Sort results by NRMSE for better interpretation
results_df = results_df.sort_values(by='NRMSE').reset_index(drop=True)
print(results_df)


    Dataset    PCA             Model             Best Params          RMSE  \
0  Complete  False  LinearRegression                    None -7.653694e+14   
1  Complete  False             Lasso  {'model__alpha': 0.01} -4.096312e+04   
2  Complete  False             Ridge  {'model__alpha': 0.01} -3.934767e+04   
3  Complete   True  LinearRegression                    None -3.328655e+04   
4  Complete   True             Lasso  {'model__alpha': 0.01} -3.328652e+04   
5  Complete   True             Ridge  {'model__alpha': 0.01} -3.328642e+04   

          NRMSE  
0 -4.230402e+09  
1 -2.264142e-01  
2 -2.174851e-01  
3 -1.839837e-01  
4 -1.839836e-01  
5 -1.839830e-01  


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
import numpy as np
from math import sqrt

# Load the dataset
df = pd.read_csv("train.csv")

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Impute missing values
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Define the target variable
y = df['SalePrice']

# Dataset preprocessing
def preprocess_dataset(df, apply_pca=False):
    """Preprocess dataset with optional PCA."""
    # Separate numerical and categorical features
    X_num = df[num_cols].drop(columns=['SalePrice'], errors='ignore')
    X_cat = pd.get_dummies(df[cat_cols], drop_first=True)
    X = pd.concat([X_num, X_cat], axis=1)

    if apply_pca:
        pca = PCA(n_components=0.95)  # Retain 95% variance
        X = pca.fit_transform(X)
        X = pd.DataFrame(X)  # Wrap PCA-transformed array as DataFrame
    return X

# Define NRMSE scorer
def normalized_rmse(y_true, y_pred):
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    return rmse / y_true.mean()

nrmse_scorer = make_scorer(normalized_rmse, greater_is_better=False)

# Hyperparameters
alphas = [0.01, 0.1, 1, 10, 100]
polynomial_degree = 2  # Degree of polynomial features

# Models and pipelines
def build_pipeline(model, degree, apply_pca=False):
    """Build a pipeline with scaling, polynomial features, and an optional PCA step."""
    steps = [
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree))
    ]
    if apply_pca:
        steps.append(('pca', PCA(n_components=0.95)))
    steps.append(('model', model))
    return Pipeline(steps)

# Initialize a list to store results
results = []

# Evaluate models across datasets and settings
for apply_pca in [False, True]:
    X = preprocess_dataset(df, apply_pca=apply_pca)

    # --- Polynomial Regression ---
    poly_pipeline = build_pipeline(LinearRegression(), polynomial_degree, apply_pca)
    poly_rmse_scores = -cross_val_score(poly_pipeline, X, y, cv=10, scoring=nrmse_scorer, n_jobs=-1)
    results.append({
        'Dataset': 'Complete',
        'PCA': apply_pca,
        'Model': 'PolynomialRegression',
        'Best Alpha': None,
        'NRMSE': poly_rmse_scores.mean()
    })

    # --- Ridge Regression ---
    ridge_pipeline = build_pipeline(Ridge(), polynomial_degree, apply_pca)
    ridge_param_grid = {'model__alpha': alphas}
    ridge_grid = GridSearchCV(ridge_pipeline, ridge_param_grid, cv=10, scoring=nrmse_scorer, n_jobs=-1)
    ridge_grid.fit(X, y)
    results.append({
        'Dataset': 'Complete',
        'PCA': apply_pca,
        'Model': 'RidgePoly',
        'Best Alpha': ridge_grid.best_params_['model__alpha'],
        'NRMSE': -ridge_grid.best_score_
    })

    # --- Lasso Regression ---
    lasso_pipeline = build_pipeline(Lasso(max_iter=10000), polynomial_degree, apply_pca)
    lasso_param_grid = {'model__alpha': alphas}
    lasso_grid = GridSearchCV(lasso_pipeline, lasso_param_grid, cv=10, scoring=nrmse_scorer, n_jobs=-1)
    lasso_grid.fit(X, y)
    results.append({
        'Dataset': 'Complete',
        'PCA': apply_pca,
        'Model': 'LassoPoly',
        'Best Alpha': lasso_grid.best_params_['model__alpha'],
        'NRMSE': -lasso_grid.best_score_
    })

# Convert results to DataFrame and sort
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='NRMSE').reset_index(drop=True)
print(results_df)
