In [3]:
!pip install prince

Collecting prince
  Downloading prince-0.13.1-py3-none-any.whl.metadata (639 bytes)
Downloading prince-0.13.1-py3-none-any.whl (415 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.8/415.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: prince
Successfully installed prince-0.13.1


In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
import numpy as np
from math import sqrt
import prince
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

# Load the dataset (assuming the original dataset is loaded into df)
df = pd.read_csv("train.csv")

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with the mean
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Apply FAMD after imputation
famd = prince.FAMD(n_components=10, random_state=42)
famd_df = famd.fit_transform(df.drop(columns=['SalePrice']))

# Define the target variable
y = df['SalePrice']

# Define datasets based on available features (replace these with your own data preprocessing as needed)
datasets = {
    "numerical": df[['GrLivArea', 'TotalBsmtSF', 'YearBuilt', 'LotArea']],
    "complete": df.drop(columns=['SalePrice']),  # Assuming all features except target
    "famd": famd_df  # Replace `famd_df` with your actual famd dataset if preprocessed separately
}

# Define NRMSE scorer
def normalized_rmse(y_true, y_pred):
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    return rmse / y_true.mean()

nrmse_scorer = make_scorer(normalized_rmse, greater_is_better=False)

# Define Ridge and Lasso hyperparameters
alphas = [0.01, 0.1, 1, 10, 100]

# Initialize a list to store results
results = []

# Loop through datasets
for dataset_name, X in datasets.items():
    # Loop through PCA options (with and without)
    for apply_pca in [False, True]:
        # Apply PCA if selected
        if apply_pca:
            # One-hot encode categorical columns before applying PCA
            X_transformed = pd.get_dummies(X, drop_first=True)
            pca = PCA(n_components=0.95)  # Adjusted to retain 95% of variance
            X_transformed = pca.fit_transform(X_transformed)
            # Wrap the PCA-transformed data in a DataFrame
            X_transformed = pd.DataFrame(X_transformed)
        else:
            # If no PCA, one-hot encode categorical columns
            X_transformed = pd.get_dummies(X, drop_first=True)

        # Standard scaler to ensure features are on the same scale
        scaler = StandardScaler()

        # --- Regular Linear Regression ---
        linear_pipeline = Pipeline([
            ('scaler', scaler),
            ('linear', LinearRegression())
        ])

        # Evaluate Linear Regression with cross-validation
        linear_rmse_scores = -cross_val_score(linear_pipeline, X_transformed, y, cv=10, scoring=nrmse_scorer, n_jobs=-1)
        linear_rmse = linear_rmse_scores.mean()

        # Store Linear Regression results
        results.append({
            'dataset': dataset_name,
            'pca': apply_pca,
            'model': 'LinearRegression',
            'best_alpha': None,
            'nrmse': linear_rmse
        })

        # --- Ridge Regression with GridSearchCV ---
        ridge_pipeline = Pipeline([
            ('scaler', scaler),
            ('ridge', Ridge())
        ])

        ridge_param_grid = {'ridge__alpha': alphas}

        ridge_grid_search = GridSearchCV(ridge_pipeline, ridge_param_grid, cv=10, scoring=nrmse_scorer, n_jobs=-1)
        ridge_grid_search.fit(X_transformed, y)

        # Get best Ridge model and its NRMSE
        best_ridge_model = ridge_grid_search.best_estimator_
        best_ridge_nrmse = -ridge_grid_search.best_score_

        # Store Ridge results
        results.append({
            'dataset': dataset_name,
            'pca': apply_pca,
            'model': 'Ridge',
            'best_alpha': ridge_grid_search.best_params_['ridge__alpha'],
            'nrmse': best_ridge_nrmse
        })

        # --- Lasso Regression with GridSearchCV ---
        lasso_pipeline = Pipeline([
            ('scaler', scaler),
            ('lasso', Lasso(max_iter=10000))
        ])

        lasso_param_grid = {'lasso__alpha': alphas}

        lasso_grid_search = GridSearchCV(lasso_pipeline, lasso_param_grid, cv=10, scoring=nrmse_scorer, n_jobs=-1)
        lasso_grid_search.fit(X_transformed, y)

        # Get best Lasso model and its NRMSE
        best_lasso_model = lasso_grid_search.best_estimator_
        best_lasso_nrmse = -lasso_grid_search.best_score_

        # Store Lasso results
        results.append({
            'dataset': dataset_name,
            'pca': apply_pca,
            'model': 'Lasso',
            'best_alpha': lasso_grid_search.best_params_['lasso__alpha'],
            'nrmse': best_lasso_nrmse
        })

# Convert results to DataFrame for analysis
results_df = pd.DataFrame(results)
print(results_df)



  X = self.scaler_.transform(X.to_numpy())
  X = self.scaler_.transform(X.to_numpy())


      dataset    pca             model  best_alpha         nrmse
0   numerical  False  LinearRegression         NaN  2.353773e-01
1   numerical  False             Ridge        0.01  2.353774e-01
2   numerical  False             Lasso        0.01  2.353773e-01
3   numerical   True  LinearRegression         NaN  4.264154e-01
4   numerical   True             Ridge      100.00  4.256501e-01
5   numerical   True             Lasso      100.00  4.263713e-01
6    complete  False  LinearRegression         NaN  1.447483e+10
7    complete  False             Ridge      100.00  1.779988e-01
8    complete  False             Lasso      100.00  2.057392e-01
9    complete   True  LinearRegression         NaN  4.263776e-01
10   complete   True             Ridge      100.00  4.256105e-01
11   complete   True             Lasso      100.00  4.263335e-01
12       famd  False  LinearRegression         NaN  2.547594e-01
13       famd  False             Ridge       10.00  2.547271e-01
14       famd  False     

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
from math import sqrt
import prince
from sklearn.impute import SimpleImputer

# Load the dataset (assuming the original dataset is loaded into df)
df = pd.read_csv("train.csv")

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with the mean
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Apply FAMD (Factor Analysis of Mixed Data) after imputation
famd = prince.FAMD(n_components=10, random_state=42)
famd_df = famd.fit_transform(df.drop(columns=['SalePrice']))

# Define the target variable
y = df['SalePrice']

# Define datasets based on available features
datasets = {
    "numerical": df[['GrLivArea', 'TotalBsmtSF', 'YearBuilt', 'LotArea']],  # Select features
    "complete": df.drop(columns=['SalePrice']),  # All features except target
    "famd": famd_df  # FAMD-transformed data
}

# Define NRMSE scorer (Normalized RMSE)
def normalized_rmse(y_true, y_pred):
    rmse = sqrt(mean_squared_error(y_true, y_pred))  # RMSE calculation
    return rmse / y_true.mean()  # Normalize RMSE

nrmse_scorer = make_scorer(normalized_rmse, greater_is_better=False)

# Define Ridge and Lasso hyperparameters
alphas = [0.01, 0.1, 1, 10, 100]

# Initialize a list to store results
results = []

# Loop through datasets
for dataset_name, X in datasets.items():
    # Loop through PCA options (with and without PCA)
    for apply_pca in [False, True]:
        # Apply PCA if selected
        if apply_pca:
            # One-hot encode categorical columns before applying PCA
            X_transformed = pd.get_dummies(X, drop_first=True)
            pca = PCA(n_components=0.95)  # Retain 95% of variance
            X_transformed = pca.fit_transform(X_transformed)
            # Wrap the PCA-transformed data in a DataFrame
            X_transformed = pd.DataFrame(X_transformed)
        else:
            # If no PCA, one-hot encode categorical columns
            X_transformed = pd.get_dummies(X, drop_first=True)

        # Standard scaler to ensure features are on the same scale
        scaler = StandardScaler()

        # --- Polynomial Regression (Linear) ---
        # Create a pipeline for Polynomial Regression (degree=2 as an example)
        polynomial_pipeline = Pipeline([
            ('scaler', scaler),
            ('poly', PolynomialFeatures(degree=2)),
            ('linear', LinearRegression())
        ])

        # Evaluate Polynomial Regression with cross-validation
        poly_rmse_scores = -cross_val_score(polynomial_pipeline, X_transformed, y, cv=10, scoring=nrmse_scorer, n_jobs=-1)
        poly_rmse = poly_rmse_scores.mean()

        # Store Polynomial Regression results
        results.append({
            'dataset': dataset_name,
            'pca': apply_pca,
            'model': 'PolynomialRegression',
            'best_alpha': None,
            'nrmse': poly_rmse
        })

        # --- Ridge Regression with Polynomial Features ---
        ridge_poly_pipeline = Pipeline([
            ('scaler', scaler),
            ('poly', PolynomialFeatures(degree=2)),  # Same degree as polynomial regression
            ('ridge', Ridge())
        ])

        ridge_param_grid = {'ridge__alpha': alphas}

        ridge_poly_grid_search = GridSearchCV(ridge_poly_pipeline, ridge_param_grid, cv=10, scoring=nrmse_scorer, n_jobs=-1)
        ridge_poly_grid_search.fit(X_transformed, y)

        # Get best Ridge model with polynomial features and its NRMSE
        best_ridge_poly_model = ridge_poly_grid_search.best_estimator_
        best_ridge_poly_nrmse = -ridge_poly_grid_search.best_score_

        # Store Ridge with Polynomial features results
        results.append({
            'dataset': dataset_name,
            'pca': apply_pca,
            'model': 'RidgePoly',
            'best_alpha': ridge_poly_grid_search.best_params_['ridge__alpha'],
            'nrmse': best_ridge_poly_nrmse
        })

        # --- Lasso Regression with Polynomial Features ---
        lasso_poly_pipeline = Pipeline([
            ('scaler', scaler),
            ('poly', PolynomialFeatures(degree=2)),  # Same degree as polynomial regression
            ('lasso', Lasso(max_iter=10000))
        ])

        lasso_param_grid = {'lasso__alpha': alphas}

        lasso_poly_grid_search = GridSearchCV(lasso_poly_pipeline, lasso_param_grid, cv=10, scoring=nrmse_scorer, n_jobs=-1)
        lasso_poly_grid_search.fit(X_transformed, y)

        # Get best Lasso model with polynomial features and its NRMSE
        best_lasso_poly_model = lasso_poly_grid_search.best_estimator_
        best_lasso_poly_nrmse = -lasso_poly_grid_search.best_score_

        # Store Lasso with Polynomial features results
        results.append({
            'dataset': dataset_name,
            'pca': apply_pca,
            'model': 'LassoPoly',
            'best_alpha': lasso_poly_grid_search.best_params_['lasso__alpha'],
            'nrmse': best_lasso_poly_nrmse
        })

# Convert results to DataFrame for analysis
results_df = pd.DataFrame(results)
print(results_df)


  X = self.scaler_.transform(X.to_numpy())
  X = self.scaler_.transform(X.to_numpy())
