In [None]:
!pip install prince

Collecting prince
  Downloading prince-0.13.1-py3-none-any.whl.metadata (639 bytes)
Downloading prince-0.13.1-py3-none-any.whl (415 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.8/415.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: prince
Successfully installed prince-0.13.1


In [9]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from data_processor import DataProcessor

# Load and preprocess the dataset
dp = DataProcessor(pd.read_csv("train.csv"))
X_num = dp.numerical_data().drop(columns=['SalePrice'])
X_cat = dp.categorical_data()
X = pd.concat([X_num, X_cat], axis=1)
y = dp.numerical_data()['SalePrice']

# Define a function to create pipelines
def create_pipeline(model, use_pca=False):
    steps = []

    # Preprocessor for numerical data (scaling)
    preprocessor = ColumnTransformer(
        transformers=[('scaler', StandardScaler(), X_num.columns)],
        remainder='passthrough'  # Keep one-hot-encoded categorical features
    )
    steps.append(('preprocessor', preprocessor))

    # Optionally add PCA
    if use_pca:
        steps.append(('pca', PCA(n_components=0.95)))

    # Add the model
    steps.append(('model', model))

    return Pipeline(steps)

# Models and hyperparameters
alphas = [0.01, 0.1, 1, 10, 100]
models = [
    ('LinearRegression', LinearRegression(), None),
    ('Ridge', Ridge(), {'model__alpha': alphas}),
]

# Store results
results = []

# Train models on datasets with/without PCA
for use_pca in [False, True]:
    for model_name, model, param_grid in models:
        # Create pipeline
        pipeline = create_pipeline(model, use_pca)

        if param_grid:
            # Add grid search if hyperparameters are provided
            grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring=scorer, n_jobs=-1)
            grid_search.fit(X, y)
            best_model = grid_search.best_estimator_
            rmse = -grid_search.best_score_
            best_params = grid_search.best_params_
        else:
            # Use cross_val_score if no hyperparameters to tune
            scores = cross_val_score(pipeline, X, y, cv=10, scoring=scorer, n_jobs=-1)
            rmse = -scores.mean()
            best_params = None

        # Calculate normalized RMSE
        nrmse = rmse / y.mean()

        # Append results
        results.append({
            'Dataset': 'Train',  # Change this to 'Test' for test dataset
            'PCA': use_pca,
            'Model': model_name,
            'Best Params': best_params,
            'RMSE': rmse,
            'NRMSE': nrmse
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort results for better readability
results_df = results_df.sort_values(by='NRMSE').reset_index(drop=True)

# Display the results
print(results_df)


  Dataset    PCA             Model            Best Params          RMSE  \
0   Train  False             Ridge  {'model__alpha': 100}  3.088792e+04   
1   Train   True             Ridge  {'model__alpha': 100}  3.120703e+04   
2   Train   True  LinearRegression                   None  3.198423e+04   
3   Train  False  LinearRegression                   None  2.648838e+14   

          NRMSE  
0  1.707258e-01  
1  1.724896e-01  
2  1.767854e-01  
3  1.464084e+09  


In [18]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer
from math import sqrt
from data_processor import DataProcessor

# Define a function to calculate NRMSE (Normalized Root Mean Squared Error)
def normalized_rmse(y_true, y_pred):
    rmse = sqrt(((y_true - y_pred) ** 2).mean())
    return rmse / y_true.mean()

# Load and preprocess the dataset
dp = DataProcessor(pd.read_csv("train.csv"))
X_num = dp.numerical_data().drop(columns=['SalePrice'])
X_cat = dp.categorical_data()
X = pd.concat([X_num, X_cat], axis=1)
y = dp.numerical_data()['SalePrice']

# Define a function to create pipelines
def create_pipeline(model, use_pca=False, degree=2):
    steps = []

    # Preprocessor for numerical data (scaling)
    preprocessor = ColumnTransformer(
        transformers=[('scaler', StandardScaler(), X_num.columns)],
        remainder='passthrough'  # Keep one-hot-encoded categorical features
    )
    steps.append(('preprocessor', preprocessor))

    # Add polynomial features for models like Polynomial Regression
    if degree > 1:
        steps.append(('poly', PolynomialFeatures(degree=degree, include_bias=False)))

    # Optionally add PCA
    if use_pca:
        steps.append(('pca', PCA(n_components=0.95)))

    # Add the model (Linear Regression for Polynomial Regression)
    steps.append(('model', model))

    return Pipeline(steps)

# Models and hyperparameters
degrees = [1, 2]  # Polynomial degrees to try (scaled down)
models = [
    ('PolynomialRegression', LinearRegression(), None),  # Polynomial regression with Linear Regression
]

# Store results
results = []

# Train models on datasets with/without PCA
for use_pca in [False]:  # Turn off PCA for now (reduce complexity)
    for degree in degrees:  # Loop through polynomial degrees (scaled down)
        for model_name, model, param_grid in models:
            # Create pipeline
            pipeline = create_pipeline(model, use_pca, degree)

            # Use cross_val_score if no hyperparameters to tune
            scores = cross_val_score(pipeline, X, y, cv=5, scoring=make_scorer(normalized_rmse, greater_is_better=False), n_jobs=1)
            rmse = -scores.mean()  # Cross-validation RMSE (negative because cross_val_score returns negative values)
            best_params = None

            # Calculate normalized RMSE
            nrmse = rmse / y.mean()

            # Append results
            results.append({
                'Dataset': 'Train',  # Change this to 'Test' for test dataset
                'PCA': use_pca,
                'Polynomial Degree': degree,  # Show polynomial degree
                'Model': model_name,
                'Best Params': best_params,
                'RMSE': rmse,
                'NRMSE': nrmse
            })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort results for better readability
results_df = results_df.sort_values(by='NRMSE').reset_index(drop=True)

# Display the results
print(results_df)



  Dataset    PCA  Polynomial Degree                 Model Best Params  \
0   Train  False                  2  PolynomialRegression        None   
1   Train  False                  1  PolynomialRegression        None   

           RMSE         NRMSE  
0  1.801977e-01  9.960009e-07  
1  8.086131e+08  4.469422e+03  
