In [6]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error as mse, r2_score

df = pd.read_csv('Final_Cleaned.csv', sep=';')
# Filter out columns that exist in the dataset and need to be dropped
columns_to_drop_actual = ['road', 'seller', 'latitude', 'longitude', 'full_address', 'zip_address',
                          'link to seller', 'city_3', 'zip address', 'city_2', 'zip',
                          'extra', 'days at current seller']
columns_to_drop_actual = [col for col in columns_to_drop_actual if col in df.columns]

# Drop the identified columns
df_clean = df.drop(columns=columns_to_drop_actual)

# Rest of the preprocessing steps
new_column_names = {column: column.lower().replace(" ", "_") for column in df_clean.columns}
df_clean.rename(columns=new_column_names, inplace=True)
df_clean.loc[df_clean['land_area'].isna(), 'land_area'] = df_clean['area']
df_clean = df_clean.dropna().reset_index(drop=True)
df_final = df_clean.drop(columns=['city', 'municipal', 'distance_to_copenhagen', 'distance_to_aarhus', 
                                  'distance_to_odense', 'distance_to_aalborg','sqm_price'])
building_type_counts = df_final['property_type'].value_counts()
top_building_types = building_type_counts.head(4).index.tolist()
df_final = df_final[df_final['property_type'].isin(top_building_types)]
dummy_1 = pd.get_dummies(df_final['property_type'], prefix='type', drop_first=True)
dummy_2 = pd.get_dummies(df_final['energy'], prefix='energy', drop_first=True)
dummy_3 = pd.get_dummies(df_final['closest_city'], prefix='close', drop_first=True)
df_final = pd.concat([df_final, dummy_1, dummy_2, dummy_3], axis=1)
df_final = df_final.drop(columns=['energy', 'property_type', 'closest_city'])
columns_to_remove_outliers = ['price', 'area', 'land_area', 'rooms', 'ejerudg', 'days_active']
for column in columns_to_remove_outliers:
    Q1 = df_final[column].quantile(0.25)
    Q3 = df_final[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_final = df_final[(df_final[column] >= lower_bound) & (df_final[column] <= upper_bound)]
df_final = df_final.drop_duplicates()

# Define target and features
y = df_final['price']
X = df_final.drop(columns=['price'])

# Split the data
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Polynomial feature creation and scaling
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_dev = poly.fit_transform(X_dev)
X_poly_test = poly.transform(X_test)
scaler = StandardScaler()
X_poly_dev_scaled = scaler.fit_transform(X_poly_dev)
X_poly_test_scaled = scaler.transform(X_poly_test)

# Defining the models dictionary which includes the regression models and their hyperparameters
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': np.logspace(-6, 6, 13)
        }
    },
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': np.logspace(-6, 6, 13)
        }
    },
    'Elastic Net': {
        'model': ElasticNet(),
        'params': {
            'alpha': np.logspace(-6, 6, 13),
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
        }
    }
}

# Given the code you provided, now I'll skip the data processing since it's already completed and 
# proceed directly with the model training, hyperparameter tuning, and evaluation

train_rmse_results = {}
test_rmse_results = {}
r2_train_results = {}
r2_test_results = {}



# Storing model details for later analysis
model_details = {}

# Wrap the models.items() with tqdm to display the progress bar
for name, model_info in tqdm(models.items(), desc="Training Models", unit="model"):
    if model_info['params']:
        gs = GridSearchCV(model_info['model'], model_info['params'], cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
        gs.fit(X_poly_dev_scaled, y_dev)
        best_model = gs.best_estimator_
        best_alpha = gs.best_params_['alpha']
    else:
        best_model = model_info['model']
        best_model.fit(X_poly_dev_scaled, y_dev)
        best_alpha = None  # For Linear Regression
    
    # Evaluation
    y_train_pred = best_model.predict(X_poly_dev_scaled)
    train_rmse = np.sqrt(mse(y_dev, y_train_pred))
    r2_train = r2_score(y_dev, y_train_pred)
    y_test_pred = best_model.predict(X_poly_test_scaled)
    test_rmse = np.sqrt(mse(y_test, y_test_pred))
    r2_test = r2_score(y_test, y_test_pred)
    
    # Storing results for later analysis
    train_rmse_results[name] = train_rmse
    test_rmse_results[name] = test_rmse
    r2_train_results[name] = r2_train
    r2_test_results[name] = r2_test
    
    # Storing model details
    model_details[name] = {
        'Weights': best_model.coef_,
        'Alpha (if applicable)': best_alpha,
    }

# Return the model details for analysis
model_details
# Extracting weights and alpha
weights = best_model.coef_
if 'alpha' in gs.best_params_:
    best_alpha = gs.best_params_['alpha']
else:
    best_alpha = None  # For Linear Regression

# train_rmse_results, test_rmse_results, r2_train_results, r2_test_results
print(train_rmse_results, test_rmse_results, r2_train_results, r2_test_results)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
Training Models: 100%|██████████| 4/4 [10:02<00:00, 150.64s/model]

{'Linear Regression': 635500.8930906449, 'Lasso': 643170.9036516276, 'Ridge': 635482.7214652431, 'Elastic Net': 643000.7545314424} {'Linear Regression': 613605.0626204548, 'Lasso': 620847.0334965013, 'Ridge': 613478.9135530685, 'Elastic Net': 620916.3487885406} {'Linear Regression': 0.7843696239032598, 'Lasso': 0.7791332257856263, 'Ridge': 0.7843819552734945, 'Elastic Net': 0.7792500697417685} {'Linear Regression': 0.7947134353310672, 'Lasso': 0.7898391194057212, 'Ridge': 0.7947978350475862, 'Elastic Net': 0.7897921894045328}





In [7]:
print("Training RSME: ", train_rmse_results)
print("Test RSME: ", test_rmse_results)
print("Training R^2: ", r2_train_results)
print("Test R^2: ", r2_test_results)

Training RSME:  {'Linear Regression': 635500.8930906449, 'Lasso': 643170.9036516276, 'Ridge': 635482.7214652431, 'Elastic Net': 643000.7545314424}
Test RSME:  {'Linear Regression': 613605.0626204548, 'Lasso': 620847.0334965013, 'Ridge': 613478.9135530685, 'Elastic Net': 620916.3487885406}
Training R^2:  {'Linear Regression': 0.7843696239032598, 'Lasso': 0.7791332257856263, 'Ridge': 0.7843819552734945, 'Elastic Net': 0.7792500697417685}
Test R^2:  {'Linear Regression': 0.7947134353310672, 'Lasso': 0.7898391194057212, 'Ridge': 0.7947978350475862, 'Elastic Net': 0.7897921894045328}


In [8]:
model_details

{'Linear Regression': {'Weights': array([ 1.84245842e+06,  1.56747947e+06,  5.51034506e+05, -8.26120005e+05,
         -3.53628645e+05, -9.18184106e+06,  9.25515127e+06,  1.13381739e+06,
         -5.53557754e+05,  3.20459342e+05, -1.71629185e+16,  2.27614942e+17,
          4.17800166e+17,  1.21082083e+17,  4.43933898e+17, -1.39225512e+17,
          2.15430250e+17,  3.14242766e+17,  5.42736303e+16, -2.16135018e+17,
         -1.63823864e+05, -2.15910032e+04, -1.46746393e+05,  2.18751323e+05,
         -2.78316500e+04,  5.74957813e+04,  8.79384839e+04, -2.88025106e+05,
          8.55843232e+04, -1.17980141e+05, -9.50904161e+05, -4.28218356e+05,
         -1.01550376e+06, -1.09354400e+06, -8.19776000e+05, -5.14016000e+05,
         -3.81012000e+05,  5.55200000e+04,  9.43440000e+04,  1.19544000e+05,
          1.43296000e+05,  4.67040000e+04, -6.18400000e+04,  4.93120000e+04,
         -4.35120000e+04, -8.23680000e+04, -1.54799300e+06,  5.79520000e+04,
         -8.08870000e+04, -2.56554000e+05,  

In [9]:
weights

array([ 6.71278973e+05, -6.68203398e+05, -1.64924317e+05,  1.18791954e+06,
       -2.06425233e+05, -1.14622788e+06, -4.40115004e+05, -1.22929501e+05,
       -5.24588297e+05,  2.84497836e+05,  3.33054963e+05,  1.28998084e+04,
       -1.19649056e+05, -1.78652978e+05, -1.96527879e+04,  5.26946345e+03,
        2.35907803e+04,  3.66278697e+05, -4.35716221e+04,  1.48508839e+05,
       -1.20085694e+05, -6.99890969e+04, -2.35192976e+05,  2.70215817e+05,
       -2.14306944e+04,  1.11400463e+05,  1.19652718e+05,  6.36482860e+05,
        1.17800987e+05, -1.52866761e+05, -9.18703047e+05, -2.73629642e+05,
       -7.44934511e+05, -8.37069358e+05, -6.45639169e+05, -3.98929126e+05,
       -2.97166571e+05,  8.92621053e+04,  1.14947425e+05,  1.43132286e+05,
        1.50294376e+05,  6.41966657e+04, -2.49656272e+03,  4.81687779e+04,
       -2.05781299e+04, -4.90248714e+04,  7.14868710e+05, -2.18878558e+03,
       -7.48794746e+04, -2.62311441e+05,  3.48246713e+04,  1.62960903e+04,
       -1.60701579e+04, -

In [10]:
best_alpha

0.0001