In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df_ = pd.read_csv('/Users/ayushyapare/Desktop/Ayushyas_Life/Work/Projects/Housing_Price_Prediction_Madrid/data/raw/houses_Madrid.csv')
target_column = 'buy_price'


In [2]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('/Users/ayushyapare/Desktop/Ayushyas_Life/Work/Projects/Snippets')

from DataFrame_Analysis import analyze_dataframe

In [3]:
df = df_[['sq_mt_built', 'sq_mt_useful', 'n_rooms',
       'n_bathrooms', 'n_floors', 'sq_mt_allotment','is_exact_address_hidden',
        'portal', 'floor', 'is_floor_under', 'door',
       'neighborhood_id', 'operation', 'rent_price', 'rent_price_by_area',
       'is_rent_price_known', 'buy_price', 'buy_price_by_area',
       'is_buy_price_known', 'house_type_id', 'is_renewal_needed',
       'is_new_development', 'built_year', 'has_central_heating',
       'has_individual_heating', 'are_pets_allowed', 'has_ac',
       'has_fitted_wardrobes', 'has_lift', 'is_exterior', 'has_garden',
       'has_pool', 'has_terrace', 'has_balcony', 'has_storage_room',
       'is_furnished', 'is_kitchen_equipped', 'is_accessible',
       'has_green_zones', 'energy_certificate', 'has_parking',
       'has_private_parking', 'has_public_parking',
       'is_parking_included_in_price', 'parking_price', 'is_orientation_north',
       'is_orientation_west', 'is_orientation_south', 'is_orientation_east']]
      

In [4]:
# Perform EDA now
# analyze_dataframe(df)

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Assuming the dataset is already loaded into df
# Define features and target variable
X = df.drop(target_column, axis=1)
y = df[target_column]

# Identify numerical and categorical columns
# Identify numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Define preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Define preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [6]:

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Train and evaluate models
for name, model in models.items():
    # Create a pipeline that includes preprocessing and the model
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f'{name} - RMSE: {rmse:.2f}, R2: {r2:.2f}')

Linear Regression - RMSE: 267375.11, R2: 0.87
Ridge Regression - RMSE: 267383.03, R2: 0.87
Lasso Regression - RMSE: 267358.95, R2: 0.87


KeyboardInterrupt: 

In [2]:
#conda install -c conda-forge pycaret

Channels:
 - conda-forge
 - defaults
 - anaconda
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3/envs/housing_project

  added / updated specs:
    - pycaret


The following packages will be UPDATED:

  ca-certificates    pkgs/main::ca-certificates-2024.3.11-~ --> conda-forge::ca-certificates-2024.7.4-hf0a4a13_0 
  openssl              pkgs/main::openssl-3.0.14-h80987f9_0 --> conda-forge::openssl-3.3.1-hfb2fe0b_1 

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi            pkgs/main/osx-arm64::certifi-2024.6.2~ --> conda-forge/noarch::certifi-2024.6.2-pyhd8ed1ab_0 



Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pycaret.classification import *

ImportError: cannot import name '_print_elapsed_time' from 'sklearn.utils' (/opt/anaconda3/envs/housing_project/lib/python3.9/site-packages/sklearn/utils/__init__.py)

In [7]:
#import mlflow
#import mlflow.sklearn
from pycaret.regression import setup, compare_models, pull, save_model, load_model

# Setup the environment in PyCaret
regression_setup = setup(
    data=df,
    target = target_column,
    session_id=9,
    #max_encoding_ohe=600, # columns with 600 or less categories will be One-hot encoded ELSE target encoding
    #rare_to_value=0.008, # Categories with less than 0.008 (0.8%) of the data will be grouped into a new category (Other)
    #rare_value='Other',
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    transformation = True,
    transformation_method = 'yeo-johnson',
    #experiment_name='Clsfctn_tel_cust_ayushya_(dm)',
    log_experiment = False,
    normalize=True,  # True, False
    normalize_method='zscore',  # 'zscore', 'minmax', 'maxabs', 'robust'
    n_jobs=-1)

# Start an MLflow run
mlflow.start_run()

# Log the parameters and metrics with MLflow
best_model = compare_models()

# Log the best model with MLflow
mlflow.sklearn.log_model(best_model, "best_model")

# Get the comparison results
comparison_results = pull()

# Log metrics
for index, row in comparison_results.iterrows():
    mlflow.log_metric(f"{row['Model']}_RMSE", row['RMSE'])
    mlflow.log_metric(f"{row['Model']}_R2", row['R2'])

# End the MLflow run
mlflow.end_run()

# Save the best model
save_model(best_model, 'best_regression_model')

# Load the model later if needed
# loaded_model = load_model('best_regression_model')

# Predict on the test set
predictions = predict_model(best_model)
print(predictions)

# Use Ngrok to expose the MLflow UI
from pyngrok import ngrok

# Start MLflow UI
get_ipython().system_raw("mlflow ui --port 5000 &")

# Expose the MLflow UI using Ngrok
ngrok_tunnel = ngrok.connect(5000)
print("MLflow UI is running at:", ngrok_tunnel.public_url)

# Open the MLflow UI in your browser
import webbrowser
webbrowser.open(ngrok_tunnel.public_url)

ImportError: cannot import name 'if_delegate_has_method' from 'sklearn.utils.metaestimators' (/opt/anaconda3/envs/ml_project_environment/lib/python3.9/site-packages/sklearn/utils/metaestimators.py)