<a href="https://colab.research.google.com/github/ojumoolatimi/buenos-aires-housing-ml/blob/main/price_prediction(location%2C_area_and_neighborhood).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import OneHotEncoder

from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline

warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
def wrangle(filepath):
    """Load one real-estate CSV and return a cleaned, model-ready DataFrame.

    Steps, in order:

    1. Keep only rows whose ``place_with_parent_names`` contains
       "Capital Federal", whose ``property_type`` is "apartment", and whose
       ``price_aprox_usd`` is below 400,000.
    2. Keep only rows whose ``surface_covered_in_m2`` lies between the 10th
       and 90th percentile (inclusive) of the remaining rows.
    3. Split ``lat-lon`` into float ``lat`` / ``lon`` columns and extract
       ``neighborhood`` (4th ``|``-separated field of
       ``place_with_parent_names``).
    4. Drop the source columns of step 3 plus the columns listed below.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. The original row index is preserved (not reset).
    """
    df = pd.read_csv(filepath)

    # Each filter gets its own name. Previously the location mask was
    # overwritten by the property-type mask, so the "Capital Federal"
    # restriction was never applied.
    # na=False: a missing place name counts as "not in Capital Federal".
    mask_ba = df['place_with_parent_names'].str.contains('Capital Federal', na=False)
    mask_apt = df['property_type'] == 'apartment'
    mask_price = df['price_aprox_usd'] < 400000

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the raw data.
    df = df[mask_ba & mask_apt & mask_price].copy()

    # Trim surface-area outliers: keep the middle 80% (bounds inclusive).
    low, high = df['surface_covered_in_m2'].quantile([0.1, 0.9])
    mask_surf = df['surface_covered_in_m2'].between(low, high)
    df = df[mask_surf].copy()

    # "lat,lon" string -> two float columns.
    df[['lat', 'lon']] = df['lat-lon'].str.split(',', expand=True).astype('float')

    # 4th "|"-separated field of the place string is used as the neighborhood.
    df['neighborhood'] = df['place_with_parent_names'].str.split('|', expand=True)[3]

    columns_to_drop = [
        # source columns that have been replaced by derived features
        'lat-lon', 'place_with_parent_names',
        # columns with many missing values
        'floor', 'expenses',
        # low- and high-cardinality categorical columns
        'operation', 'property_type', 'properati_url', 'currency',
        # leakage: other encodings of the target price
        'price', 'price_aprox_local_currency', 'price_per_m2', 'price_usd_per_m2',
        # multicollinear with surface_covered_in_m2
        'rooms', 'surface_total_in_m2',
    ]
    df = df.drop(columns=columns_to_drop)

    return df


In [None]:
# Try `wrangle` on a single CSV first: print the resulting (rows, columns)
# and show the first rows of the cleaned frame.
frame1 = wrangle('/content/drive/MyDrive/dataset/buenos-aires-real-estate-1.csv')
print(frame1.shape)
frame1.head()

In [None]:
# glob returns matches in filesystem-dependent order; sort them so the files
# are wrangled and concatenated in the same order on every run.
files = sorted(glob('/content/drive/MyDrive/dataset/buenos-aires-real-estate-*.csv'))
files

In [None]:
# Apply the same cleaning function to every CSV: one DataFrame per file.
frames = list(map(wrangle, files))
frames[0]

In [None]:
# Stack the per-file frames into one dataset with a fresh 0..n-1 index.
df = pd.concat(frames, ignore_index=True)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
df.head()

In [None]:
# Percentage of missing values in each column.
df.isna().mean() * 100

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Column names in alphabetical order.
sorted(df.columns)

In [None]:
# Pairwise correlation between the numeric columns, with the target
# (price_aprox_usd) left out, drawn as an annotated heatmap.
corr = (
    df.select_dtypes('number')
    .drop(columns='price_aprox_usd')
    .corr()
)
sns.heatmap(corr, annot=True)

Split: select the feature matrix (`X_train`) and the target vector (`y_train`)

In [None]:
# Feature matrix: covered area, coordinates, and neighborhood.
features = ['surface_covered_in_m2', 'lat', 'lon', 'neighborhood']
X_train = df.loc[:, features]
X_train

In [None]:
# Target vector: approximate price in USD.
target = 'price_aprox_usd'
y_train = df.loc[:, target]
y_train

In [None]:
# Shapes of the feature matrix and the target vector, one per line.
print(X_train.shape, y_train.shape, sep='\n')

Baseline model: always predict the mean training price

In [None]:
# Mean of the training target; used as the constant baseline prediction.
y_mean = y_train.mean()
print(f'y_train_mean: {y_mean.round(2)}')

In [None]:
# Baseline "predictions": the mean price repeated once per training row.
y_pred_baseline = [y_mean for _ in range(len(y_train))]
y_pred_baseline[:5]

In [None]:
# Mean absolute error of the constant-mean baseline on the training data.
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print(f'y_train_mean: {y_mean.round(2)}')
print(f'mae_baseline {round(mae_baseline, 2)}')

Model training: one-hot encode, impute missing values, and fit a Ridge regression

In [None]:
# Pipeline: one-hot encode categorical features -> impute missing values
# -> Ridge regression. All steps use their default settings.
model = make_pipeline(
    OneHotEncoder(),
    SimpleImputer(),
    Ridge(),
)
model.fit(X_train, y_train)

Evaluation

In [None]:
# Predictions of the fitted pipeline on the data it was trained on;
# show the first five.
y_pred_training = model.predict(X_train)
y_pred_training[:5]

In [None]:
# Mean absolute error of the fitted model on the training data.
MAE_training = mean_absolute_error(y_train, y_pred_training)
print(f'MAE_training: {round(MAE_training, 2)}')

In [None]:
# Side-by-side view of the first ten actual prices and their predictions.
result_df = pd.DataFrame(
    {
        'actual': y_train.iloc[:10],
        'predicted': y_pred_training[:10],
    }
)
display(result_df)

Result Communication

In [None]:
def make_prediction(area, lat, lon, neighborhood):
    """Predict the price of a single apartment with the fitted ``model``.

    Parameters
    ----------
    area : number
        Value for the ``surface_covered_in_m2`` feature.
    lat, lon : number
        Values for the ``lat`` and ``lon`` features.
    neighborhood : str
        Value for the ``neighborhood`` feature.

    Returns
    -------
    str
        Message of the form ``"Predicted apartment price: $<price>"`` with
        the price rounded to two decimals.
    """
    data = {
        'surface_covered_in_m2': area,
        'lat': lat,
        'lon': lon,
        'neighborhood': neighborhood,
    }
    # One-row frame with the same column names the model was trained on.
    observation = pd.DataFrame(data, index=[0])
    # predict() returns one value per input row; take the single element so
    # the message reads "$123.45" rather than "$[123.45]".
    prediction = model.predict(observation).round(2)[0]

    return f"Predicted apartment price: ${prediction}"


In [None]:
# Example call with hand-picked inputs.
# NOTE(review): these coordinates and 'Tigre' look like they fall outside
# Capital Federal — confirm they are the intended example values.
make_prediction(area=80, lat=-60.51, lon=-45.93, neighborhood='Tigre')

Interactive dashboard where a user can supply feature values and receive a price prediction.

In [None]:
# One widget per make_prediction argument. Slider ranges span the values seen
# in the training data and start at the training mean; the dropdown lists the
# neighborhoods present in the training data, alphabetically.
interact(
    make_prediction,
    area=IntSlider(
        min=X_train['surface_covered_in_m2'].min(),
        max=X_train['surface_covered_in_m2'].max(),
        value=X_train['surface_covered_in_m2'].mean(),
    ),
    lat=FloatSlider(
        min=X_train['lat'].min(),
        max=X_train['lat'].max(),
        step=0.01,
        value=X_train['lat'].mean(),
    ),
    lon=FloatSlider(
        min=X_train['lon'].min(),
        max=X_train['lon'].max(),
        step=0.01,
        value=X_train['lon'].mean(),
    ),
    neighborhood=Dropdown(
        options=sorted(X_train['neighborhood'].unique()),
    ),
)
