In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/housing/data5.csv
/kaggle/input/housing/data4.csv
/kaggle/input/housing/data2.csv
/kaggle/input/housing/data3.csv
/kaggle/input/housing/data1.csv
/kaggle/input/housing/test_features.csv


In [2]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from glob import glob

In [3]:
def wrangle(filepath:str)->pd.DataFrame:
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    # Split "lat-lon" column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)

    # Get place name
    df["neighborhood"] = df["place_with_parent_names"].str.split("|", expand=True)[3]
    df.drop(columns="place_with_parent_names", inplace=True)
    
    df.drop(columns=["floor", "expenses"], inplace=True)
    df.drop(columns=["operation", "property_type", "currency", "properati_url"],inplace=True)
    df.drop(columns=['price',
 'price_aprox_local_currency',
 'price_per_m2',
 'price_usd_per_m2'], inplace=True)
    #dropcolums with multiple linearlity
    df.drop(columns=["surface_total_in_m2", "rooms"], inplace=True)
    
    return df

In [4]:
frames=[]
files = glob('/kaggle/input/housing/data*.csv')
for i in files:
    df = wrangle(i)
    frames.append(df)
    


In [5]:
df = pd.concat(frames, ignore_index=True)

In [6]:
features = ["lat", "lon", "surface_covered_in_m2", "neighborhood"]
target = "price_aprox_usd"

In [7]:
X_train  = df[features]
y_train = df[target]

In [8]:
y_pred_baseline = [y_train.mean()]*len(y_train)
mae = mean_absolute_error(y_train, y_pred_baseline)

print("Baseline MAE:", mae)

Baseline MAE: 44860.10834274133


In [9]:
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge()
)

In [10]:
model.fit(X_train, y_train)

In [11]:
y_pred_training = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred_training)

print("Training MAE:", mae)

Training MAE: 24207.107190330353


In [12]:
X_test = pd.read_csv("/kaggle/input/housing/test_features.csv")[features]

In [13]:
y_test = pd.Series(model.predict(X_test))

In [14]:
y_test

0       231122.403569
1       162572.942392
2        68477.949626
3        63521.438989
4       105694.463885
            ...      
1481     99448.896609
1482    174820.183967
1483    127364.495816
1484    143104.250505
1485    118673.708770
Length: 1486, dtype: float64

In [15]:
def make_prediction(area, lat, lon, neighborhood):
    data = {
    "lat":lat,
    "lon":lon,
    "surface_covered_in_m2":area,  
    "neighborhood":neighborhood
    }
    df = pd.DataFrame(data,index=[0])
    prediction = model.predict(df)
    return f"Predicted apartment price: ${prediction[0].round(2)}"

In [16]:
make_prediction(-34.60,-58.46,49,"Villa Crespo")

'Predicted apartment price: $5906579.59'