# Predicting Apartment Prices in Mexico City 🇲🇽

In [1]:
# Import libraries here
import warnings

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from glob import glob

# Prepare Data
## Import

Write a `wrangle` function that takes the name of a CSV file as input and returns a DataFrame. The function should do the following steps:

1. Subset the data in the CSV file and return only apartments in Mexico City (`"Distrito Federal"`) that cost less than \$100,000.
2. Remove outliers by trimming the bottom and top 10\% of properties in terms of `"surface_covered_in_m2"`.
3. Create separate `"lat"` and `"lon"` columns.
4. Mexico City is divided into [15 boroughs](https://en.wikipedia.org/wiki/Boroughs_of_Mexico_City). Create a `"borough"` feature from the `"place_with_parent_names"` column.
5. Drop columns that are more than 50\% null values.
6. Drop columns containing low- or high-cardinality categorical values. 
7. Drop any columns that would constitute leakage for the target `"price_aprox_usd"`.
8. Drop any columns that would create issues of multicollinearity. 


In [13]:
# Build your `wrangle` function
def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Distrito Federal", less than 100,000
    mask_ba = df["place_with_parent_names"].str.contains("Distrito Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 100_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    # Split "lat-lon" column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)

    # Get place name
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]
    df.drop(columns="place_with_parent_names", inplace=True)
    
    # Drop features with high null counts
    df.drop(columns=["floor","expenses","rooms"], inplace=True)

    # Drop low and high cardinalityy categorical variables
    df.drop(columns=["operation","property_type","currency", "properati_url"], inplace=True)
    
     # Drop leaky columns
    df.drop(columns=['price','price_aprox_local_currency','price_per_m2','price_usd_per_m2'], inplace=True)
    
    df.drop(columns=['surface_total_in_m2'],inplace=True)
    
    return df

In [15]:
wrangle("data/mexico-city-real-estate-1.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'data/mexico-city-real-estate-1.csv'

In [14]:
# Use this cell to test your wrangle function and explore the data
df.head()

NameError: name 'df' is not defined

Use glob to create the list `files`. It should contain the filenames of all the Mexico City real estate CSVs in the `./data` directory, except for `mexico-city-test-features.csv`.

In [11]:
files = glob("/Users/macos/Documents/GitHub/wqu_projects/housing_buenos_aires/buenos-aires-real-estate-*.csv")
files

['/Users/macos/Documents/GitHub/wqu_projects/housing_buenos_aires/buenos-aires-real-estate-3.csv',
 '/Users/macos/Documents/GitHub/wqu_projects/housing_buenos_aires/buenos-aires-real-estate-2.csv',
 '/Users/macos/Documents/GitHub/wqu_projects/housing_buenos_aires/buenos-aires-real-estate-1.csv',
 '/Users/macos/Documents/GitHub/wqu_projects/housing_buenos_aires/buenos-aires-real-estate-5.csv',
 '/Users/macos/Documents/GitHub/wqu_projects/housing_buenos_aires/buenos-aires-real-estate-4.csv']

In [12]:
frames = []
for file in files:
    df = wrangle(file)
    frames.append(df)

ValueError: Columns must be same length as key

In [10]:
df = pd.concat(frames, ignore_index = True)
print(df.info())
df.head()

ValueError: No objects to concatenate