In [12]:
import pandas as pd
import re
import unidecode

In [18]:
# 1) Load the CSV of raw scraped listings (produced by web_scraping.ipynb).
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
raw_csv_path = r'C:\Users\DAV\Documents\Python\Python_Project\House_price_project\data\raw\8_1_2024.csv'
df = pd.read_csv(raw_csv_path)

In [19]:
# 2) Delete the first column when it is the unnamed index column read back from the CSV.
# pandas labels a header-less column "Unnamed: <n>"; checking the label keeps this
# cell idempotent, so running it again cannot drop a real data column.
if str(df.columns[0]).startswith('Unnamed'):
    df.drop(columns=df.columns[0], inplace=True)

In [20]:
# 3) Fill missing values with 0 in each numeric feature column.
for feature_col in ('square_meters', 'rooms', 'parking', 'baths'):
    df[feature_col] = df[feature_col].fillna(0)

In [21]:
# 4) Cast the numeric feature columns to integers.
# astype(int) raises on NaN, so missing values must already have been filled.
for int_col in ['square_meters', 'rooms', 'parking', 'baths']:
    df[int_col] = df[int_col].astype(int)

In [22]:
# 5) Drop listings that repeat the same title, square_meters, rooms, parking and baths.
dedup_keys = ['title', 'square_meters', 'rooms', 'parking', 'baths']

# Row count before de-duplication.
initial_count = len(df)

# Keep the first row of every key combination; later repeats are removed in place.
df.drop_duplicates(subset=dedup_keys, inplace=True)

# Row count after de-duplication, and how many rows were dropped.
final_count = len(df)
removed_count = initial_count - final_count

# Report the number of removed rows.
print(f"\n{removed_count} duplicate rows were removed.")


33 duplicate rows were removed.


In [23]:
# 6) Convert price to integer, removing symbols ("[", "]", "'", ",", "$", "Q") and applying a conversion rate.

# Default conversion rate ($ to Q); adjust to the current exchange rate.
conversion_rate = 7.8

def convert_price(price, rate=None):
    """Convert a scraped price string into an integer amount in quetzales.

    The scraper stores prices as a stringified list such as "['Q965,000']"
    or "['$229,000']". Only the first entry of the list is used.

    Parameters
    ----------
    price : str or any
        Raw price text. Non-string values (e.g. NaN/None for a missing
        price) are treated as "no price".
    rate : float, optional
        Dollars-to-quetzales rate. Defaults to the module-level
        ``conversion_rate`` read at call time.

    Returns
    -------
    int or None
        Price in quetzales, or None when the value is missing or carries
        neither a "Q" nor a "$" marker.

    Raises
    ------
    ValueError
        If the amount is not a plain integer once the currency marker and
        thousands separators are removed.
    """
    # A missing price arrives as NaN/None, which has no string methods.
    if not isinstance(price, str):
        return None
    if rate is None:
        rate = conversion_rate

    # Strip the list wrapper and surrounding whitespace, then keep only the
    # first quoted entry in case the list holds several prices.
    first_price = price.strip("[]' \t\r\n").split("'")[0]
    amount = first_price.replace(',', '')  # Drop thousands separators.

    if 'Q' in first_price:
        # Already in quetzales.
        return int(amount.replace('Q', ''))
    if '$' in first_price:
        # Dollars: convert with the exchange rate; int() truncates any fraction.
        usd_value = int(amount.replace('$', ''))
        return int(usd_value * rate)
    return None  # Unrecognised format: no currency marker.

# Run convert_price over every raw price and store the result in 'price_in_quetzales'.
raw_prices = df['price']
df['price_in_quetzales'] = raw_prices.apply(convert_price)

In [42]:
# 7) Normalize text function.
def normalize_text(text):
    """Return a lower-case, ASCII, punctuation-free version of ``text``.

    Steps, in order:
      1. Transliterate to ASCII with unidecode and lower-case the result.
      2. Replace punctuation with spaces.
      3. Force a single space between "zona" and its number, and between a
         number and a letter that directly follows it.
      4. Expand "blvd" to "boulevard".
      5. Collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    text : str or any
        Raw title/description. Non-string values (e.g. NaN) yield ''.

    Returns
    -------
    str
        The normalized text, or '' when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''  # Return empty string if not a string.

    # Transliterate and lower-case FIRST so every rule below sees the final
    # alphabet: unidecode can emit punctuation or upper-case ASCII (e.g. an
    # inverted "¡" becomes "!"), and the "zona" rule is case-sensitive.
    text = unidecode.unidecode(text).lower()

    # Replace punctuation with spaces. NOTE: inside the class, `!-/` is a range
    # (ASCII '!' through '/'), so quotes, '#', '$', '%', '&', '+' and '-' are
    # blanked as well as the characters listed individually.
    text = re.sub(r"[.,|()*!-/•:]", " ", text)
    text = re.sub(r'\b(zona)\s*(\d+)', r'\1 \2', text)  # "zona10" -> "zona 10".
    text = re.sub(r'(\d+)([a-zA-Z])', r'\1 \2', text)  # "3habitaciones" -> "3 habitaciones".
    text = text.replace("blvd", "boulevard")  # Expand the abbreviation.

    # Collapse whitespace last so nothing above can leave stray spaces behind.
    return re.sub(r'\s+', ' ', text).strip()

# Add normalized copies of the listing title and of the advert description.
for raw_col, normalized_col in {'title': 'normalized_title',
                                'adv_description': 'normalized_description'}.items():
    df[normalized_col] = df[raw_col].apply(normalize_text)

In [None]:
# Preview the first five rows of the DataFrame.
df.head(n=5)

In [67]:
# Show the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,title,price,square_meters,rooms,parking,baths,location,img_url,url,adv_description
0,Casas en venta,"['Q965,000']",138.0,3.0,2.0,3.0,Villa Canales,https://photos.encuentra24.com/t_or_fh_m/f_aut...,https://www.encuentra24.com/guatemala-es/biene...,Se venden bonitas casas en Altos de Village ki...
1,Venta apartamento zona 10,"['Q560,000']",23.0,1.0,,1.0,Zona 10,https://photos.encuentra24.com/t_or_fh_m/f_aut...,https://www.encuentra24.com/guatemala-es/biene...,"Precio de venta: Q.560,000 Mantenimiento: Q...."
2,Casa en venta el Residenciales Lo de Valdez,"['Q1,200,000']",320.0,4.0,3.0,3.0,San José Pinula,https://photos.encuentra24.com/t_or_fh_m/f_aut...,https://www.encuentra24.com/guatemala-es/biene...,"Casa en venta Residenciales A lo de Valdez, km..."
3,Vendo Casa para estrenar 4 habitaciones km 14...,"['$229,000']",256.0,4.0,2.0,3.0,Santa Catarina Pinula,https://photos.encuentra24.com/t_or_fh_m/f_aut...,https://www.encuentra24.com/guatemala-es/biene...,Preciosas Casa de 4 Habitaciones en desniveles...
4,Apartamentos en venta Cendana zona 9,"['Q1,159,541']",77.0,2.0,1.0,2.0,Zona 9,https://photos.encuentra24.com/t_or_fh_m/f_aut...,https://www.encuentra24.com/guatemala-es/biene...,"Apartamentos en venta de 1 y 2 habitaciones, C..."
