# In [3]:
from pathlib import Path

import numpy as np
import pandas as pd

# Generate a synthetic apartment-price dataset and save it as a CSV file.
#
# Seed NumPy's legacy global RNG so every run reproduces the same dataset.
np.random.seed(42)

# Number of data points (rows) to generate
n = 500

# Generate synthetic features for apartment prices.
# np.random.randint excludes the upper bound, so the effective ranges are
# 40-199 m^2, 1-5 rooms, 1-39 years and floors 1-19.
# NOTE: the order of these calls determines which values are drawn from the
# seeded generator; reordering them changes the dataset.
square_area = np.random.randint(40, 200, n)  # in square meters
num_rooms = np.random.randint(1, 6, n)  # number of rooms
age_of_building = np.random.randint(1, 40, n)  # age of the building in years
floor_level = np.random.randint(1, 20, n)  # floor level of the apartment
city = np.random.choice(['Amman', 'Irbid', 'Aqaba'], n)  # categorical feature: city

# Coefficients of the linear pricing model (all amounts in JDs)
base_price = 300  # price per square meter
price_per_room = 5000  # additional price per room
price_per_year = -1000  # depreciation per year of building age
price_per_floor = 1000  # increase in price per floor level

# Multiplicative factor applied to the whole price depending on the city
city_factor = {'Amman': 1.5, 'Irbid': 1.0, 'Aqaba': 1.2}

# Per-row city multiplier as an explicit float array, so the element-wise
# product below does not rely on NumPy implicitly coercing a Python list.
city_multiplier = np.array([city_factor[c] for c in city], dtype=float)

# Generate the target variable (price).
# NOTE: the model is purely linear with no noise term, and the age
# depreciation can outweigh the other terms, so small, old apartments can get
# a negative price (e.g. 40 m^2, 1 room, 39 years, floor 1 gives -21000 before
# the city factor).
price = (
    square_area * base_price
    + num_rooms * price_per_room
    + age_of_building * price_per_year
    + floor_level * price_per_floor
) * city_multiplier

# Convert to DataFrame
df = pd.DataFrame({
    'Square_Area': square_area,
    'Num_Rooms': num_rooms,
    'Age_of_Building': age_of_building,
    'Floor_Level': floor_level,
    'City': city,
    'Price': price,
})

# Save the DataFrame to a CSV file. The target directory is created first
# because to_csv raises OSError when the parent directory does not exist.
file_path = '../datasets/apartment_prices.csv'
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(file_path, index=False)

# Display the first few rows of the dataset
# import ace_tools as tools; tools.display_dataframe_to_user(name="Synthetic Apartment Prices Dataset", dataframe=df)
