In [2]:
import pandas as pd
import numpy as np

In [4]:
# Function to generate synthetic data
def generate_construction_cost_data(num_records=1000, seed=42):
    """Generate a synthetic construction-cost dataset.

    Every row describes one fictional project. The feature columns are drawn
    independently at random, and ``Total_Construction_Cost`` is then derived
    deterministically from them as::

        area * (material + labor) * complexity_factor * location_factor
            * (1 + inflation_rate / 100) + transportation_cost

    rounded to 2 decimals.

    Parameters
    ----------
    num_records : int, default 1000
        Number of rows to generate.
    seed : int or None, default 42
        Value handed to ``np.random.seed``. The default reproduces exactly the
        data this function produced when the seed was hard-coded; pass a
        different value to obtain a different sample.

    Returns
    -------
    pandas.DataFrame
        ``num_records`` rows with the columns ``Project_ID``,
        ``Project_Type``, ``Total_Area_SqFt``, ``Number_of_Floors``,
        ``Material_Cost_per_SqFt``, ``Labor_Cost_per_SqFt``,
        ``Project_Duration_Months``, ``Location_Type``,
        ``Transportation_Cost``, ``Inflation_Rate``, ``Complexity`` and
        ``Total_Construction_Cost``.

    Notes
    -----
    Calling this function reseeds NumPy's *global* legacy random generator.
    That side effect is kept on purpose so that code running afterwards sees
    the same random state as before this function gained a ``seed`` argument.
    """
    # Category -> cost multiplier. The key order is also the option order
    # given to np.random.choice, so reordering keys changes the labels that a
    # given seed produces.
    complexity_multipliers = {"Simple": 1.0, "Medium": 1.2, "Complex": 1.5}
    location_multipliers = {"Urban": 1.2, "Semi-Urban": 1.1, "Rural": 1.0}

    np.random.seed(seed)

    # Generate synthetic features.
    # NOTE: the draws consume one shared random stream, so their order below
    # determines the output for a given seed -- do not reorder them.
    project_ids = np.arange(1, num_records + 1)
    project_types = np.random.choice(["Residential", "Commercial", "Mixed-use"], size=num_records)
    total_area = np.random.randint(500, 10000, size=num_records)  # sq. ft, upper bound exclusive
    num_floors = np.random.randint(1, 20, size=num_records)  # upper bound exclusive
    material_cost_per_sqft = np.round(np.random.uniform(20, 150, size=num_records), 2)
    labor_cost_per_sqft = np.round(np.random.uniform(10, 100, size=num_records), 2)
    project_duration = np.random.randint(6, 36, size=num_records)  # months, upper bound exclusive
    location_types = np.random.choice(list(location_multipliers), size=num_records)
    transportation_cost = np.round(np.random.uniform(5000, 50000, size=num_records), 2)
    inflation_rate = np.round(np.random.uniform(1.5, 6.0, size=num_records), 2)  # percent
    complexity = np.random.choice(list(complexity_multipliers), size=num_records)

    # Derived target variable: Total Construction Cost
    base_cost = total_area * (material_cost_per_sqft + labor_cost_per_sqft)
    complexity_factor = pd.Series(complexity).map(complexity_multipliers).to_numpy(dtype=float)
    location_factor = pd.Series(location_types).map(location_multipliers).to_numpy(dtype=float)
    inflation_adjustment = 1 + (inflation_rate / 100)  # percent -> multiplier
    total_construction_cost = np.round(
        base_cost * complexity_factor * location_factor * inflation_adjustment
        + transportation_cost,
        2,
    )

    # Create DataFrame
    return pd.DataFrame(
        {
            "Project_ID": project_ids,
            "Project_Type": project_types,
            "Total_Area_SqFt": total_area,
            "Number_of_Floors": num_floors,
            "Material_Cost_per_SqFt": material_cost_per_sqft,
            "Labor_Cost_per_SqFt": labor_cost_per_sqft,
            "Project_Duration_Months": project_duration,
            "Location_Type": location_types,
            "Transportation_Cost": transportation_cost,
            "Inflation_Rate": inflation_rate,
            "Complexity": complexity,
            "Total_Construction_Cost": total_construction_cost,
        }
    )

# Generate dataset
num_records = 5000
construction_dataset = generate_construction_cost_data(num_records)

# Save to CSV. The path is named once so that the file actually written and
# the path reported in the message below cannot drift apart.
output_csv_path = "construction_cost_prediction_dataset.csv"
construction_dataset.to_csv(output_csv_path, index=False)  # index=False: leave the row index out of the file
# Report the number of rows actually written rather than the number requested.
print(f"Dataset with {len(construction_dataset)} rows saved as '{output_csv_path}'")


Dataset with 5000 rows saved as 'construction_cost_prediction_dataset.csv'
