In [1]:
import pandas as pd
import numpy as np

print("Starting data generation...")

# --- 1. Define Headers and Settings ---

# Column names, in the order they appear in the generated table
# (and therefore in the exported files).
headers = [
    'price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 
    'dist1', 'dist2', 'dist3', 'dist4', 'teachers', 'poor_prop', 
    'airport', 'n_hos_beds', 'n_hot_rooms', 'waterbody', 'rainfall', 
    'bus_ter', 'parks'
]

# The number of observations (rows) to generate
num_observations = 506

# Fixed seed: without it every run of this cell produces a different table,
# so the exported files could never be regenerated. np.random.seed() seeds
# NumPy's global (legacy) generator, which every np.random.* call below uses.
SEED = 42
np.random.seed(SEED)

# --- 2. Generate Synthetic Data ---

# One entry per column. The entries are evaluated top to bottom, so the
# order of this dictionary also fixes the order of the random draws.
data = {
    # Continuous columns: uniform draws over a fixed range, rounded to a
    # fixed number of decimals.
    'price': np.random.uniform(10.0, 50.0, num_observations).round(1),
    'crime_rate': np.random.uniform(0.01, 10.0, num_observations).round(5),
    'resid_area': np.random.uniform(30.0, 60.0, num_observations).round(2),
    'air_qual': np.random.uniform(0.4, 0.9, num_observations).round(4),
    'room_num': np.random.uniform(4.0, 9.0, num_observations).round(4),
    'age': np.random.uniform(20.0, 100.0, num_observations).round(1),
    'dist1': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'dist2': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'dist3': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'dist4': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'teachers': np.random.uniform(18.0, 25.0, num_observations).round(1),
    'poor_prop': np.random.uniform(2.0, 30.0, num_observations).round(2),

    # Two-level categorical column: 'YES' with probability 0.55, 'NO' with 0.45
    'airport': np.random.choice(['YES', 'NO'], num_observations, p=[0.55, 0.45]),

    # Continuous columns again
    'n_hos_beds': np.random.uniform(5.0, 10.0, num_observations).round(4),
    'n_hot_rooms': np.random.uniform(10.0, 20.0, num_observations).round(4),

    # Four-level categorical column, all levels equally likely.
    # Note that 'None' here is the literal string, not a missing value.
    'waterbody': np.random.choice(
        ['None', 'Lake', 'River', 'Lake and River'], 
        num_observations, 
        p=[0.25, 0.25, 0.25, 0.25]
    ),

    # Integer column: randint excludes the upper bound, so values are 2..6
    'rainfall': np.random.randint(2, 7, num_observations),

    # Constant column: every row is 'YES'
    'bus_ter': ['YES'] * num_observations, 

    'parks': np.random.uniform(0.03, 0.09, num_observations).round(6),
}

# --- 3. Create DataFrame ---

# Passing columns=headers pins the column order to the list defined above
df = pd.DataFrame(data, columns=headers)

# --- 4. Export to Files ---

# Destination paths for the two copies of the table
excel_filename, csv_filename = 'House_Price_Data.xlsx', 'House_Price_Data.csv'

# Spreadsheet copy first. index=False leaves the pandas row labels
# (0, 1, 2...) out of the file.
df.to_excel(excel_writer=excel_filename, index=False)
print(f"Successfully created Excel file: {excel_filename}")

# Then the plain-text copy, again without the row labels.
df.to_csv(path_or_buf=csv_filename, index=False)
print(f"Successfully created CSV file: {csv_filename}")

print("Data generation complete.")

Starting data generation...
Successfully created Excel file: House_Price_Data.xlsx
Successfully created CSV file: House_Price_Data.csv
Data generation complete.


In [2]:
import pandas as pd
import numpy as np

print("Starting data generation...")

# --- 1. Define Headers and Settings ---

# Column names, in the order they appear in the generated table
headers = [
    'price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 
    'dist1', 'dist2', 'dist3', 'dist4', 'teachers', 'poor_prop', 
    'airport', 'n_hos_beds', 'n_hot_rooms', 'waterbody', 'rainfall', 
    'bus_ter', 'parks'
]
num_observations = 506

# Fixed seed: without it every run of this cell produces a different table.
# np.random.seed() seeds NumPy's global (legacy) generator, so it also fixes
# every later np.random.* draw made in this cell after the table is built.
SEED = 42
np.random.seed(SEED)

# --- 2. Generate "Clean" Synthetic Data ---

# One entry per column. The entries are evaluated top to bottom, so the
# order of this dictionary also fixes the order of the random draws.
data = {
    # Continuous columns: uniform draws over a fixed range, rounded
    'price': np.random.uniform(10.0, 50.0, num_observations).round(1),
    'crime_rate': np.random.uniform(0.01, 10.0, num_observations).round(5),
    'resid_area': np.random.uniform(30.0, 60.0, num_observations).round(2),
    'air_qual': np.random.uniform(0.4, 0.9, num_observations).round(4),
    'room_num': np.random.uniform(4.0, 9.0, num_observations).round(4),
    'age': np.random.uniform(20.0, 100.0, num_observations).round(1),
    'dist1': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'dist2': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'dist3': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'dist4': np.random.uniform(1.0, 10.0, num_observations).round(2),
    'teachers': np.random.uniform(18.0, 25.0, num_observations).round(1),
    'poor_prop': np.random.uniform(2.0, 30.0, num_observations).round(2),
    # Two-level categorical column: 'YES' with probability 0.55, 'NO' with 0.45
    'airport': np.random.choice(['YES', 'NO'], num_observations, p=[0.55, 0.45]),
    'n_hos_beds': np.random.uniform(5.0, 10.0, num_observations).round(4),
    'n_hot_rooms': np.random.uniform(10.0, 20.0, num_observations).round(4),
    # Four-level categorical column, all levels equally likely.
    # Note that 'None' here is the literal string, not a missing value.
    'waterbody': np.random.choice(
        ['None', 'Lake', 'River', 'Lake and River'], 
        num_observations, 
        p=[0.25, 0.25, 0.25, 0.25]
    ),
    # Integer column: randint excludes the upper bound, so values are 2..6
    'rainfall': np.random.randint(2, 7, num_observations),
    # Constant column: every row is 'YES'
    'bus_ter': ['YES'] * num_observations, 
    'parks': np.random.uniform(0.03, 0.09, num_observations).round(6),
}

# --- 3. Create DataFrame ---

# Passing columns=headers pins the column order to the list defined above
df = pd.DataFrame(data, columns=headers)
print("Clean data generated. Now introducing imperfections...")

# --- 3a. Introduce Missing Data (NaN) ---
# We'll create a helper function to get random indices to "damage"
def get_random_indices(data_frame: pd.DataFrame, fraction_to_corrupt: float, rng=None) -> np.ndarray:
    """Pick a random subset of row labels from ``data_frame``.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Frame whose index labels are sampled.
    fraction_to_corrupt : float
        Share of rows to pick, between 0 and 1 inclusive. The row count is
        ``int(len(data_frame) * fraction_to_corrupt)``, i.e. truncated
        towards zero, so the actual share can be slightly below the request.
    rng : optional
        Object providing a NumPy-style ``choice(a, size=..., replace=...)``
        method, e.g. ``np.random.default_rng(seed)``. When omitted, NumPy's
        global generator is used, so ``np.random.seed`` controls the result.

    Returns
    -------
    np.ndarray
        The selected index labels, with no label repeated.

    Raises
    ------
    ValueError
        If ``fraction_to_corrupt`` is outside [0, 1] (NaN included).
    """
    # Fail early with a clear message; otherwise an out-of-range fraction only
    # surfaces as an unrelated-looking ValueError from inside choice().
    if not 0.0 <= fraction_to_corrupt <= 1.0:
        raise ValueError(
            f"fraction_to_corrupt must be between 0 and 1, got {fraction_to_corrupt!r}"
        )
    num_to_select = int(len(data_frame) * fraction_to_corrupt)
    sampler = np.random if rng is None else rng
    # replace=False ensures no row label is selected twice
    return sampler.choice(data_frame.index, size=num_to_select, replace=False)

# Sample every set of target rows up front. The calls are made in a fixed
# order (beds, rooms, price, hotel rooms, crime) because each one consumes
# random numbers from the same generator.
missing_indices_beds = get_random_indices(df, 0.05)        # ~5% of rows
missing_indices_rooms = get_random_indices(df, 0.03)       # ~3% of rows
outlier_indices_price = get_random_indices(df, 0.02)       # ~2% of rows
outlier_indices_hot_rooms = get_random_indices(df, 0.015)  # ~1.5% of rows
outlier_indices_crime = get_random_indices(df, 0.01)       # ~1% of rows

# Missing values: blank out the sampled cells in the two affected columns
df.loc[missing_indices_beds, 'n_hos_beds'] = np.nan
df.loc[missing_indices_rooms, 'room_num'] = np.nan

# --- 3b. Introduce Outliers ---

# 'price': the sampled rows become 10 times their generated value
df.loc[outlier_indices_price, 'price'] *= 10

# 'n_hot_rooms': the sampled rows are overwritten with a fixed, unrealistic 100
df.loc[outlier_indices_hot_rooms, 'n_hot_rooms'] = 100.0

# 'crime_rate': the sampled rows become 20 times their generated value
df.loc[outlier_indices_crime, 'crime_rate'] *= 20

print("Missing data and outliers have been added.")

# --- 4. Export to Files ---

# Destination paths for the two copies of the corrupted table
excel_filename, csv_filename = 'House_Price_Data_Dirty.xlsx', 'House_Price_Data_Dirty.csv'

# Spreadsheet copy first. index=False leaves the pandas row labels out of the file.
df.to_excel(excel_writer=excel_filename, index=False)
print(f"Successfully created 'dirty' Excel file: {excel_filename}")

# Then the plain-text copy, again without the row labels.
df.to_csv(path_or_buf=csv_filename, index=False)
print(f"Successfully created 'dirty' CSV file: {csv_filename}")

print("Data generation complete.")

Starting data generation...
Clean data generated. Now introducing imperfections...
Missing data and outliers have been added.
Successfully created 'dirty' Excel file: House_Price_Data_Dirty.xlsx
Successfully created 'dirty' CSV file: House_Price_Data_Dirty.csv
Data generation complete.


In [1]:
pwd

'/Users/malvernbright/Desktop/Work/DSML'