In [1]:
import pandas as pd
import numpy as np

# Generates a synthetic house-price dataset with a known linear relationship
# (price = 10 + 5 * room_num + noise), then deliberately corrupts it with
# missing values and extreme outliers and writes it to CSV.
#
# NOTE: the order of the np.random calls below determines the generated
# values for the fixed seed; do not reorder them if the output must stay
# reproducible.

# Set seed for reproducibility
np.random.seed(42)
num_observations = 506

# Corruption settings, named once so the code and the printed summary
# cannot drift apart.
missing_fraction = 0.05   # share of rows whose room_num is blanked out
num_outliers = 12         # number of rows whose price is inflated
output_path = 'House_Price_Data_Dirty.csv'

# --- 1. Generate the Predictor: Room Number ---
# Modeled after Boston Housing: Mean ~6.3, Std ~0.7
room_num = np.random.normal(6.3, 0.7, num_observations)
# Clip to realistic range
room_num = np.clip(room_num, 3.5, 9.0).round(3)

# --- 2. Generate the Target: Price (The Linear Relationship) ---
# Formula: Base + (Slope * Rooms) + Noise
# With rooms centred on 6.3 the price centres near 10 + 5 * 6.3 = 41.5;
# the clipped room range [3.5, 9.0] gives 27.5 to 55 before noise.
noise = np.random.normal(0, 4.0, num_observations)  # Standard deviation of 4
price = 10.0 + (5.0 * room_num) + noise

# Floor prices at 5.0 (this also rules out negative prices)
price = np.maximum(price, 5.0).round(1)

# --- 3. Generate Other Variables (Context) ---
# These are just to match your file structure
crime_rate = np.random.exponential(3, num_observations).round(5)
resid_area = np.random.uniform(30.0, 60.0, num_observations).round(2)
air_qual = np.random.uniform(0.4, 0.9, num_observations).round(4)
age = np.random.uniform(5, 100, num_observations).round(1)
teachers = np.random.uniform(18.0, 25.0, num_observations).round(1)
poor_prop = np.random.uniform(2.0, 35.0, num_observations).round(2)
n_hos_beds = np.random.uniform(5.0, 10.0, num_observations).round(4)
n_hot_rooms = np.random.uniform(10.0, 15.0, num_observations).round(4)
rainfall = np.random.randint(20, 60, num_observations)
parks = np.random.uniform(0.03, 0.09, num_observations).round(6)

# Categorical
airport = np.random.choice(['YES', 'NO'], num_observations, p=[0.55, 0.45])
waterbody = np.random.choice(['None', 'Lake', 'River', 'Lake and River'], num_observations)
bus_ter = ['YES'] * num_observations  # constant column

# Distances (just random): dist2..dist4 are dist1 plus small jitter.
# Round the SUM, not just the jitter: adding two already-rounded floats can
# leave binary floating-point artifacts (e.g. 5.489999999999999) in the CSV.
dist1 = np.random.uniform(1, 10, num_observations).round(2)
dist2 = (dist1 + np.random.normal(0, 0.2, num_observations)).round(2)
dist3 = (dist1 + np.random.normal(0, 0.2, num_observations)).round(2)
dist4 = (dist1 + np.random.normal(0, 0.2, num_observations)).round(2)

# --- 4. Assemble DataFrame ---
data = {
    'price': price,
    'crime_rate': crime_rate,
    'resid_area': resid_area,
    'air_qual': air_qual,
    'room_num': room_num,
    'age': age,
    'dist1': dist1,
    'dist2': dist2,
    'dist3': dist3,
    'dist4': dist4,
    'teachers': teachers,
    'poor_prop': poor_prop,
    'airport': airport,
    'n_hos_beds': n_hos_beds,
    'n_hot_rooms': n_hot_rooms,
    'waterbody': waterbody,
    'rainfall': rainfall,
    'bus_ter': bus_ter,
    'parks': parks,
}

df = pd.DataFrame(data)

# --- 5. CORRUPT THE DATA (Add Dirt) ---

# A. Insert Missing Values in 'room_num'
# Mask 5% of the values (int() truncates, so 506 rows -> 25 NaNs).
# CRITICAL: This forces you to use Imputation.
# If you just delete these rows, you lose data. If you impute wrong, you skew the line.
nan_indices = np.random.choice(
    df.index, size=int(missing_fraction * num_observations), replace=False
)
df.loc[nan_indices, 'room_num'] = np.nan

# B. Insert Extreme Outliers in 'price'
# Select 12 random houses and inflate their price to price * 6 + 100
# (roughly 350 for a typical ~41.5 house, and never below 130 because
# prices are floored at 5.0).
# CRITICAL: These pull the regression line up, ruining the slope.
# You MUST cap/remove these to see the line.
outlier_indices = np.random.choice(df.index, size=num_outliers, replace=False)
# Re-round to one decimal so the inflated prices keep the column's precision
# instead of carrying floating-point artifacts into the CSV.
df.loc[outlier_indices, 'price'] = (
    df.loc[outlier_indices, 'price'] * 6 + 100
).round(1)

# Save
df.to_csv(output_path, index=False)
print(f"File '{output_path}' generated.")
print("Logic: Price = 10 + 5 * RoomNum + Noise")
print(f"Dirt: {missing_fraction:.0%} missing RoomNum, {num_outliers} Extreme Price Outliers.")

File 'House_Price_Data_Dirty.csv' generated.
Logic: Price = 10 + 5 * RoomNum + Noise
Dirt: 5% missing RoomNum, 12 Extreme Price Outliers.
