In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# Set random seed for reproducibility
np.random.seed(42)
num_samples = 1000

# Generate random height data (normally distributed)
heights = np.random.normal(loc=170, scale=10, size=num_samples)

# Generate an initial weight variable that is correlated with height
weights = 50 + (heights - np.mean(heights)) * (15 / 10)

# Introduce controlled noise to adjust correlation
noise = np.random.normal(loc=0, scale=5, size=num_samples)

# Standardize both height and weight
scaler = StandardScaler()
heights_std = scaler.fit_transform(heights.reshape(-1, 1)).flatten()
weights_std = scaler.fit_transform(weights.reshape(-1, 1)).flatten()

# Adjust correlation using weighted sum
target_corr = 0.931142
weights_adjusted = target_corr * heights_std + (1 - target_corr) * noise

# Rescale weights back to original scale
weights_final = (weights_adjusted - np.mean(weights_adjusted)) / np.std(weights_adjusted) * 15 + 70

# Verify correlation
final_corr, _ = pearsonr(heights, weights_final)
print(f"Final correlation: {final_corr}")

# Create DataFrame and save as CSV
df_final = pd.DataFrame({"Weight (kg)": weights_final, "Height (cm)": heights})
df_final.to_csv("height_weight_target_correlation.csv", index=False)

print("CSV file saved as height_weight_target_correlation.csv")


Final correlation: 0.9366700135969876
CSV file saved as height_weight_target_correlation.csv
