In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import t

In [4]:
# Read the original census dataset into a DataFrame.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on a machine that has the file at exactly this location.
original_data = pd.read_csv(
    filepath_or_buffer='/Users/jeffkole/Documents/External/Demos/census.csv'
)


In [5]:
# Seed NumPy's global random generator so that the np.random draws made
# after this point are repeatable from run to run.
np.random.seed(seed=42)


In [6]:
# Row count of the original dataset; the mock columns are generated with
# this many values each.
num_rows = len(original_data)


In [7]:
# Step 1: Synthesize each numerical column by sampling from a normal
# distribution fitted to the corresponding original column.
numerical_columns = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

# Maps column name -> array of num_rows synthetic values.
mock_data = {}
for col in numerical_columns:
    source_column = original_data[col]
    # Only the sample mean and standard deviation of the original column are
    # carried over. The normal draw is unbounded, so values outside the
    # original range (including negatives) can appear.
    mock_data[col] = np.random.normal(
        loc=source_column.mean(),
        scale=source_column.std(),
        size=num_rows,
    )

# Preview the synthetic numerical columns.
pd.DataFrame(mock_data).head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,45.45379,257644.12842,10.022219,7186.471746,502.581967,35.823934
1,36.747911,215745.301026,10.066896,-604.469621,-573.962002,38.901136
2,47.523726,-85300.942749,10.739108,5430.894026,-96.44755,41.377983
3,59.525101,313520.366747,6.36509,-13794.349946,1611.741234,39.842961
4,35.433223,278269.694215,12.668068,-7328.405043,-257.210399,41.360519


In [8]:
# Step 2: Synthesize each categorical column by sampling category labels
# with the relative frequencies observed in the original column.
categorical_columns = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'income',
]
for col in categorical_columns:
    # Relative frequency of every category in the original column; the index
    # holds the labels and the values hold their probabilities.
    category_shares = original_data[col].value_counts(normalize=True)
    # Draw num_rows labels independently using those probabilities.
    mock_data[col] = np.random.choice(
        category_shares.index.tolist(),
        size=num_rows,
        p=category_shares.values,
    )

# Assemble the numerical and categorical columns into a single DataFrame.
mock_data_df = pd.DataFrame(mock_data)

# Preview the combined mock data.
mock_data_df.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
0,45.45379,257644.12842,10.022219,7186.471746,502.581967,35.823934,State-gov,10th,Married-civ-spouse,Transport-moving,Own-child,White,Male,United-States,<=50K
1,36.747911,215745.301026,10.066896,-604.469621,-573.962002,38.901136,State-gov,HS-grad,Married-spouse-absent,Farming-fishing,Not-in-family,White,Male,United-States,<=50K
2,47.523726,-85300.942749,10.739108,5430.894026,-96.44755,41.377983,Private,Some-college,Never-married,Other-service,Unmarried,White,Male,United-States,>50K
3,59.525101,313520.366747,6.36509,-13794.349946,1611.741234,39.842961,State-gov,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
4,35.433223,278269.694215,12.668068,-7328.405043,-257.210399,41.360519,Private,Bachelors,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,<=50K


In [9]:
# Step 3: Set a lower limit of 0 for the "capital_gain" and "capital_loss"
# columns, since the normal draws used to generate them can be negative.
non_negative_columns = ['capital_gain', 'capital_loss']

# Vectorized clip instead of a per-row Python lambda: the columns keep their
# float dtype regardless of the values drawn. Unlike max(0, x), clip leaves a
# NaN as NaN instead of silently turning it into 0.
mock_data_df[non_negative_columns] = mock_data_df[non_negative_columns].clip(lower=0)


In [10]:
# Step 4: Make "education_num" consistent with the sampled "education" label.
# The two columns were generated independently above, so they do not agree
# until this step overwrites "education_num".

# For every education label, take the mean "education_num" seen in the
# original dataset and keep the result as a plain {label: number} dict.
mean_num_by_education = original_data.groupby('education')['education_num'].mean()
education_to_num_mapping = mean_num_by_education.to_dict()

# Look up each mock row's education label and replace its "education_num".
synced_education_num = mock_data_df['education'].map(education_to_num_mapping)
mock_data_df['education_num'] = synced_education_num

# Preview the updated mock dataset.
mock_data_df.head()


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
0,45.45379,257644.12842,6.0,7186.471746,502.581967,35.823934,State-gov,10th,Married-civ-spouse,Transport-moving,Own-child,White,Male,United-States,<=50K
1,36.747911,215745.301026,9.0,0.0,0.0,38.901136,State-gov,HS-grad,Married-spouse-absent,Farming-fishing,Not-in-family,White,Male,United-States,<=50K
2,47.523726,-85300.942749,10.0,5430.894026,0.0,41.377983,Private,Some-college,Never-married,Other-service,Unmarried,White,Male,United-States,>50K
3,59.525101,313520.366747,13.0,0.0,1611.741234,39.842961,State-gov,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
4,35.433223,278269.694215,13.0,0.0,0.0,41.360519,Private,Bachelors,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,<=50K


In [11]:
# Destination file for the mock dataset.
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
mock_data_path = '/Users/jeffkole/Documents/External/Demos/census_mock_data1.csv'

# Write the mock dataset to CSV without the row index, then echo the path as
# the cell output.
mock_data_df.to_csv(path_or_buf=mock_data_path, index=False)
mock_data_path


'/Users/jeffkole/Documents/External/Demos/census_mock_data1.csv'