In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the sales export into a DataFrame; its VIN column is what the
# customer IDs are derived from.
sales_df = pd.read_csv(filepath_or_buffer='tesla_sales_data_with_vin.csv')

In [None]:
# Derive the customer ID list from the distinct VINs in the sales data.
# Series.unique() keeps NaN as a value of its own, so rows with a missing VIN
# are dropped first; otherwise a NaN would be carried forward as a customer ID.
# The remaining IDs keep their order of first appearance.
customer_ids = sales_df['VIN'].dropna().unique()

In [None]:
# Build one synthetic demographic record per customer ID.
# The global NumPy RNG is seeded so repeated runs draw the same values.
np.random.seed(42)

# Category labels and their sampling weights (each weight list sums to 1).
OCCUPATIONS = ['Professional', 'Manager', 'Entrepreneur', 'Engineer', 'Other']
OCCUPATION_WEIGHTS = [0.3, 0.25, 0.2, 0.15, 0.1]
FAMILY_SIZES = [1, 2, 3, 4, 5]
FAMILY_SIZE_WEIGHTS = [0.2, 0.3, 0.25, 0.15, 0.1]


def _draw_customer(customer_id):
    """Sample one customer's demographic attributes and return them as a dict.

    The draws consume the global NumPy random stream in a fixed order
    (age, income, occupation, family size, previous-EV flag, environmental
    concern); reordering them would change the generated data.
    """
    # Gamma draw shifted by 25: never below 25, with a right-hand tail.
    age_years = np.random.gamma(shape=6, scale=6) + 25
    # Gamma draw shifted by 50000: long right tail above the floor.
    annual_income = np.random.gamma(shape=3, scale=30000) + 50000

    job = np.random.choice(OCCUPATIONS, p=OCCUPATION_WEIGHTS)
    household = np.random.choice(FAMILY_SIZES, p=FAMILY_SIZE_WEIGHTS)

    # Chance of prior EV ownership starts at 0.3 for age 25 and rises by
    # 0.005 per additional year; clipped so it stays a valid probability.
    p_prior_ev = np.clip(0.3 + (age_years - 25) * 0.005, 0, 1)
    owned_ev_before = np.random.choice(
        [True, False],
        p=[p_prior_ev, 1 - p_prior_ev],
    )

    # Beta(5, 2) scaled onto 0-10, which leans towards the upper end.
    concern_score = np.random.beta(5, 2) * 10

    return {
        'CustomerID': customer_id,
        'Age': int(age_years),
        'Income': int(annual_income),
        'Occupation': job,
        'FamilySize': household,
        'PreviousEVOwner': owned_ev_before,
        'EnvironmentalConcern': round(concern_score, 2)
    }


customer_data = [_draw_customer(cust_id) for cust_id in customer_ids]

In [None]:
# Turn the list of per-customer records into a table, one row per record.
customer_df = pd.DataFrame(data=customer_data)

# Persist the table as CSV without the positional index column, then confirm.
customer_df.to_csv(path_or_buf='tesla_customer_demographics.csv', index=False)
print("Customer demographics data saved to tesla_customer_demographics.csv")