In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

# --- 1. SETUP PARAMETERS BASED ON PROVIDED DATA ---

N_VERSIONS = 50
START_DATE = pd.to_datetime('2024-01-01')

# Base ranges observed in the 5 rows:
PATCH_SIZE_MB_RANGE = (8.0, 23.0)
LINES_CHANGED_RANGE = (1000, 5500)
AVG_DEVICE_AGE_RANGE = (90, 750) # Days

# Correlation weights (must sum to 1) for the target variable:
W_CHURN = 0.50  # Code churn is the main risk driver
W_HOTFIX = 0.25 # Hotfixes/Security fixes are inherently riskier
W_LAG = 0.25    # Previous version's error rate matters
BASE_ERROR_RATE = 2000

# --- 2. GENERATE TIME AND VERSION STRUCTURE ---

versions = []
release_dates = []
current_version = [10, 0, 0]
current_date = START_DATE

for i in range(N_VERSIONS):
    # Determine the time gap (average 25 days, with some randomness)
    days_gap = int(np.random.normal(loc=25, scale=10))
    if days_gap < 2: # Prevent zero/negative gaps
        days_gap = 2

    current_date += timedelta(days=days_gap)
    release_dates.append(current_date)

    # Increment version number (e.g., 10.0.0 -> 10.0.1 -> 10.1.0)
    if np.random.rand() < 0.2: # 20% chance of a major/minor bump
        current_version[1] += 1
        current_version[2] = 0
    else:
        current_version[2] += 1
    
    versions.append(f"{current_version[0]}.{current_version[1]}.{current_version[2]}")

df = pd.DataFrame({'firmware_version': versions, 'release_date': release_dates})
df['release_date'] = pd.to_datetime(df['release_date'])

# Calculate lagged features
df['previous_version'] = df['firmware_version'].shift(1).fillna('9.9.9')
df['previous_release_date'] = df['release_date'].shift(1)
df['days_since_previous_release'] = (df['release_date'] - df['previous_release_date']).dt.days.fillna(90) # Assume 90 days for first release

# --- 3. SYNTHESIZE RISK FEATURES ---

# Patch Type synthesis (similar distribution to your data)
patch_types = np.random.choice(
    ['bugfix', 'feature', 'security', 'hotfix'], 
    size=N_VERSIONS, 
    p=[0.55, 0.20, 0.15, 0.10]
)

# Churn/Size synthesis (enforcing correlation: larger patch = more churn)
df['patch_type'] = patch_types
df['patch_size_mb'] = np.random.uniform(PATCH_SIZE_MB_RANGE[0], PATCH_SIZE_MB_RANGE[1], size=N_VERSIONS)
df['is_hotfix'] = np.where(df['patch_type'] == 'hotfix', True, False)
df['patch_security'] = np.where(df['patch_type'] == 'security', True, False)

# Adjust size and churn based on type to enforce correlation
df.loc[df['patch_type'] == 'feature', 'patch_size_mb'] *= 1.5 
df['files_changed'] = np.round(df['patch_size_mb'] * np.random.normal(loc=15, scale=5)).astype(int)
df['lines_changed'] = np.round(df['files_changed'] * np.random.normal(loc=20, scale=10)).astype(int)
df['lines_changed'] = df['lines_changed'].clip(lower=100) # Minimum change of 100 lines

# Calculate composite risk feature (code churn)
df['code_churn_score'] = (
    df['lines_changed'] / df['days_since_previous_release'].clip(lower=1) + df['files_changed']
) / 2000 # Scaling factor to keep it manageable

# Synthesize device age and previous error rate
df['avg_device_age_days'] = np.random.uniform(AVG_DEVICE_AGE_RANGE[0], AVG_DEVICE_AGE_RANGE[1], size=N_VERSIONS)
df['previous_version_error_rate'] = np.random.normal(loc=BASE_ERROR_RATE, scale=1000, size=N_VERSIONS)
df['previous_version_error_rate'] = df['previous_version_error_rate'].clip(lower=0).shift(1).fillna(0)

# --- 4. SYNTHESIZE TARGET (ERROR RATE) BASED ON CORRELATION LOGIC ---

# Base error rate plus a noise component
error_base = np.random.normal(loc=BASE_ERROR_RATE, scale=500, size=N_VERSIONS)

# Calculate risk components based on defined weights
risk_churn = df['code_churn_score'] * W_CHURN * 1000
risk_hotfix = df['is_hotfix'] * W_HOTFIX * 1500
risk_lag = df['previous_version_error_rate'] * W_LAG * 0.5 

# Final Error Rate: Base + Risk Factors + Noise
df['error_rate_per_10k'] = (
    error_base + risk_churn + risk_hotfix + risk_lag + np.random.normal(0, 500, size=N_VERSIONS)
).clip(lower=100) # Ensure no negative error rates

# --- 4B. GENERATE SUPPORT + ERROR COUNT FEATURES (synthetic but correlated) ---

# 1. Pre-error counts (baseline before patch release)
df['pre_errors'] = np.random.poisson(
    lam=(df['previous_version_error_rate'] / 100),
    size=N_VERSIONS
).clip(min=0)

# 2. Post-error counts (scaled by the modeled continuous error rate)
df['post_errors'] = (
    df['error_rate_per_10k'] / 100
    + np.random.normal(0, 10, size=N_VERSIONS)
).clip(min=0).astype(int)

# 3. Error lift
df['delta_errors'] = df['post_errors'] - df['pre_errors']
df['post_pre_ratio'] = (
    (df['post_errors'] + 1) / (df['pre_errors'] + 1)
)

# 4. Support ticket volume (correlated with error rate)
df['tickets'] = (
    df['post_errors'] * np.random.uniform(3, 6, size=N_VERSIONS)
).astype(int).clip(min=1)

# 5. RMAs (subset of tickets)
df['rmas'] = (
    df['tickets'] * np.random.uniform(0.01, 0.10, size=N_VERSIONS)
).astype(int)

df['rma_rate'] = df['rmas'] / df['tickets']

# 6. Error event count target (Poisson-friendly)
df['error_events'] = np.random.poisson(
    lam=(df['error_rate_per_10k'] / 50),
    size=N_VERSIONS
).clip(min=0)

# Create the final clean ML table with the 50 rows
synthetic_ml_df = df[[
    'firmware_version',
    'release_date',
    'previous_version',
    'days_since_previous_release',
    'patch_size_mb',
    'files_changed',
    'lines_changed',
    'is_hotfix',
    'patch_security',
    'code_churn_score',
    'avg_device_age_days',
    'previous_version_error_rate',
    'error_rate_per_10k',
    'pre_errors',
    'post_errors',
    'delta_errors',
    'post_pre_ratio',
    'tickets',
    'rmas',
    'rma_rate',
    'error_events'
]].copy()

# --- 5. SAVE THE DATA ---

# Note: We are saving the single ML-ready feature set for simplicity.
output_file_name = '../data/synthetic_firmware_features_50rows.csv'

synthetic_ml_df.to_csv(output_file_name, index=False)

print(f"Successfully generated {len(synthetic_ml_df)} rows of synthetic data.")
print(f"File saved as: {output_file_name}")
print("\nFirst 5 rows of the new synthetic data:")
print(synthetic_ml_df.head())

Successfully generated 50 rows of synthetic data.
File saved as: ../data/synthetic_firmware_features_50rows.csv

First 5 rows of the new synthetic data:
  firmware_version release_date previous_version  days_since_previous_release  \
0           10.0.1   2024-01-07            9.9.9                         90.0   
1           10.0.2   2024-01-24           10.0.1                         17.0   
2           10.0.3   2024-02-19           10.0.2                         26.0   
3           10.0.4   2024-03-29           10.0.3                         39.0   
4           10.1.0   2024-04-11           10.0.4                         13.0   

   patch_size_mb  files_changed  lines_changed  is_hotfix  patch_security  \
0      19.437984            466           8436      False           False   
1      11.815477            283           5123      False           False   
2      10.562794            253           4580      False           False   
3       9.799312            235           4254      