In [11]:
import os

import numpy as np
import pandas as pd

# Parameters
num_users = 10000

# Fixed seed so that "Restart Kernel -> Run All" regenerates exactly the same
# synthetic dataset. The legacy global seed is used (rather than a local
# Generator) because the random draws visible in this notebook all go through
# the module-level np.random.* functions, so one call covers them all.
SEED = 42
np.random.seed(SEED)

# List of states (as an example)
states = ['California', 'Texas', 'New York', 'Florida', 'Illinois']

# Generate synthetic data: one row per user. Every column is drawn
# independently of the others:
#   - count-like features come from Poisson distributions with the given mean,
#   - avg_session_duration and feature_usage_rate are uniform on [15, 60) and [0, 1),
#   - days_since_last_visit is a uniform integer in 0..29 (upper bound exclusive),
#   - categorical features are sampled uniformly from their option lists.
data = {
    'user_id': range(1, num_users + 1),
    'session_count': np.random.poisson(15, num_users),
    'avg_session_duration': np.random.uniform(15, 60, num_users),
    'pageviews_per_session': np.random.poisson(4, num_users),
    'days_since_last_visit': np.random.randint(0, 30, num_users),
    'feature_usage_rate': np.random.uniform(0, 1, num_users),
    'clicks_on_premium_content': np.random.poisson(3, num_users),
    'referral_source': np.random.choice(['organic', 'ad', 'referral'], num_users),
    'state': np.random.choice(states, num_users),
    'device_type': np.random.choice(['mobile', 'desktop', 'tablet'], num_users),
    'number_of_visits_last_week': np.random.poisson(3, num_users),
    'number_of_visits_last_30_days': np.random.poisson(10, num_users),
    'messages_sent_count': np.random.poisson(5, num_users),
    'messages_received_count': np.random.poisson(4, num_users),
    'posts_count': np.random.poisson(2, num_users),
    'reactions_count': np.random.poisson(10, num_users),
    'community_sessions_count': np.random.poisson(8, num_users),
    'number_of_trainings_watched': np.random.poisson(8, num_users),
    'number_of_trainings_finished': np.random.poisson(5, num_users),
    'daily_steps': np.random.poisson(5000, num_users),
    'daily_workout_minutes': np.random.poisson(30, num_users)
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Adjust features to meet specified conditions.
# Condition 1: community sessions are capped at session_count - 1, so they stay
# strictly below the total whenever a user has at least one session. The cap is
# floored at 0 because session_count can legitimately be 0, and the unguarded
# cap would then write a count of -1.
max_community_sessions = (df['session_count'] - 1).clip(lower=0)
df['community_sessions_count'] = np.minimum(df['community_sessions_count'], max_community_sessions)
# Condition 2: the 30-day visit count is raised to at least the last-week count.
df['number_of_visits_last_30_days'] = np.maximum(df['number_of_visits_last_30_days'], df['number_of_visits_last_week'])
# Condition 3: a user cannot finish more trainings than they watched.
df['number_of_trainings_finished'] = np.minimum(df['number_of_trainings_finished'], df['number_of_trainings_watched'])

# Derive the binary target `subscription_status` from a hidden engagement score.
#
# Each entry is (column, weight, rescale). When `rescale` is True the column is
# divided by its own maximum before weighting; feature_usage_rate is used
# as-is. The terms are accumulated in the listed order.
# NOTE(review): no categorical column (state, referral_source, device_type)
# enters the score, and the weights add up to 1.2, so the result is a relative
# score rather than a calibrated probability.
score_terms = [
    ('session_count', 0.15, True),
    ('pageviews_per_session', 0.1, True),
    ('feature_usage_rate', 0.2, False),
    ('clicks_on_premium_content', 0.1, True),
    ('number_of_visits_last_week', 0.1, True),
    ('number_of_visits_last_30_days', 0.1, True),
    ('messages_sent_count', 0.05, True),
    ('messages_received_count', 0.05, True),
    ('posts_count', 0.05, True),
    ('reactions_count', 0.05, True),
    ('community_sessions_count', 0.05, True),
    ('number_of_trainings_watched', 0.05, True),
    ('number_of_trainings_finished', 0.05, True),
    ('daily_steps', 0.05, True),
    ('daily_workout_minutes', 0.05, True),
]

conversion_score = None
for column, weight, rescale in score_terms:
    values = df[column]
    if rescale:
        values = values / values.max()
    term = weight * values
    conversion_score = term if conversion_score is None else conversion_score + term

# Gaussian jitter (std 0.05) stands in for real-world randomness; it is drawn
# before the thresholds so the random stream is consumed in a fixed order.
score_noise = np.random.normal(0, 0.05, num_users)

# Each user gets an individual conversion threshold drawn uniformly from
# [0.4, 0.8); the user counts as subscribed (1) when the noisy score exceeds it.
conversion_thresholds = np.random.uniform(0.4, 0.8, num_users)
is_subscribed = (conversion_score + score_noise) > conversion_thresholds

# The score itself is kept out of the frame; only the 0/1 label is attached.
df = df.assign(subscription_status=is_subscribed.astype(int))

print(df.head())


   user_id  session_count  avg_session_duration  pageviews_per_session  \
0        1             19             54.026249                      5   
1        2             15             54.766158                      1   
2        3             12             19.629210                      3   
3        4             20             57.280056                      3   
4        5             18             49.216602                      4   

   days_since_last_visit  feature_usage_rate  clicks_on_premium_content  \
0                     17            0.800647                          3   
1                      7            0.194363                          3   
2                     18            0.253486                          2   
3                     19            0.482543                          1   
4                      0            0.693894                          3   

  referral_source     state device_type  ...  messages_received_count  \
0        referral   Florida    

In [12]:
# Write the generated dataset to CSV (without the index column).
# The destination defaults to the original absolute path, which only exists on
# the author's machine; set the USER_BEHAVIORS_CSV environment variable to
# redirect the export elsewhere without editing this cell.
output_csv_path = os.environ.get(
    'USER_BEHAVIORS_CSV',
    'C:/Users/VanDoan/OneDrive - Rightpoint/Analysis/strong-app-user-behaviors.csv',
)
df.to_csv(output_csv_path, index=False)