In [107]:
from sklearn.datasets import make_multilabel_classification, make_blobs
from sklearn.utils import resample
import pandas as pd
import numpy as np
import random

# 1. sign up prediction

In [3]:
centers_signup_adjusted = [
    [100, 100, 100, 200, 100, 500, 100, 100, 300, 1200, 50, 120000, 0, 0, 0],  # Group with signup_flag = 1
    [90, 80, 110, 201, 70, 490, 90, 90, 290, 1100, 30, 120000, 0, 0, 0]           # Group with signup_flag = 0
]

In [4]:
n_samples = 100000
n_features = len(centers_signup_adjusted[0])

In [5]:
X, y = make_blobs(n_samples=n_samples, centers=centers_signup_adjusted, cluster_std=100, n_features=n_features, random_state=42)

In [6]:
columns = [
    "pages_visited", "clicks", "features_explored", "signup_offer_used", "referral_flag",
    "video_views", "forms_completed", "likes_given", "search_queries", "scroll_depth",
    "age", "income", "location", "time_of_day", "device_type"
]

In [7]:
df1 = pd.DataFrame(X, columns=columns)

In [8]:
df1['signup_flag'] = y

In [9]:
int_columns = [
    "pages_visited", "clicks", "features_explored", "signup_offer_used", "referral_flag",
    "video_views", "forms_completed", "likes_given", "search_queries", "scroll_depth",
    "age", "location", "time_of_day", "device_type"
]

In [10]:
df1[int_columns] = df1[int_columns].round(0).astype(int)

In [11]:
df1.head()

Unnamed: 0,pages_visited,clicks,features_explored,signup_offer_used,referral_flag,video_views,forms_completed,likes_given,search_queries,scroll_depth,age,income,location,time_of_day,device_type,signup_flag
0,190,29,172,279,115,725,173,19,208,1257,14,120047.346759,-163,-82,13,0
1,234,216,-52,146,117,563,121,175,215,1342,-19,119914.671927,57,-122,-18,0
2,238,145,198,-11,17,518,-1,130,307,1171,113,120040.380392,-62,-17,-110,0
3,-25,55,106,316,-21,562,182,103,512,977,-11,119930.859135,-7,-12,179,1
4,-39,162,312,221,-17,582,67,115,274,1229,149,120008.866412,0,-8,146,0


In [12]:
location_values = ['urban', 'suburban', 'rural']
time_of_day_values = ['afternoon', 'evening', 'night', 'morning']
device_type_values = ['mobile', 'tablet', 'desktop']

In [13]:
df1['location'] = np.random.choice(location_values, size=len(df1))
df1['time_of_day'] = np.random.choice(time_of_day_values, size=len(df1))
df1['device_type'] = np.random.choice(device_type_values, size=len(df1))

In [14]:
df1["customer_id"] = range(1, len(df1) + 1)

In [15]:
df1['pages_visited'] = np.clip(df1['pages_visited'], 0, 150)
df1['clicks'] = np.clip(df1['clicks'], 0, 150)
df1['features_explored'] = np.clip(df1['features_explored'], 0, 150)
df1['signup_offer_used'] = np.clip(df1['signup_offer_used'], 0, 250)
df1['referral_flag'] = np.clip(df1['referral_flag'], 0, 150)
df1['video_views'] = np.clip(df1['video_views'], 0, 600)
df1['forms_completed'] = np.clip(df1['forms_completed'], 0, 150)
df1['likes_given'] = np.clip(df1['likes_given'], 0, 150)
df1['search_queries'] = np.clip(df1['search_queries'], 0, 300)
df1['scroll_depth'] = np.clip(df1['scroll_depth'], 0, 1500)
df1['age'] = np.clip(df1['age'], 18, 75)
df1['income'] = np.clip(df1['income'], 30000, 120000)

In [16]:
df1

Unnamed: 0,pages_visited,clicks,features_explored,signup_offer_used,referral_flag,video_views,forms_completed,likes_given,search_queries,scroll_depth,age,income,location,time_of_day,device_type,signup_flag,customer_id
0,150,29,150,250,115,600,150,19,208,1257,18,120000.000000,urban,afternoon,mobile,0,1
1,150,150,0,146,117,563,121,150,215,1342,18,119914.671927,urban,afternoon,desktop,0,2
2,150,145,150,0,17,518,0,130,300,1171,75,120000.000000,urban,night,desktop,0,3
3,0,55,106,250,0,562,150,103,300,977,18,119930.859135,rural,night,tablet,1,4
4,0,150,150,221,0,582,67,115,274,1229,75,120000.000000,urban,morning,tablet,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,57,12,76,169,67,473,0,150,189,1051,75,120000.000000,suburban,afternoon,mobile,1,99996
99996,150,0,104,250,0,587,150,139,300,1108,18,120000.000000,rural,afternoon,mobile,1,99997
99997,89,37,150,194,150,576,123,71,300,1032,18,119732.806885,urban,night,mobile,1,99998
99998,0,20,150,228,150,489,31,22,251,1045,75,119987.209684,rural,evening,desktop,0,99999


In [17]:
def randomly_delete_target_1_multilabel(df, target_column_list, fraction_to_keep=0.2, random_state=42):
    
    df_list = []
    
    # Loop over each target column
    for target_column in target_column_list:
        # Identify rows where the target column has value 1
        df_target_1 = df[df[target_column] == 1]
        # Randomly sample rows to retain for target = 1
        df_target_1_sampled = df_target_1.sample(frac=fraction_to_keep, random_state=random_state)
        # Append sampled rows to the list
        df_list.append(df_target_1_sampled)
    
    # Combine all sampled rows for target columns with 1s
    df_target_1_combined = pd.concat(df_list).drop_duplicates().reset_index(drop=True)
    
    # Combine with rows where all target columns are 0 (no multilabel match for 1)
    df_target_0 = df[(df[target_column_list] == 0).all(axis=1)]
    df_final = pd.concat([df_target_0, df_target_1_combined]).reset_index(drop=True)
    
    return df_final

In [18]:
df1_imbalanced = randomly_delete_target_1_multilabel(df1, ['signup_flag'])

In [19]:
df1_imbalanced.shape

(60000, 17)

In [20]:
df1_imbalanced.head()

Unnamed: 0,pages_visited,clicks,features_explored,signup_offer_used,referral_flag,video_views,forms_completed,likes_given,search_queries,scroll_depth,age,income,location,time_of_day,device_type,signup_flag,customer_id
0,150,29,150,250,115,600,150,19,208,1257,18,120000.0,urban,afternoon,mobile,0,1
1,150,150,0,146,117,563,121,150,215,1342,18,119914.671927,urban,afternoon,desktop,0,2
2,150,145,150,0,17,518,0,130,300,1171,75,120000.0,urban,night,desktop,0,3
3,0,150,150,221,0,582,67,115,274,1229,75,120000.0,urban,morning,tablet,0,5
4,0,35,0,82,83,554,0,150,193,1208,18,119944.962249,rural,afternoon,tablet,0,9


In [21]:
df1_imbalanced.describe()

Unnamed: 0,pages_visited,clicks,features_explored,signup_offer_used,referral_flag,video_views,forms_completed,likes_given,search_queries,scroll_depth,age,income,signup_flag,customer_id
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,87.637667,86.8252,89.541467,180.87185,85.731083,489.678267,87.896517,86.990717,258.77705,1183.60345,46.485733,119959.822894,0.166667,49964.502233
std,58.223519,58.395568,57.958951,72.254772,58.591538,87.049241,58.23005,58.438104,59.48569,107.08066,26.273588,58.710845,0.372681,28856.439402
min,0.0,0.0,0.0,0.0,0.0,84.0,0.0,0.0,0.0,749.0,18.0,119571.691333,0.0,1.0
25%,31.0,29.0,34.0,132.0,27.0,430.0,31.0,29.0,230.0,1112.0,18.0,119932.161283,0.0,24996.5
50%,98.0,97.0,102.0,201.0,95.0,498.0,99.0,97.0,297.0,1185.0,46.5,120000.0,0.0,50007.5
75%,150.0,150.0,150.0,250.0,150.0,565.0,150.0,150.0,300.0,1257.0,75.0,120000.0,0.0,74881.25
max,150.0,150.0,150.0,250.0,150.0,600.0,150.0,150.0,300.0,1500.0,75.0,120000.0,1.0,100000.0


In [22]:
df1_imbalanced.to_csv('app_signup_dataset.csv', index=False)

In [23]:
***

SyntaxError: invalid syntax (2938429678.py, line 1)

# 2. segmentation

In [None]:
centroids = [
    [50000, 15, 5, 10, 2, 1000000, 20, 30, 25, 35, 50, 0, 0, 0, 0, 5, 30, 70, 3],   # Conservative investors
    [1500000, 5, 15, 30, 1, 8000000, 30, 20, 40, 10, 25, 1, 1, 1, 1, 9, 60, 40, 8],  # High-risk investors
    [500000, 10, 10, 20, 1.5, 4000000, 25, 25, 30, 20, 35, 1, 1, 1, 1, 7, 50, 50, 6], # Balanced investors
    [200000, 20, 7, 15, 1.8, 2000000, 15, 35, 20, 40, 45, 0, 0, 0, 0, 6, 40, 60, 4],  # Low-risk investors
    [800000, 8, 12, 25, 1.2, 6000000, 28, 28, 35, 25, 30, 1, 1, 1, 1, 8, 55, 45, 7],  # Growth-focused investors
    [1000000, 12, 8, 18, 1.7, 5000000, 22, 22, 28, 22, 40, 0, 0, 0, 0, 7, 48, 52, 5],  # Moderate-risk investors
]

In [None]:
n_samples = 100000
n_features = len(centroids[0])
n_clusters = len(centroids)

In [None]:
X, y = make_blobs(n_samples=n_samples, centers=centroids, n_features=n_features, random_state=42)

In [None]:
feature_names = [
    "Total_Investment", "Investment_Tenure", "Average_Return", "Portfolio_Volatility",
    "Expense_Ratio", "Fund_Size", "Sector_Allocation_Tech", "Sector_Allocation_Healthcare",
    "Sector_Allocation_Finance", "Sector_Allocation_RealEstate", "Transaction_Frequency",
    "Risk_Appetite_Flag", "Redemption_Frequency_Flag", "Preferred_Equity_Flag",
    "Preferred_Debt_Flag", "Diversification_Index", "Top_3_Fund_Concentration",
    "Remaining_Fund_Allocation", "Average_Holding_Period"
]

In [None]:
df2 = pd.DataFrame(X, columns=feature_names)

In [None]:
df2["Cluster"] = y

In [None]:
int_columns = [
    "Fund_Size", "Sector_Allocation_Tech", 
    "Sector_Allocation_Healthcare", "Sector_Allocation_Finance",
    "Sector_Allocation_RealEstate", "Transaction_Frequency",
    "Top_3_Fund_Concentration", "Remaining_Fund_Allocation", 
    "Average_Holding_Period"
]

In [None]:
df2[int_columns] = df2[int_columns].round(0).astype(int)

In [None]:
df2.head()

In [None]:
df2["customer_id"] = range(1, len(df2) + 1)

In [None]:
df2['Total_Investment'] = np.clip(df2['Total_Investment'], 50000, 1500000)
df2['Investment_Tenure'] = np.clip(df2['Investment_Tenure'], 1, 30)
df2['Average_Return'] = np.clip(df2['Average_Return'], 0, 20)
df2['Portfolio_Volatility'] = np.clip(df2['Portfolio_Volatility'], 0, 50)
df2['Expense_Ratio'] = np.clip(df2['Expense_Ratio'], 0, 5)
df2['Fund_Size'] = np.clip(df2['Fund_Size'], 1e6, 1e8)
df2['Sector_Allocation_Tech'] = np.clip(df2['Sector_Allocation_Tech'], 0, 100)
df2['Sector_Allocation_Healthcare'] = np.clip(df2['Sector_Allocation_Healthcare'], 0, 100)
df2['Sector_Allocation_Finance'] = np.clip(df2['Sector_Allocation_Finance'], 0, 100)
df2['Sector_Allocation_RealEstate'] = np.clip(df2['Sector_Allocation_RealEstate'], 0, 100)
df2['Risk_Appetite_Flag'] = np.clip(df2['Risk_Appetite_Flag'], 0, 3)
df2['Redemption_Frequency_Flag'] = np.clip(df2['Redemption_Frequency_Flag'], -1, 3)
df2['Preferred_Equity_Flag'] = np.clip(df2['Preferred_Equity_Flag'], 0, 3)
df2['Preferred_Debt_Flag'] = np.clip(df2['Preferred_Debt_Flag'], 0, 3)
df2['Diversification_Index'] = np.clip(df2['Diversification_Index'], 0, 10)
df2['Top_3_Fund_Concentration'] = np.clip(df2['Top_3_Fund_Concentration'], 0, 100)
df2['Remaining_Fund_Allocation'] = np.clip(df2['Remaining_Fund_Allocation'], 0, 100)
df2['Average_Holding_Period'] = np.clip(df2['Average_Holding_Period'], 1, 15)

In [None]:
df2.shape

In [None]:
df2.head()

In [None]:
df2.describe()

# 3. propensity to buy

In [123]:
feature_names = [
    "Age", "Income", "Risk_Appetite", "Preferred_Investment_Type", "Credit_Score",
    "Loan_History", "Monthly_Savings", "Annual_Expenses", "Debt_to_Income_Ratio",
    "Website_Visits", "Mutual_Fund_Pages_Visited", "Content_Viewed",
    "Promotional_Responses", "Referral_Usage", "Geographic_Region"
]

In [125]:
target_names = [
    "Equity_Fund", "Debt_Fund", "Hybrid_Fund", "Tax_Saving_Fund",
    "Liquid_Fund", "Index_Fund", "Sectoral_Thematic_Fund"
]

In [127]:
_, y = make_multilabel_classification(
    n_samples=100000,
    n_features=15,
    n_classes=7,
    n_labels=3,
    random_state=42
)
y_df = pd.DataFrame(y, columns=target_names)

In [128]:
X_blob, _ = make_blobs(
    n_samples=100000,
    n_features=15,
    centers=[
        [30, 50000, 1, 1, 600, 2, 3000, 40000, 0.5, 20, 10, 5, 2, 0, 1],
        [45, 100000, 2, 2, 750, 1, 7000, 80000, 0.2, 50, 25, 10, 5, 1, 2],
        [35, 80000, 1, 0, 680, 3, 5000, 60000, 1.0, 30, 15, 7, 3, 0, 3],
        [50, 40000, 0, 0, 620, 5, 2000, 30000, 1.5, 10, 5, 3, 1, 1, 4],
        [28, 90000, 2, 1, 780, 0, 8000, 70000, 0.3, 40, 20, 8, 4, 0, 0],
        [40, 30000, 0, 2, 640, 4, 2500, 35000, 1.8, 15, 8, 4, 2, 1, 1],
        [55, 110000, 2, 2, 800, 1, 9000, 90000, 0.1, 60, 30, 12, 6, 0, 2]
    ],
    cluster_std=10.0,
    random_state=42
)

In [129]:
X_blob = pd.DataFrame(X_blob, columns=feature_names)
X_blob['Age'] = np.clip(X_blob['Age'], 18, 65)  # Age range
X_blob['Income'] = np.clip(X_blob['Income'], 20000, 150000)  # Income range
X_blob['Risk_Appetite'] = np.clip(X_blob['Risk_Appetite'], 0, 2)  # Risk appetite
X_blob['Preferred_Investment_Type'] = np.clip(X_blob['Preferred_Investment_Type'], 0, 2)  # Investment type
X_blob['Credit_Score'] = np.clip(X_blob['Credit_Score'], 300, 850)  # Credit score
X_blob['Loan_History'] = np.clip(X_blob['Loan_History'], 0, 10)  # Loan history
X_blob['Monthly_Savings'] = np.clip(X_blob['Monthly_Savings'], 1000, 10000)  # Savings
X_blob['Annual_Expenses'] = np.clip(X_blob['Annual_Expenses'], 20000, 120000)  # Annual expenses
X_blob['Debt_to_Income_Ratio'] = np.clip(X_blob['Debt_to_Income_Ratio'], 0.1, 2.0)  # Debt-to-income ratio
X_blob['Website_Visits'] = np.clip(X_blob['Website_Visits'], 0, 100)  # Website visits
X_blob['Mutual_Fund_Pages_Visited'] = np.clip(X_blob['Mutual_Fund_Pages_Visited'], 0, 50)  # Pages visited
X_blob['Content_Viewed'] = np.clip(X_blob['Content_Viewed'], 0, 20)  # Content viewed
X_blob['Promotional_Responses'] = np.clip(X_blob['Promotional_Responses'], 0, 10)  # Promo responses
X_blob['Referral_Usage'] = np.clip(X_blob['Referral_Usage'], 0, 1)  # Referral usage

In [130]:
df3 = pd.concat([X_blob, y_df], axis=1)

In [131]:
random_regions = ["North", "South", "East", "West", "Central", "Northeast", "Southwest", "Midwest", "Southeast"]

In [132]:
df3["Geographic_Region"] = [random.choice(random_regions) for _ in range(len(df3))]

In [133]:
df3["customer_id"] = range(1, len(df3) + 1)

In [134]:
df3_imbalanced = randomly_delete_target_1_multilabel(df3, target_names)

In [135]:
df3_imbalanced.shape

(49527, 23)

In [136]:
df3_imbalanced.head()

Unnamed: 0,Age,Income,Risk_Appetite,Preferred_Investment_Type,Credit_Score,Loan_History,Monthly_Savings,Annual_Expenses,Debt_to_Income_Ratio,Website_Visits,...,Referral_Usage,Geographic_Region,Equity_Fund,Debt_Fund,Hybrid_Fund,Tax_Saving_Fund,Liquid_Fund,Index_Fund,Sectoral_Thematic_Fund,customer_id
0,47.290405,39988.65758,2.0,2.0,629.663147,7.555221,2003.159823,30000.371481,0.1,1.438388,...,0.0,Central,0,0,0,0,0,0,0,61
1,18.0,50010.715729,2.0,2.0,601.915267,4.84531,3004.978693,39996.099418,0.1,28.100143,...,1.0,East,0,0,0,0,0,0,0,103
2,60.194677,109995.300524,2.0,2.0,790.477814,0.0,8994.355917,89998.25931,0.1,56.775413,...,0.0,Central,0,0,0,0,0,0,0,122
3,33.832782,39993.812998,2.0,2.0,631.936777,2.646375,1999.81429,30002.475745,0.1,20.384048,...,0.0,Southwest,0,0,0,0,0,0,0,130
4,35.516928,50005.12997,0.847545,2.0,607.453805,9.974575,3004.60675,39982.471152,1.958866,15.392815,...,1.0,Northeast,0,0,0,0,0,0,0,133


In [137]:
df3_imbalanced.describe()

Unnamed: 0,Age,Income,Risk_Appetite,Preferred_Investment_Type,Credit_Score,Loan_History,Monthly_Savings,Annual_Expenses,Debt_to_Income_Ratio,Website_Visits,...,Promotional_Responses,Referral_Usage,Equity_Fund,Debt_Fund,Hybrid_Fund,Tax_Saving_Fund,Liquid_Fund,Index_Fund,Sectoral_Thematic_Fund,customer_id
count,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,...,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0,49527.0
mean,40.545412,71400.512448,1.007337,1.007212,695.477442,3.992315,5205.172342,57820.166523,1.024294,32.343967,...,4.351687,0.495144,0.501524,0.715529,0.661356,0.613181,0.299594,0.297131,0.147071,49892.227149
std,12.924852,29001.112812,0.972914,0.973732,75.32129,4.217927,2616.232516,21716.711915,0.925416,19.734859,...,4.284872,0.493644,0.500003,0.451167,0.473253,0.487027,0.458085,0.456999,0.354181,28849.494014
min,18.0,29961.077611,0.0,0.0,565.351909,0.0,1965.21075,29964.948207,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,30.675436,40006.826594,0.0,0.0,624.948196,0.0,2506.410935,35006.414993,0.1,16.34757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24914.0
50%,40.379049,79999.720069,1.096034,1.095412,679.990647,2.325293,4999.600943,59999.550023,0.69494,30.535906,...,3.216969,0.366566,1.0,1.0,1.0,1.0,0.0,0.0,0.0,49908.0
75%,50.209856,99993.491703,2.0,2.0,773.005526,9.120409,7993.049174,79993.288161,2.0,47.709879,...,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,74877.5
max,65.0,110039.276658,2.0,2.0,839.740006,10.0,9039.592985,90037.604982,2.0,100.0,...,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100000.0


In [138]:
df3_imbalanced.to_csv('propensity_to_buy.csv', index=-False)