# Dataset for Model Retraining section

In [None]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import truncnorm

In [None]:
# Specification of the synthetic dataset
n_features = 11
n_samples = 918
# One seed drives both make_classification and the truncated-normal draws,
# so a fresh "Restart & Run All" regenerates exactly the same dataset.
random_seed = 42

feature_names = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
                 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']

# Categorical columns and the number of categories each one is binned into
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'FastingBS', 'ExerciseAngina', 'ST_Slope']
cat_len = [2, 4, 3, 2, 2, 3]
assert len(categorical_cols) == len(cat_len), "one bin count is required per categorical column"

# Integer-valued columns with their target mean, standard deviation and bounds.
# The names deliberately avoid `min`/`max` (Python builtins) and `mean`/`std`
# (rebound by the online-sales section further down the notebook).
int_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR']
col_mean = [53.5, 132.4, 198.8, 136.8]
col_std = [9.4, 18.5, 109.38, 25.46]
col_max = [77, 200, 603, 202]
col_min = [28, 0, 0, 60]

# Generate a synthetic dataset with the same number of features and samples
X, y = make_classification(n_samples=n_samples, n_features=n_features,
                           n_informative=5, n_redundant=2, n_repeated=0,
                           n_classes=2, n_clusters_per_class=2, weights=None,
                           flip_y=0.01, class_sep=1.0, hypercube=True,
                           shift=0.0, scale=1.0, shuffle=True, random_state=random_seed)

synthetic_data = pd.DataFrame(X, columns=feature_names)

# Convert the continuous features to ordinal category codes (0 .. n_bins - 1)
for col, n_bins in zip(categorical_cols, cat_len):
  discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
  synthetic_data[col] = discretizer.fit_transform(synthetic_data[[col]]).astype(int)

# Replace the integer-valued features with draws from a truncated normal distribution
for ind, col in enumerate(int_cols):
  # truncnorm expects its bounds in standard-deviation units relative to loc
  lower = (col_min[ind] - col_mean[ind]) / col_std[ind]
  upper = (col_max[ind] - col_mean[ind]) / col_std[ind]
  distribution = truncnorm(a=lower, b=upper, loc=col_mean[ind], scale=col_std[ind])
  # A distinct seed per column keeps the columns from sharing the same random stream
  draws = distribution.rvs(size=n_samples, random_state=random_seed + ind)
  synthetic_data[col] = draws.round().astype('int64')


# Attach the target variable to the synthetic features
synthetic_data['HeartDisease'] = y

synthetic_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,51,0,2,153,293,1,0,128,0,0.303221,1,1
1,49,1,2,170,201,1,1,129,1,1.317817,1,0
2,66,1,2,136,160,1,1,149,1,-1.422645,0,1
3,51,1,2,111,264,0,2,146,1,1.433547,2,1
4,50,1,2,124,160,1,0,102,1,-0.509987,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,42,1,1,136,359,1,1,170,0,-0.480682,2,0
914,43,1,2,162,86,1,1,137,0,-1.372706,0,1
915,61,1,2,129,220,1,1,130,1,0.614167,1,0
916,54,0,2,115,106,1,1,115,0,-0.559046,1,0


In [None]:
# Persist the synthetic heart dataset as CSV; the row index is not written
synthetic_dataset_path = '/content/heart.csv'
synthetic_data.to_csv(path_or_buf=synthetic_dataset_path, index=False)

# Descriptive statistics of the generated columns, shown as the cell output
synthetic_summary = synthetic_data.describe()

synthetic_summary

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.255991,0.502179,1.687364,132.300654,211.351852,0.715686,0.955338,136.72549,0.495643,0.17641,1.167756,0.501089
std,9.017905,0.500268,0.707703,17.933386,103.141417,0.451333,0.523357,24.995285,0.500254,1.924241,0.574034,0.500271
min,28.0,0.0,0.0,73.0,0.0,0.0,0.0,66.0,0.0,-5.342139,0.0,0.0
25%,47.0,0.0,1.0,120.0,137.0,0.0,1.0,121.0,0.0,-1.335052,1.0,0.0
50%,53.0,1.0,2.0,132.0,207.0,1.0,1.0,137.0,0.0,0.093607,1.0,1.0
75%,59.0,1.0,2.0,144.0,278.0,1.0,1.0,154.0,1.0,1.716767,2.0,1.0
max,76.0,1.0,3.0,190.0,599.0,1.0,2.0,202.0,1.0,4.91817,2.0,1.0


# Dataset for Model Reuse section (online sales dataset)

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

In [None]:
# Calendar window of the purchase records (date_range includes both endpoints)
start_date = '2021-01-01'
end_date = '2022-01-01'

# One row per calendar day, labelled 1..N instead of the default 0..N-1
purchase_days = pd.date_range(start=start_date, end=end_date)
date_df = pd.DataFrame({'Purchase_DATE': purchase_days})
date_df.index = date_df.index + 1

# Lookup table: 1-based day number -> timestamp of that day
date_dict = date_df['Purchase_DATE'].to_dict()

In [None]:
# define categorical columns and numeric columns
categorical_columns = ['Purchase_DATE', 'Gender', 'Browser', 'Newsletter', 'Voucher', 'Pay_Method']
numeric_columns = ['Customer_id', 'Age',  'Revenue_Total', 'N_Purchases', 'Purchase_VALUE', 'Time_Spent']

# define mean, sd of the numeric columns (same order as numeric_columns)
mean_values = [537205.5,39.6, 27.73, 3.99, 15.97,  598.92]
sd_values = [18993.81, 13.82, 14.94, 2.004, 13.23,  277.83]
cols = 11
num_samples = 65796

# Seeded generator: without it every run of the notebook produces a different dataset
random_seed = 42
rng = np.random.default_rng(random_seed)

# Initialize the synthetic data dictionary (column name -> array of num_samples values)
synthetic_data = {}

# Generate synthetic data for categorical columns using the frequencies of the categories
synthetic_data['Gender'] = rng.choice([1,0], size=num_samples, p=[0.67,0.33])
synthetic_data['Pay_Method'] = rng.choice([0,1,2,3], size=num_samples, p=[0.29,0.3,0.22,0.19])
synthetic_data['Newsletter'] = rng.choice([0,1], size=num_samples, p=[0.85,0.15])
synthetic_data['Voucher'] = rng.choice([0,1], size=num_samples, p=[0.75,0.25])
synthetic_data['Browser'] = rng.choice([0,1,2,3], size=num_samples, p=[0.64,0.2,0.06,0.1])

# Generate synthetic data for numeric columns as independent normal draws.
# The loop names are deliberately not `mean`/`std`: the heart-dataset section
# uses those names for lists, and rebinding them here would break a re-run of it.
# NOTE(review): an unbounded normal can produce negative revenue/time values — confirm this is acceptable.
for column, column_mean, column_sd in zip(numeric_columns, mean_values, sd_values):
    synthetic_data[column] = rng.normal(column_mean, column_sd, size=num_samples)

# generate dummy column for date: a day number in 1..365 (upper bound is exclusive)
synthetic_data['Purchase_DATE'] = rng.integers(1, 366, size=num_samples)

In [None]:
# Create a DataFrame from the synthetic data
synthetic_df = pd.DataFrame(synthetic_data)

# Finalize date column: translate each day number into its calendar date.
# An explicit assignment is used instead of `.replace(..., inplace=True)` on the
# selected column, because that chained in-place call does not update the frame
# under pandas Copy-on-Write and emits a FutureWarning on pandas 2.2.
synthetic_df['Purchase_DATE'] = synthetic_df['Purchase_DATE'].map(date_dict)

# revise columns to integers where required and types
integer_columns = ['Customer_id', 'Age', 'N_Purchases']
column_dtypes = {column: int for column in integer_columns}
column_dtypes.update({column: 'object' for column in categorical_columns})
synthetic_df = synthetic_df.astype(column_dtypes)

synthetic_df

Unnamed: 0,Gender,Pay_Method,Newsletter,Voucher,Browser,Customer_id,Age,Revenue_Total,N_Purchases,Purchase_VALUE,Time_Spent,Purchase_DATE
0,1,0,0,0,3,567212,54,45.611871,2,19.760390,238.798523,2021-07-07 00:00:00
1,1,3,1,1,0,521848,26,33.838817,1,9.971389,204.151566,2021-01-21 00:00:00
2,1,0,0,0,1,504476,18,23.736064,2,10.509903,745.004183,2021-06-03 00:00:00
3,0,0,0,1,0,539699,26,35.758422,5,3.030867,1015.852997,2021-07-11 00:00:00
4,1,1,1,0,0,557505,32,42.707809,6,-7.294613,483.837991,2021-02-09 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
65791,0,1,0,0,1,537683,18,-5.477865,3,19.305458,250.661862,2021-12-13 00:00:00
65792,1,1,1,0,1,541003,58,15.875853,3,6.336101,290.619738,2021-07-12 00:00:00
65793,1,1,0,0,0,525699,24,50.772095,6,-11.397467,709.142451,2021-07-31 00:00:00
65794,1,1,0,1,3,481392,39,40.150343,7,9.192692,922.197270,2021-03-02 00:00:00


In [None]:
# Write the online-sales synthetic dataset to CSV; the row index is not written
synthetic_file_path = '/content/OnlineCustomerSalesData.csv'
synthetic_df.to_csv(path_or_buf=synthetic_file_path, index=False)