In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import pickle

In [2]:
# Build a reproducible synthetic dataset for the purchase-prediction demo.
# Every random draw comes from one seeded generator, so a fresh
# "Restart Kernel -> Run All" always produces the same frame (and therefore
# the same train/validation split and fitted model downstream).
SEED = 0
n_samples = 1000
rng = np.random.default_rng(SEED)

# Generate synthetic data for real-time features
# (upper bounds are exclusive: last_page_* in 1..9, time_spent_* in 1..99)
rt_data = pd.DataFrame({
    'last_page_1': rng.integers(1, 10, n_samples),
    'last_page_2': rng.integers(1, 10, n_samples),
    'last_page_3': rng.integers(1, 10, n_samples),
    'time_spent_1': rng.integers(1, 100, n_samples),
    'time_spent_2': rng.integers(1, 100, n_samples),
    'time_spent_3': rng.integers(1, 100, n_samples),
})

# Generate synthetic data for offline features
offline_data = pd.DataFrame({
    'total_purchases': rng.integers(1, 10, n_samples),
    'total_amount_spent': rng.uniform(1, 1000, n_samples),
    'average_order_value': rng.uniform(1, 100, n_samples),
    'days_since_last_purchase': rng.integers(1, 30, n_samples),
    'is_returning_customer': rng.choice([True, False], n_samples)
})

# Generate aggregated data for real-time features
# NOTE(review): these values are drawn independently of time_spent_1..3, so
# they are not actual sums/means of those columns -- confirm this is intended.
agg_data = pd.DataFrame({
    'total_time_spent': rng.integers(1, 300, n_samples),
    'avg_time_spent': rng.integers(1, 100, n_samples),
})

# Generate the synthetic binary label ("buy" = 1 / "not buy" = 0)
did_purchase = pd.DataFrame({
    'did_purchase': rng.integers(0, 2, n_samples)
})


# Merge all synthetic data into one dataframe (column-wise; the label ends up
# as the last column)
synthetic_data = pd.concat([rt_data, offline_data, agg_data, did_purchase], axis=1)

In [3]:
# Display the merged synthetic frame as the cell output (the rendered table
# elides the middle rows with "...").
synthetic_data

Unnamed: 0,last_page_1,last_page_2,last_page_3,time_spent_1,time_spent_2,time_spent_3,total_purchases,total_amount_spent,average_order_value,days_since_last_purchase,is_returning_customer,total_time_spent,avg_time_spent,did_purchase
0,9,3,3,60,65,56,5,341.437247,93.856452,28,False,21,83,1
1,7,9,9,83,24,37,5,337.729254,68.353843,12,False,53,54,1
2,7,8,7,65,78,92,5,477.794817,73.426132,3,True,296,22,0
3,3,1,3,56,80,53,1,961.499659,12.059013,29,False,255,15,1
4,8,7,5,33,80,40,2,738.204945,86.509190,10,False,90,50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,9,3,39,76,77,7,527.889761,67.384377,14,True,191,90,0
996,1,5,7,99,95,46,4,494.900942,98.354686,13,False,246,28,0
997,7,3,9,95,22,64,8,172.619293,87.708099,18,False,248,13,0
998,3,6,4,77,94,36,9,104.074302,24.910027,21,False,127,20,1


In [4]:
# Split data into training (80% of rows) and validation (the remaining rows)
# sets. random_state pins the sampled rows so the split itself is repeatable.
train_data = synthetic_data.sample(frac=0.8, random_state=0)
# Validation = every row whose index label was not sampled for training
# (assumes index labels are unique, as with the default integer index).
valid_data = synthetic_data.drop(train_data.index)

# Separate features and labels by column NAME rather than by position, so the
# split stays correct even if columns are later appended or reordered.
label_column = 'did_purchase'
train_features = train_data.drop(columns=label_column)
valid_features = valid_data.drop(columns=label_column)
train_labels = train_data[label_column]
valid_labels = valid_data[label_column]


In [5]:
# Fit a linear regression on the training split; fit() returns the estimator
# itself, so construction and training can be chained.
# NOTE(review): the label is binary (0/1), so predictions are continuous
# scores rather than class labels -- confirm a regressor is what is wanted.
regressor = LinearRegression().fit(train_features, train_labels)
# Echo the fitted estimator as the cell output.
regressor

LinearRegression()

In [6]:
# Score every validation row with the fitted model and print the raw
# prediction array.
batch_predictions = regressor.predict(X=valid_features)
print(batch_predictions)

[0.44359892 0.51073087 0.61176071 0.54540946 0.4024467  0.46103733
 0.47929706 0.44238273 0.53456509 0.58014312 0.38804002 0.60712683
 0.53379642 0.59949121 0.49088649 0.41248305 0.68895469 0.48448602
 0.48372591 0.49082721 0.52042688 0.40230821 0.56547007 0.39957833
 0.42363396 0.53965022 0.54912175 0.39242866 0.53701582 0.65655773
 0.57655958 0.56634449 0.50580577 0.54908768 0.45706687 0.58127045
 0.52647493 0.46783296 0.55421222 0.48682854 0.46236942 0.46756106
 0.51338884 0.47845747 0.59553917 0.48220738 0.60405163 0.53224943
 0.54880475 0.4312098  0.47460233 0.59015034 0.44971926 0.59837731
 0.54987586 0.53998137 0.37006372 0.49946673 0.5295177  0.43064379
 0.54957848 0.57187485 0.55581977 0.66138173 0.53486673 0.40230582
 0.43258811 0.51337004 0.43984382 0.48043924 0.73144316 0.7011855
 0.46035457 0.48634242 0.65509614 0.57135446 0.43445508 0.65267843
 0.6040502  0.56000139 0.52766692 0.47722248 0.52229451 0.5820813
 0.56934168 0.49183848 0.46904687 0.49461814 0.58274075 0.573310

In [7]:
# Persist the fitted model to disk with pickle.
# NOTE: the file carries an .h5 extension, but its contents are a pickle
# stream, not HDF5 -- it must be read back with pickle.
with open('purchase_prediction_model.h5', 'wb') as model_file:
    model_file.write(pickle.dumps(regressor))

In [8]:
# Read the pickled model back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('purchase_prediction_model.h5', 'rb') as model_file:
    model = pickle.loads(model_file.read())

# Run the reloaded model on the validation features.
y_pred = model.predict(X=valid_features)

In [9]:
# Show the reloaded model's predictions as a Python list (one value per line
# in the rendered output) -- presumably to compare against the
# batch_predictions printed earlier; confirm.
list(y_pred)

[0.4435989186506932,
 0.510730871846328,
 0.6117607143387089,
 0.5454094620910118,
 0.40244669987044623,
 0.46103733050304835,
 0.4792970583039034,
 0.4423827271908239,
 0.5345650931872501,
 0.5801431154651044,
 0.388040015480809,
 0.6071268272834466,
 0.5337964217185722,
 0.5994912107391224,
 0.4908864888972788,
 0.4124830502203542,
 0.6889546877033581,
 0.48448602413546427,
 0.4837259061088925,
 0.49082720637560984,
 0.5204268844580128,
 0.40230820778973164,
 0.5654700672886158,
 0.39957832832108875,
 0.423633957820129,
 0.5396502213850329,
 0.5491217548550171,
 0.3924286583415926,
 0.5370158155405665,
 0.6565577334105781,
 0.576559580276146,
 0.5663444892553684,
 0.5058057716489786,
 0.5490876778640498,
 0.4570668675108327,
 0.5812704530900277,
 0.5264749309951208,
 0.467832964129728,
 0.5542122185754798,
 0.48682854096558204,
 0.462369422268481,
 0.467561060984121,
 0.5133888428810925,
 0.47845746579355064,
 0.5955391745090803,
 0.4822073778653224,
 0.6040516313369444,
 0.532249433