In [1]:
import pandas as pd
import numpy as np

# Load the events dataset
data = pd.read_csv('C:/Users/nafla/Downloads/article/dataset/filtered_sessions.csv')
data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id
0,2015-09-11 20:49:49.439,0,view,285930,0,0:START:20150911204949:DUR:00327s,0.0,3
1,2015-09-11 20:52:39.591,0,view,357564,0,0:START:20150911204949:DUR:00327s,170.152,3
2,2015-09-11 20:55:17.175,0,view,67045,0,0:START:20150911204949:DUR:00327s,327.736,3
3,2015-08-30 06:39:38.318,6,view,253615,1,6:START:20150830063938:DUR:01015s,0.0,3
4,2015-08-30 06:40:23.805,6,view,344723,1,6:START:20150830063938:DUR:01015s,45.487,3


In [2]:
# finding unique users
unique_users = data['visitorid'].unique()

# Set a random seed for reproducibility
np.random.seed(42)

# Calculate the number of users to sample
sample_size = int(len(unique_users) * 0.20)

# Randomly sample user IDs
sampled_users = np.random.choice(unique_users, size=sample_size, replace=False)

In [3]:
# Filter the original data to keep only the sessions of sampled users
sampled_data = data[data['visitorid'].isin(sampled_users)]

# Optional: Verify the results
sampled_data.head()
print(f"Number of sessions in the sampled data: {sampled_data['global_session_id'].nunique()}")
print(f"Number of users in the sampled data: {sampled_data['visitorid'].nunique()}")


Number of sessions in the sampled data: 64120
Number of users in the sampled data: 55642


In [4]:
sampled_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id
15,2015-07-13 16:29:23.786,36,view,176286,0,36:START:20150713162923:DUR:00205s,0.0,3
16,2015-07-13 16:32:48.853,36,view,176286,0,36:START:20150713162923:DUR:00205s,205.067,3
29,2015-06-22 15:38:37.948,65,view,52383,0,65:START:20150622153837:DUR:00051s,0.0,3
30,2015-06-22 15:39:17.183,65,view,168656,0,65:START:20150622153837:DUR:00051s,39.235,3
31,2015-06-22 15:39:23.298,65,view,52383,0,65:START:20150622153837:DUR:00051s,45.35,3


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Assuming 'sampled_data' is your DataFrame from the previous step
sampled_data['time_since_session_start'] = scaler.fit_transform(sampled_data[['time_since_session_start']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_data['time_since_session_start'] = scaler.fit_transform(sampled_data[['time_since_session_start']])


In [6]:
from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoder
encoder = LabelEncoder()

# Encode the itemid
sampled_data['itemid'] = encoder.fit_transform(sampled_data['itemid'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_data['itemid'] = encoder.fit_transform(sampled_data['itemid'])


In [7]:
sampled_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id
15,2015-07-13 16:29:23.786,36,view,18147,0,36:START:20150713162923:DUR:00205s,0.0,3
16,2015-07-13 16:32:48.853,36,view,18147,0,36:START:20150713162923:DUR:00205s,0.041414,3
29,2015-06-22 15:38:37.948,65,view,5394,0,65:START:20150622153837:DUR:00051s,0.0,3
30,2015-06-22 15:39:17.183,65,view,17379,0,65:START:20150622153837:DUR:00051s,0.007924,3
31,2015-06-22 15:39:23.298,65,view,5394,0,65:START:20150622153837:DUR:00051s,0.009159,3


In [10]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

data['itemid'] = scaler.fit_transform(data[['itemid']])

# Convert 'interaction_type_id' to categorical (one-hot encoded format)
data['interaction_type_id'] = to_categorical(data['interaction_type_id'])

# Assuming 'sampled_data' is already loaded and contains a 'timestamp' column
sampled_data.sort_values('timestamp', inplace=True)

# Calculate indices for splitting
total_count = len(sampled_data)
train_end = int(total_count * 0.70)
validation_end = int(total_count * 0.85)

# Split the data
train_data = sampled_data[:train_end]
validation_data = sampled_data[train_end:validation_end]
test_data = sampled_data[validation_end:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_data.sort_values('timestamp', inplace=True)


In [11]:
# Verify the minimum sequence length in validation and test sets
min_validation_length = validation_data.groupby('global_session_id').size().min()
min_test_length = test_data.groupby('global_session_id').size().min()

print(f"Minimum interactions in validation sequences: {min_validation_length}")
print(f"Minimum interactions in test sequences: {min_test_length}")


Minimum interactions in validation sequences: 1
Minimum interactions in test sequences: 1


In [12]:
train_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id
384073,2015-05-03 03:00:04.384,693516,addtocart,30968,0,693516:START:20150503030004:DUR:00025s,0.0,1
384074,2015-05-03 03:00:26.228,693516,view,30968,0,693516:START:20150503030004:DUR:00025s,0.004411,3
384075,2015-05-03 03:00:29.427,693516,addtocart,30968,0,693516:START:20150503030004:DUR:00025s,0.005057,1
144845,2015-05-03 03:01:07.992,260113,view,13013,0,260113:START:20150503030107:DUR:00362s,0.0,3
144846,2015-05-03 03:01:50.079,260113,view,5355,0,260113:START:20150503030107:DUR:00362s,0.0085,3


In [15]:

# Group data by 'global_session_id' and collect sequences
grouped = train_data.groupby('global_session_id').agg(list)

# Create sequences for training
X = []
y = []
for session_id, group in grouped.iterrows():
    inputs = np.column_stack((group['interaction_type_id'], group['itemid'], group['time_since_session_start']))
    X.append(inputs[:-1])
    y.append(inputs[1:])


In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences
X_padded = pad_sequences(X, padding='post', dtype='float32')
y_padded = pad_sequences(y, padding='post', dtype='float32')

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed

# Define model architecture
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(None, X_padded.shape[2])),
    LSTM(50, return_sequences=True),
    TimeDistributed(Dense(X_padded.shape[2], activation='softmax'))  # Predict the next interaction
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [18]:
# Train the model
history = model.fit(X_padded, y_padded, epochs=10, validation_split=0.2)

Epoch 1/10
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.4787 - loss: 1252.3622 - val_accuracy: 0.4893 - val_loss: 14.7223
Epoch 2/10
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4838 - loss: 14.5519 - val_accuracy: 0.4893 - val_loss: 14.6297
Epoch 3/10
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4869 - loss: 14.5686 - val_accuracy: 0.4893 - val_loss: 14.5952
Epoch 4/10
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4839 - loss: 14.4516 - val_accuracy: 0.4893 - val_loss: 14.5863
Epoch 5/10
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4849 - loss: 14.4541 - val_accuracy: 0.4893 - val_loss: 14.5829
Epoch 6/10
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4847 - loss: 14.4480 - val_accuracy: 0.4893 - val_loss: 14.5754
Ep

In [19]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_padded, y_padded)
print(f"Test Accuracy: {test_accuracy:.2f}")

[1m1401/1401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4850 - loss: 14.4586
Test Accuracy: 0.48
