In [1]:
import pandas as pd
import numpy as np

# Load the events dataset
data = pd.read_csv('C:/Users/nafla/Downloads/article/dataset/filtered_sessions.csv')
data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id
0,2015-09-11 20:49:49.439,0,view,285930,0,0:START:20150911204949:DUR:00327s,0.0,3
1,2015-09-11 20:52:39.591,0,view,357564,0,0:START:20150911204949:DUR:00327s,170.152,3
2,2015-09-11 20:55:17.175,0,view,67045,0,0:START:20150911204949:DUR:00327s,327.736,3
3,2015-08-30 06:39:38.318,6,view,253615,1,6:START:20150830063938:DUR:01015s,0.0,3
4,2015-08-30 06:40:23.805,6,view,344723,1,6:START:20150830063938:DUR:01015s,45.487,3


Convert raw timestamps into multiple features that capture different temporal aspects:
- Time of day: Extract hours, minutes, and even seconds.
- Day of week: Extract the day of the week from the timestamp.
- Month or Season: Extract the month or the season, which can be particularly useful for capturing seasonal behavior in user interactions.

In [2]:
# Convert 'timestamp' to datetime format if not already done
data['timestamp'] = pd.to_datetime(data['timestamp'])


data['hour'] = data['timestamp'].dt.hour
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['month'] = data['timestamp'].dt.month


In [3]:
# Encodeing time features using sin and cosine 
data['hour_sin'] = np.sin(2 * np.pi * data['hour']/24)
data['hour_cos'] = np.cos(2 * np.pi * data['hour']/24)
data['day_sin'] = np.sin(2 * np.pi * data['day_of_week']/7)
data['day_cos'] = np.cos(2 * np.pi * data['day_of_week']/7)

# Optionally, view some of the data to confirm it looks correct
print(data[['hour', 'hour_sin', 'hour_cos', 'day_of_week', 'day_sin', 'day_cos']].head())


   hour  hour_sin      hour_cos  day_of_week   day_sin   day_cos
0    20 -0.866025  5.000000e-01            4 -0.433884 -0.900969
1    20 -0.866025  5.000000e-01            4 -0.433884 -0.900969
2    20 -0.866025  5.000000e-01            4 -0.433884 -0.900969
3     6  1.000000  6.123234e-17            6 -0.781831  0.623490
4     6  1.000000  6.123234e-17            6 -0.781831  0.623490


In [4]:
# finding unique users
unique_users = data['visitorid'].unique()

# Set a random seed for reproducibility
np.random.seed(42)

# Calculate the number of users to sample
sample_size = int(len(unique_users) * 0.40)

# Randomly sample user IDs
sampled_users = np.random.choice(unique_users, size=sample_size, replace=False)

In [5]:
# Filter the original data to keep only the sessions of sampled users
sampled_data = data[data['visitorid'].isin(sampled_users)]

# Optional: Verify the results
sampled_data.head()
print(f"Number of sessions in the sampled data: {sampled_data['global_session_id'].nunique()}")
print(f"Number of users in the sampled data: {sampled_data['visitorid'].nunique()}")


Number of sessions in the sampled data: 127382
Number of users in the sampled data: 111284


In [6]:
sampled_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id,hour,day_of_week,month,hour_sin,hour_cos,day_sin,day_cos
3,2015-08-30 06:39:38.318,6,view,253615,1,6:START:20150830063938:DUR:01015s,0.0,3,6,6,8,1.0,6.123234000000001e-17,-0.781831,0.62349
4,2015-08-30 06:40:23.805,6,view,344723,1,6:START:20150830063938:DUR:01015s,45.487,3,6,6,8,1.0,6.123234000000001e-17,-0.781831,0.62349
5,2015-08-30 06:54:09.385,6,view,344723,1,6:START:20150830063938:DUR:01015s,871.067,3,6,6,8,1.0,6.123234000000001e-17,-0.781831,0.62349
6,2015-08-30 06:56:33.838,6,view,344723,1,6:START:20150830063938:DUR:01015s,1015.52,3,6,6,8,1.0,6.123234000000001e-17,-0.781831,0.62349
11,2015-07-02 23:16:15.173,22,view,86411,0,22:START:20150702231615:DUR:00023s,0.0,3,23,3,7,-0.258819,0.9659258,0.433884,-0.900969


In [7]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Assuming 'sampled_data' is your DataFrame from the previous step
sampled_data['time_since_session_start'] = scaler.fit_transform(sampled_data[['time_since_session_start']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_data['time_since_session_start'] = scaler.fit_transform(sampled_data[['time_since_session_start']])


In [8]:
from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoder
encoder = LabelEncoder()

# Encode the itemid
sampled_data['itemid'] = encoder.fit_transform(sampled_data['itemid'])
# Encode month
sampled_data['month'] = encoder.fit_transform(sampled_data['month'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_data['itemid'] = encoder.fit_transform(sampled_data['itemid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_data['month'] = encoder.fit_transform(sampled_data['month'])


In [9]:
sampled_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id,hour,day_of_week,month,hour_sin,hour_cos,day_sin,day_cos
3,2015-08-30 06:39:38.318,6,view,39276,1,6:START:20150830063938:DUR:01015s,0.0,3,6,6,3,1.0,6.123234000000001e-17,-0.781831,0.62349
4,2015-08-30 06:40:23.805,6,view,53453,1,6:START:20150830063938:DUR:01015s,0.009186,3,6,6,3,1.0,6.123234000000001e-17,-0.781831,0.62349
5,2015-08-30 06:54:09.385,6,view,53453,1,6:START:20150830063938:DUR:01015s,0.175914,3,6,6,3,1.0,6.123234000000001e-17,-0.781831,0.62349
6,2015-08-30 06:56:33.838,6,view,53453,1,6:START:20150830063938:DUR:01015s,0.205087,3,6,6,3,1.0,6.123234000000001e-17,-0.781831,0.62349
11,2015-07-02 23:16:15.173,22,view,13332,0,22:START:20150702231615:DUR:00023s,0.0,3,23,3,2,-0.258819,0.9659258,0.433884,-0.900969


In [10]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# data['itemid'] = scaler.fit_transform(data[['itemid']])

# Convert 'interaction_type_id' to categorical (one-hot encoded format)
data['interaction_type_id'] = to_categorical(data['interaction_type_id'])

# Assuming 'sampled_data' is already loaded and contains a 'timestamp' column
sampled_data.sort_values('timestamp', inplace=True)

# Calculate indices for splitting
total_count = len(sampled_data)
train_end = int(total_count * 0.80)  # Adjusted to use 80% of the data for training

# Split the data
train_data = sampled_data[:train_end]
test_data = sampled_data[train_end:]  # Rest of the data for testing

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_data.sort_values('timestamp', inplace=True)


In [11]:
# Verify the minimum sequence length in validation and test sets
min_test_length = test_data.groupby('global_session_id').size().min()

print(f"Minimum interactions in test sequences: {min_test_length}")


Minimum interactions in test sequences: 1


In [12]:
train_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,session_id,global_session_id,time_since_session_start,interaction_type_id,hour,day_of_week,month,hour_sin,hour_cos,day_sin,day_cos
384073,2015-05-03 03:00:04.384,693516,addtocart,46087,0,693516:START:20150503030004:DUR:00025s,0.0,1,3,6,0,0.707107,0.707107,-0.781831,0.62349
384074,2015-05-03 03:00:26.228,693516,view,46087,0,693516:START:20150503030004:DUR:00025s,0.004411,3,3,6,0,0.707107,0.707107,-0.781831,0.62349
384075,2015-05-03 03:00:29.427,693516,addtocart,46087,0,693516:START:20150503030004:DUR:00025s,0.005057,1,3,6,0,0.707107,0.707107,-0.781831,0.62349
144845,2015-05-03 03:01:07.992,260113,view,19463,0,260113:START:20150503030107:DUR:00362s,0.0,3,3,6,0,0.707107,0.707107,-0.781831,0.62349
50695,2015-05-03 03:01:14.535,90447,view,37466,0,90447:START:20150503030114:DUR:00116s,0.0,3,3,6,0,0.707107,0.707107,-0.781831,0.62349


 ### Group Data by Sessions:
 The data is grouped by global_session_id, which represents individual sessions. The agg(list) aggregates all the columns into lists, effectively creating a list of interactions for each session.

 ### Create Sequences for Training:
 Inputs: The interaction data is combined into a matrix (inputs) where each row corresponds to an interaction featuring the interaction type ID, item ID, and normalized time since session start.
 X (features): The sequences of interactions up to the second last are stored in X.
 y (labels/targets): The sequences from the second interaction onward are stored in y. This setup makes the RNN learn to predict the next interaction based on the previous ones

In [13]:

# Group data by 'global_session_id' and collect sequences
grouped = train_data.groupby('global_session_id').agg(list)

# Create sequences for training
X = []
y = []
for session_id, group in grouped.iterrows():
    inputs = np.column_stack((group['interaction_type_id'], group['itemid'], group['time_since_session_start'], group['hour_sin'], group['hour_cos'], group['day_sin'], group['day_cos']))
    X.append(inputs[:-1])
    y.append(inputs[1:])


### Pad Sequences:
Padding: Since sessions can have varying lengths, padding is applied to standardize their lengths, enabling batch processing by the neural network.

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences
X_padded = pad_sequences(X, padding='post', dtype='float32')
y_padded = pad_sequences(y, padding='post', dtype='float32')

### Define and Compile the RNN Model:

LSTM Layers: Two LSTM layers are used, both returning sequences to ensure that the model outputs a sequence that matches the input sequence in length.

TimeDistributed Layer: This applies a dense layer to each time step independently, predicting the interaction type, item ID, and time for each step.

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed
from tensorflow.keras.layers import GRU, Dropout


# # Define model architecture
# model = Sequential([
#     GRU(50, return_sequences=True, input_shape=(None, X_padded.shape[2])),
#     GRU(50, return_sequences=True),
#     TimeDistributed(Dense(X_padded.shape[2], activation='softmax'))  # Predict the next interaction
# ])

model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(None, X_padded.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=True))
model.add(TimeDistributed(Dense(X_padded.shape[2], activation='softmax')))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


### Training the Model:
The model is trained for 10 epochs, using 20% of the data for validation to monitor overfitting.

In [16]:
# Train the model
history = model.fit(X_padded, y_padded, epochs=25, validation_split=0.2)

Epoch 1/25
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.4762 - loss: -33505.4453 - val_accuracy: 0.4856 - val_loss: -40316.3047
Epoch 2/25
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.4851 - loss: -40092.9102 - val_accuracy: 0.4856 - val_loss: -40316.1016
Epoch 3/25
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.4846 - loss: -40165.1250 - val_accuracy: 0.4856 - val_loss: -40315.3242
Epoch 4/25
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.4820 - loss: -39949.6406 - val_accuracy: 0.4856 - val_loss: -40315.3516
Epoch 5/25
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.4837 - loss: -40047.5781 - val_accuracy: 0.4856 - val_loss: -40314.8008
Epoch 6/25
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.4832 - loss: -40056.20

In [20]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_padded, y_padded)
print(f"Test Accuracy: {test_accuracy:.2f}")

[1m3181/3181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.7028 - loss: -40131.9258
Test Accuracy: 0.70
