In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Concatenate, Flatten
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import os
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.models import load_model

In [None]:
# Colab only: attach Google Drive so files under /content/drive become reachable.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the sessions CSV on the mounted Drive; change it to match your own Drive layout.
csv_path = '/content/drive/MyDrive/retailrocket/filtered_sessions.csv'

# Read the whole file into memory and preview the first rows.
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,category,global_session_id,interaction_type_id,time_since_last_interaction_logscaled,item_preference_logscaled,item_popularity_logscaled,time_since_session_start_logscaled,session_duration_logscaled
0,2015-09-11 20:49:49.439,0,view,285930,1188,0:START:20150911204949:DUR:00327s,3,0.18986,0.080446,0.229065,0.289316,0.468937
1,2015-09-11 20:52:39.591,0,view,357564,256,0:START:20150911204949:DUR:00327s,3,0.0,0.0,0.883486,0.0,0.0
2,2015-09-11 20:55:17.175,0,view,67045,333,0:START:20150911204949:DUR:00327s,3,0.0,0.0,0.738648,0.0,0.0
3,2015-08-07 17:51:44.567,2,view,325215,299,2:START:20150807175144:DUR:01753s,3,0.244409,0.298417,0.684853,0.657039,0.677851
4,2015-08-07 17:53:33.790,2,view,325215,299,2:START:20150807175144:DUR:01753s,3,0.0,0.0,0.287558,0.0,0.0


In [None]:
# Parse timestamps, then order events chronologically within each visitor.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data.sort_values(by=['visitorid', 'timestamp'], inplace=True)

# Visitor IDs -> contiguous integer codes.
user_encoder = LabelEncoder()
data['user_id_encoded'] = user_encoder.fit_transform(data['visitorid'])

# Item IDs -> contiguous integer codes (the encoder is fitted on the distinct item IDs).
# NOTE(review): LabelEncoder codes start at 0, and prepare_session_data pads sequences
# with 0 as well, so padding is indistinguishable from the first item/category code.
item_types = data['itemid'].unique().tolist()
item_encoder = LabelEncoder().fit(item_types)
data['itemid_encoded'] = item_encoder.transform(data['itemid'])

# Category IDs -> contiguous integer codes, same procedure as for items.
category_types = data['category'].unique().tolist()
category_encoder = LabelEncoder().fit(category_types)
data['category_encoded'] = category_encoder.transform(data['category'])

In [None]:
# Keep events in per-visitor chronological order (re-sorting is harmless if already sorted).
data.sort_values(by=['visitorid', 'timestamp'], inplace=True)

# Temporal cutoffs taken from the event-timestamp distribution:
# the earliest 70% of events bound the training window, the next 15% the validation window.
train_cutoff = data['timestamp'].quantile(0.70)
val_test_cutoff = data['timestamp'].quantile(0.85)

# Assign every session to exactly ONE split, keyed on the time of its first event.
# Selecting sessions by "has any event inside the window" would place a session that
# straddles a cutoff into two splits at once (all of its rows would be copied into
# both), leaking evaluation data into an earlier split.
session_start = data.groupby('global_session_id')['timestamp'].min()
starts_in_train = session_start < train_cutoff
starts_in_val = (session_start >= train_cutoff) & (session_start < val_test_cutoff)
starts_in_test = session_start >= val_test_cutoff

train_sessions = session_start[starts_in_train].index.to_numpy()
# Validation sessions; the historical variable name is kept for compatibility.
val_test_sessions = session_start[starts_in_val].index.to_numpy()
test_sessions = session_start[starts_in_test].index.to_numpy()

# Create train, validation, and test datasets (each session's rows stay together).
train_data = data[data['global_session_id'].isin(train_sessions)]
val_data = data[data['global_session_id'].isin(val_test_sessions)]
test_data = data[data['global_session_id'].isin(test_sessions)]


In [None]:
# How many distinct sessions ended up in the training split.
unique_sessions_train = train_data['global_session_id'].nunique()
print("Number of unique sessions in the train dataset:", unique_sessions_train)

# Longest session (in events) over the full dataset; used later as the padding length.
session_lengths = data.groupby(['visitorid', 'global_session_id']).size()
max_sequence_length = session_lengths.max()
print(f"Maximum sequence length in the dataset: {max_sequence_length}")

Number of unique sessions in the train dataset: 118983
Maximum sequence length in the dataset: 99


In [None]:

def prepare_session_data(data, max_length,
                         feature_cols=('itemid_encoded', 'category_encoded',
                                       'item_popularity_logscaled')):
    """
    Prepares session data for input into an RNN model.

    Every session with n events yields n - 1 training samples: the prefix
    events[0..i] is the input and the first feature of event i + 1 is the target.
    Prefixes shorter than `max_length` are left-padded with zeros; longer ones keep
    only their last `max_length` steps.

    Rows are consumed in the order they appear inside each
    ('visitorid', 'global_session_id') group, so `data` should already be sorted
    chronologically.

    Args:
    - data (DataFrame): The DataFrame containing session data with interaction features.
      Must contain 'visitorid', 'global_session_id' and every column in `feature_cols`.
    - max_length (int): The sequence length every sample is padded/truncated to (>= 1).
    - feature_cols (sequence of str): Columns used as per-step features. The FIRST
      column is also the prediction target. Defaults to the item, category and
      popularity columns.

    Returns:
    - X (ndarray): Input features, shape (n_samples, max_length, len(feature_cols)).
    - y (ndarray): Target values (first feature of the next event), shape (n_samples,).
      Both arrays are empty but correctly shaped when no session has two or more events.

    Raises:
    - ValueError: If `max_length` is smaller than 1.
    """
    # A zero length would make `seq[-0:]` return the WHOLE prefix and produce ragged samples.
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    feature_cols = list(feature_cols)
    X, y = [], []

    # Iterate over each group (session)
    for _, session_df in data.groupby(['visitorid', 'global_session_id']):
        features = session_df[feature_cols].values

        # One sample per prefix: events[0..i] -> first feature of event i + 1
        for i in range(len(features) - 1):
            # Keep at most the last `max_length` steps of the prefix ...
            window = features[max(0, i + 1 - max_length):i + 1]
            # ... and left-pad with zeros up to the fixed length.
            n_pad = max_length - len(window)
            X.append(np.pad(window, ((n_pad, 0), (0, 0)), 'constant'))
            y.append(features[i + 1, 0])

    if not X:
        # np.array([]) would be 1-D; keep the rank the model input expects.
        return np.empty((0, max_length, len(feature_cols))), np.empty((0,))

    return np.array(X), np.array(y)



In [None]:
# Turn each split into padded prefix sequences (X) and next-item targets (y).
# All three splits share the same padding length.
X_train, y_train = prepare_session_data(train_data, max_sequence_length)
X_val, y_val = prepare_session_data(val_data, max_sequence_length)
X_test, y_test = prepare_session_data(test_data, max_sequence_length)



In [None]:
# Vocabulary sizes come from the fitted label encoders
num_items = len(item_encoder.classes_)
num_categories = len(category_encoder.classes_)

# One input tensor of shape (batch, time, 3): [item code, category code, popularity].
# The time dimension is left as None so any sequence length is accepted.
feature_input = Input(shape=(None, 3), dtype='float32', name='feature_input')

# Embed item IDs; 'itemid_encoded' is the first feature in the sequence
# NOTE(review): sequences built by prepare_session_data are left-padded with 0, which is
# also a valid encoder code, so padded steps are embedded like item/category 0.
item_embedding_layer = Embedding(input_dim=num_items, output_dim=50, name='item_embedding')
item_embedding = item_embedding_layer(feature_input[:, :, 0])

# Embed categories; 'category_encoded' is the second feature in the sequence
category_embedding_layer = Embedding(input_dim=num_categories, output_dim=25, name='category_embedding')  # Smaller embedding size for categories
category_embedding = category_embedding_layer(feature_input[:, :, 1])

# Concatenate item and category embeddings with the remaining numerical features
concat_features = Concatenate()([item_embedding, category_embedding, feature_input[:, :, 2:]])

# GRU over the sequence; only the final state is kept (return_sequences=False)
gru_layer = GRU(128, return_sequences=False)(concat_features)

# Softmax over all items to predict the next item ID
output_layer = Dense(num_items, activation='softmax')(gru_layer)

# Construct the model; targets are integer item codes, hence the sparse loss
model = Model(inputs=feature_input, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Reduce the learning rate when val_loss stalls. Its patience must be SHORTER than the
# early-stopping patience: with equal values the reduction normally fires on the very
# epoch at which training is stopped, so the lower rate is never actually used.
lr_reduction = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1)

# Print the model summary to understand its structure
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 feature_input (InputLayer)  [(None, None, 3)]            0         []                            
                                                                                                  
 tf.__operators__.getitem (  (None, None)                 0         ['feature_input[0][0]']       
 SlicingOpLambda)                                                                                 
                                                                                                  
 tf.__operators__.getitem_1  (None, None)                 0         ['feature_input[0][0]']       
  (SlicingOpLambda)                                                                               
                                                                                              

In [None]:

# Root folder for training artefacts. A hard-coded local Windows path does not exist on
# Colab (it would be created as a relative folder on ephemeral storage), so the default
# is the mounted Drive folder that also holds the dataset and the final model.
# Set the RS_REPO_PATH environment variable to store checkpoints somewhere else.
repo_path = os.environ.get("RS_REPO_PATH", "/content/drive/MyDrive/retailrocket")
model_dir = os.path.join(repo_path, "models")
os.makedirs(model_dir, exist_ok=True)  # Ensure the directory exists

# Weights-only checkpoint file (note the '.weights.h5' suffix)
checkpoint_filepath = os.path.join(model_dir, 'item_model_checkpoint.weights.h5')

# Save the weights every time validation accuracy reaches a new maximum
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1
)




In [None]:

# Train with all three callbacks: LR reduction, best-weights checkpointing, early stopping.
training_callbacks = [lr_reduction, checkpoint_callback, early_stopping]

history = model.fit(
    x=[X_train],
    y=y_train,
    validation_data=([X_val], y_val),
    batch_size=64,
    epochs=10,
    callbacks=training_callbacks,
    verbose=1,
)



Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.06324, saving model to C:/Users/nafla/OneDrive/Documents/GitHub/Data-generator-for-RS/models/item_model_checkpoint.weights.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.06324 to 0.09665, saving model to C:/Users/nafla/OneDrive/Documents/GitHub/Data-generator-for-RS/models/item_model_checkpoint.weights.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.09665 to 0.11242, saving model to C:/Users/nafla/OneDrive/Documents/GitHub/Data-generator-for-RS/models/item_model_checkpoint.weights.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.11242 to 0.11671, saving model to C:/Users/nafla/OneDrive/Documents/GitHub/Data-generator-for-RS/models/item_model_checkpoint.weights.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.11671 to 0.11897, saving model to C:/Users/nafla/OneDrive/Documents/GitHub/Data-generator-for-RS/models/item_model_checkpoint.weights.h5
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.11897
Epoch 7/10


In [None]:

# Save the complete model (architecture + weights) to Drive
complete_model_filepath = '/content/drive/MyDrive/retailrocket/final_item_model.h5'
model.save(complete_model_filepath)

# Nothing is loaded in this cell, so the message only reports the save.
print("Model saved successfully.")

  saving_api.save_model(


Model loaded and saved successfully.


In [None]:


# Restore the saved model (architecture + weights) from Drive for inference.
loaded_model = load_model('/content/drive/MyDrive/retailrocket/final_item_model.h5')

# Score the held-out test split; evaluate() returns the loss followed by the compiled metric.
test_metrics = loaded_model.evaluate([X_test], y_test, verbose=1)
test_loss, test_accuracy = test_metrics

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Test Loss: 9.331269264221191
Test Accuracy: 0.0930633470416069


In [None]:
# Re-import and reload so this cell can be run on its own.
from tensorflow.keras.models import load_model

saved_model_path = '/content/drive/MyDrive/retailrocket/final_item_model.h5'
loaded_model = load_model(saved_model_path)

In [None]:
# Size of one tenth of the test set. The variable is called `quarter_index` for
# historical reasons and later cells rely on that name; the divisor is 10.
quarter_index = len(X_test) // 10

# First tenth of the test inputs.
X_test_one_tenth = X_test[0:quarter_index]

# Predict on this slice only; the full test set was too expensive to score in one go.
predictions_one_tenth = loaded_model.predict(X_test_one_tenth)





In [None]:
# Targets matching the predicted slice. `quarter_index` is len(X_test) // 10, i.e. this
# is the first TENTH of the test data; the "quarter" names are kept for later cells.
y_test_first_quarter = y_test[:quarter_index]

# Most probable class per sample from the softmax outputs
predicted_classes_first_quarter = np.argmax(predictions_one_tenth, axis=1)

# A hit is a prediction equal to the true next item
hits_first_quarter = np.sum(predicted_classes_first_quarter == y_test_first_quarter)

# Hit rate = hits / samples; an empty slice yields 0.0 instead of dividing by zero
n_evaluated = len(y_test_first_quarter)
hit_rate_first_quarter = hits_first_quarter / n_evaluated if n_evaluated else 0.0

# Print the hit rate (the label now states the real slice size)
print("Hit Rate for the first tenth of the test data:", hit_rate_first_quarter)


Hit Rate for the first quarter of the test data: 0.09526741731175228


In [None]:
def calculate_top_k_hit_rate(y_true, predictions, k=5):
    """
    Fraction of samples whose true label is among the k highest-scored classes.

    Args:
    - y_true (sequence): True class indices, one per sample.
    - predictions (sequence of 1-D arrays): Per-sample class scores (e.g. softmax
      outputs); they do NOT need to be sorted.
    - k (int): Number of top-scored classes that count as a hit (>= 1). Values larger
      than the number of classes behave like "all classes".

    Returns:
    - float: hits / len(y_true), or 0.0 when there are no samples.

    Raises:
    - ValueError: If k < 1 or if y_true and predictions differ in length.
    """
    # k = 0 would make the slice `[-0:]` select EVERY class and report a 100% hit rate.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    n_samples = len(y_true)
    if n_samples != len(predictions):
        raise ValueError(
            f"y_true has {n_samples} samples but predictions has {len(predictions)}"
        )
    if n_samples == 0:
        return 0.0

    top_k_hits = 0
    for true_label, scores in zip(y_true, predictions):
        scores = np.asarray(scores)
        n_classes = scores.shape[0]
        if k >= n_classes:
            # Every class is in the top k
            top_k_predictions = np.arange(n_classes)
        else:
            # argpartition finds the k largest scores without fully sorting the row
            top_k_predictions = np.argpartition(scores, -k)[-k:]
        if true_label in top_k_predictions:
            top_k_hits += 1
    return top_k_hits / n_samples

# Share of evaluated samples whose true next item is among the 5 highest-scored items.
top_5_hit_rate = calculate_top_k_hit_rate(
    y_test_first_quarter,
    predictions_one_tenth,
    k=5,
)
print(f"Top-5 Hit Rate: {top_5_hit_rate:.2%}")


Top-5 Hit Rate: 20.51%


In [None]:
# Top-10 hit rate on the same slice; stored under its own name so the top-5 result
# computed earlier is not overwritten.
top_10_hit_rate = calculate_top_k_hit_rate(y_test_first_quarter, predictions_one_tenth, k=10)
print(f"Top-10 Hit Rate: {top_10_hit_rate:.2%}")

Top-10 Hit Rate: 26.14%


In [None]:
# Top-20 hit rate on the same slice; stored under its own name so the top-5 result
# computed earlier is not overwritten.
top_20_hit_rate = calculate_top_k_hit_rate(y_test_first_quarter, predictions_one_tenth, k=20)
print(f"Top-20 Hit Rate: {top_20_hit_rate:.2%}")

Top-20 Hit Rate: 32.20%
