In [None]:
import os
import pprint
import tempfile
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
tf.get_logger().setLevel('INFO')
from typing import Dict, Text
from sklearn.model_selection import train_test_split

In [None]:
# Article metadata; article_id is forced to text so it is not parsed as a number
art_df = pd.read_csv(
    '../../data/articles_filled.csv',
    dtype={'article_id': str},
)

In [None]:
# Customer metadata, read with pandas' default dtype inference
cus_df = pd.read_csv(
    '../../data/customers_filled.csv',
)

In [None]:
# Full transaction log; every column is cast to text after loading.
# NOTE(review): article_id is parsed with the default dtype here before the cast, whereas
# art_df reads it directly as str — confirm both end up in the same format for the merge below.
inter = pd.read_csv('../../data/transactions_train.csv')
inter = inter.astype(str)

In [None]:
# Sort chronologically, then move the transaction date into the index
inter = inter.sort_values(by='t_dat')
inter = inter.set_index('t_dat')

In [None]:
# Parse the (string) date index into datetimes so date-range slicing works
inter.index = pd.to_datetime(
    inter.index,
    format='%Y-%m-%d',
)

In [None]:
# Window boundaries, all measured back from the most recent transaction date
latest_date = inter.index.max()
start_of_week = latest_date - pd.Timedelta(weeks=1)   # 7 days before the latest date
prior_6_weeks = latest_date - pd.Timedelta(weeks=6)   # 42 days before the latest date

In [None]:
# Test data: everything from start_of_week (7 days before the latest date) onwards.
# Train data: from prior_6_weeks (42 days before the latest date) up to, but NOT including,
# start_of_week. Label slicing with .loc is inclusive on both ends, so slicing
# inter.loc[prior_6_weeks:start_of_week] would put the start_of_week rows in both splits.
test_inter = inter.loc[start_of_week:]
train_inter = inter.loc[(inter.index >= prior_6_weeks) & (inter.index < start_of_week)]

In [None]:
# Free RAM: the full transaction frame is no longer needed once the two splits exist
del inter

In [None]:
# Collapse repeated purchases: keep one row per (customer_id, article_id) pair in each split
train_inter = train_inter.groupby(['customer_id', 'article_id'], as_index=False).count()[['customer_id', 'article_id']]
test_inter = test_inter.groupby(['customer_id', 'article_id'], as_index=False).count()[['customer_id', 'article_id']]

In [None]:
# Attach article attributes first, then customer attributes, to every interaction row.
# Left joins keep all interactions even when no metadata row matches.
train_df = train_inter.merge(art_df, on='article_id', how='left')
test_df = test_inter.merge(art_df, on='article_id', how='left')
train_df = train_df.merge(cus_df, on='customer_id', how='left')
test_df = test_df.merge(cus_df, on='customer_id', how='left')

In [None]:
# Free RAM: the merged frames replace these intermediates
del train_inter, test_inter, art_df, cus_df

In [None]:
def normalize_features(df):
    """Return a copy of ``df`` with every column cast to ``str``.

    The input frame is left untouched: the previous column-by-column assignment
    modified the caller's frame in place, which also triggers pandas'
    SettingWithCopyWarning when ``df`` is a slice of another frame.

    Args:
        df: DataFrame whose columns should all become strings.

    Returns:
        A new DataFrame with the same columns and index, all values as ``str``.
    """
    return df.astype(str)

In [None]:
# Columns fed to the user tower and to the product tower respectively
customer_features = [
    'customer_id',
    'postal_code',
    'club_member_status',
    'fashion_news_frequency',
]
product_features = [
    'article_id',
    'prod_name',
    'product_group_name',
    'product_type_name',
]

In [None]:
# Keep only the model's input columns (product features first, then customer features)
train_df = train_df.loc[:, product_features + customer_features]
test_df = test_df.loc[:, product_features + customer_features]

In [None]:
# Cast every feature column of both splits to str
train_df, test_df = (normalize_features(frame) for frame in (train_df, test_df))

In [None]:
# Preview the first five training rows
train_df.head()

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of column-name -> values dicts.

    Args:
        dataframe: Source frame; it is copied before conversion.
        shuffle: When true, shuffle with a buffer as large as the frame before batching.
        batch_size: Number of rows per batch.

    Returns:
        The batched dataset.
    """
    frame = dataframe.copy()
    dataset = tf.data.Dataset.from_tensor_slices(dict(frame))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(frame)).batch(batch_size)

In [None]:
# Rows per batch for the training/test datasets and the candidate dataset built below
batch_size = 128

In [None]:
# Training batches are shuffled; test batches keep the frame's row order
train_ds = df_to_dataset(train_df, shuffle=True, batch_size=batch_size)
test_ds = df_to_dataset(test_df, shuffle=False, batch_size=batch_size)

In [None]:
# Create embeddings
def create_embedding_for_feature(feature_name, vocabulary, embedding_dimension=32):
    """Build a string-lookup + embedding stack for one categorical feature.

    Args:
        feature_name: Name of the feature (not used inside the function).
        vocabulary: Iterable of known values; each is converted with ``str``.
        embedding_dimension: Width of the embedding vectors.

    Returns:
        A ``tf.keras.Sequential`` mapping string inputs to embedding vectors.
    """
    tokens = list(map(str, vocabulary))
    lookup_layer = tf.keras.layers.StringLookup(vocabulary=tokens, mask_token=None)
    # One extra row on top of the vocabulary size, as in the original table sizing
    embedding_layer = tf.keras.layers.Embedding(len(tokens) + 1, embedding_dimension)
    return tf.keras.Sequential([lookup_layer, embedding_layer])

In [None]:
# Width of each per-feature embedding vector
embedding_dims = 32

In [None]:
# One lookup+embedding stack per feature, with the vocabulary taken from the training data
embeddings = {
    feature: create_embedding_for_feature(
        feature,
        np.unique(train_df[feature].astype('object')),
        embedding_dimension=embedding_dims,
    )
    for feature in customer_features + product_features
}

In [None]:
class UserModel(tf.keras.Model):
    """User tower: per-feature embeddings, concatenated, then a 64-unit ReLU dense layer.

    The per-feature layers are taken from the module-level ``embeddings`` dict.
    """

    def __init__(self, feature_names):
        super().__init__()
        self.feature_models = [embeddings[name] for name in feature_names]
        self.feature_names = feature_names
        self.dense_layers = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu'),
            # Add more layers if needed
        ])

    def get_config(self):
        """Return the base config extended with ``feature_names``."""
        config = super().get_config()
        config["feature_names"] = self.feature_names
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model by passing the config entries as constructor keywords."""
        return cls(**config)

    def call(self, inputs):
        """Embed each configured feature from ``inputs`` and run the dense stack.

        ``inputs`` is indexed by feature name; embeddings are concatenated along axis 1.
        """
        per_feature = [
            layer(inputs[name])
            for name, layer in zip(self.feature_names, self.feature_models)
        ]
        return self.dense_layers(tf.concat(per_feature, axis=1))

class ProductModel(tf.keras.Model):
    """Product tower: per-feature embeddings, concatenated, then a 64-unit ReLU dense layer.

    The per-feature layers are taken from the module-level ``embeddings`` dict.
    """

    def __init__(self, feature_names):
        super().__init__()
        self.feature_models = [embeddings[name] for name in feature_names]
        self.feature_names = feature_names
        self.dense_layers = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu'),
            # Add more layers if needed
        ])

    def get_config(self):
        """Return the base config extended with ``feature_names``."""
        config = super().get_config()
        config["feature_names"] = self.feature_names
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model by passing the config entries as constructor keywords."""
        return cls(**config)

    def call(self, inputs):
        """Embed each configured feature from ``inputs`` and run the dense stack.

        ``inputs`` is indexed by feature name; embeddings are concatenated along axis 1.
        """
        per_feature = [
            layer(inputs[name])
            for name, layer in zip(self.feature_names, self.feature_models)
        ]
        return self.dense_layers(tf.concat(per_feature, axis=1))

    def compute_embeddings(self, products):
        """Map ``call`` over ``products`` (anything exposing ``.map``) and return the result."""
        return products.map(self.call)

In [None]:
# Instantiate one tower per side, each with its own feature list
user_model = UserModel(feature_names=customer_features)
product_model = ProductModel(feature_names=product_features)

In [None]:
# Prepare the unique product features for candidate embeddings
# Distinct product-feature rows from the training frame, cast to pandas' 'string' dtype
unique_products = train_df[product_features].drop_duplicates().astype('string')
# Column-name -> values dataset over those rows
product_features_ds = tf.data.Dataset.from_tensor_slices(dict(unique_products))
# Map the product tower over the batched rows; this is later passed to TwoTowerModel
# as the candidates of its FactorizedTopK metric
product_embeddings = product_model.compute_embeddings(product_features_ds.batch(batch_size))

In [None]:
class TwoTowerModel(tfrs.Model):
    """Retrieval model that pairs a user tower with a product tower.

    The retrieval task is created with a ``FactorizedTopK`` metric over the
    supplied ``product_embeddings`` candidates.
    """

    def __init__(self, user_model, product_model, product_embeddings):
        super().__init__()
        self.user_model = user_model
        self.product_model = product_model
        top_k_metric = tfrs.metrics.FactorizedTopK(candidates=product_embeddings)
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metric)

    def get_config(self):
        """Return the base config extended with the two tower objects."""
        config = super().get_config()
        config["user_model"] = self.user_model
        config["product_model"] = self.product_model
        # Since product_embeddings are likely not serializable, you might need to handle them differently
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model by passing the config entries as constructor keywords.

        NOTE(review): ``get_config`` does not store ``product_embeddings``, which
        ``__init__`` requires — confirm whether this path is ever exercised.
        """
        return cls(**config)

    def compute_loss(self, features, training=False):
        """Run both towers on ``features`` and return the retrieval task's loss.

        Metrics are computed only when ``training`` is false.
        """
        query_vectors = self.user_model(features)
        candidate_vectors = self.product_model(features)
        return self.task(query_vectors, candidate_vectors, compute_metrics=not training)

In [None]:
#Hyperparameters
# Step size handed to the Adagrad optimizer when the model is compiled
learning_rate = 0.5
# Number of passes over train_ds in model.fit
epochs = 1

In [None]:
# Assemble the two-tower model and compile it with Adagrad
model = TwoTowerModel(
    user_model=user_model,
    product_model=product_model,
    product_embeddings=product_embeddings,
)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate))

In [None]:
# Train on the shuffled training batches for the configured number of epochs
model.fit(train_ds, epochs=epochs)

In [None]:
# Persist each trained tower separately
model.user_model.save('../../models/tfrs_retrieval_user.model')
model.product_model.save('../../models/tfrs_retrieval_product.model')

In [None]:
# Reload both towers from disk.
# NOTE(review): UserModel/ProductModel are custom subclasses and no custom_objects is passed
# here — confirm the loaded objects still expose everything used later (e.g. compute_embeddings).
user_model = tf.keras.models.load_model('../../models/tfrs_retrieval_user.model')
product_model = tf.keras.models.load_model('../../models/tfrs_retrieval_product.model')

In [None]:
# Swap the reloaded towers into the two-tower model
model.user_model = user_model
model.product_model = product_model

In [None]:
# Evaluate on the held-out test batches; results come back as a dict
model.evaluate(test_ds, return_dict=True)

In [None]:
# Take the user and product towers back out of the combined model
user_model, product_model = model.user_model, model.product_model

In [None]:
# Build one row per unique article (product features only), in first-appearance order, so that
# embedding row i corresponds to train_df['article_id'].unique()[i] — the lookup that
# find_recommendations uses to turn top-k indices back into article ids.
# The previous version used every customer/article row of train_df and shuffled the dataset,
# so embedding rows did not line up with that lookup.
norm_counts = normalize_features(
    train_df[product_features].drop_duplicates(subset='article_id').copy()
)
# shuffle=False preserves the row order established above
counts_ds = df_to_dataset(norm_counts, shuffle=False)

In [None]:
#Updating product embeddings with trained product model to be used to generate recommendations
# compute_embeddings maps the product tower over each batch of counts_ds
new_prod_embeddings = product_model.compute_embeddings(counts_ds)

In [None]:
# Stack the per-batch embedding tensors into one tensor along axis 0
product_embeddings_tensor = tf.concat(list(new_prod_embeddings), axis=0)

In [None]:
#Selecting closest pairs based on top_k metric
def find_recommendations(user_embeddings, product_embeddings, product_df=train_df, k=10):
    """Return the top-``k`` article ids for every user embedding.

    Scores are the dot products between each user embedding and each product
    embedding. Indices of the ``k`` highest scores per user are translated to
    article ids through ``product_df['article_id'].unique()``.

    Args:
        user_embeddings: 2-D array/tensor with one row per user.
        product_embeddings: 2-D array/tensor with one row per candidate product.
            NOTE(review): rows are assumed to be in the same order as
            ``product_df['article_id'].unique()`` — callers must guarantee this.
        product_df: Frame providing the ``article_id`` column used for the lookup.
        k: Number of recommendations per user.

    Returns:
        A list with one array of ``k`` article ids per user.
    """
    scores = tf.matmul(user_embeddings, product_embeddings, transpose_b=True)
    _, top_k_indices = tf.math.top_k(scores, k=k)
    # Compute the id lookup once instead of once per user row
    article_ids = product_df['article_id'].unique()
    return [article_ids[row_indices] for row_indices in top_k_indices.numpy()]

In [None]:
# Print user info alongside the recommendations to see whether the users are truly similar
for batch in test_ds:
    # First 10 values of every customer feature in this batch
    print([(name, batch[name][:10]) for name in customer_features])
    user_embeddings = user_model(batch)
    recommended_products = find_recommendations(
        user_embeddings,
        product_embeddings_tensor,
        k=10,
    )
    # Recommendations for the first 10 users of the batch
    for recommendation in recommended_products[:10]:
        print(recommendation)

In [None]:
import sys
def batch_generator(transactions_path, customer_df_path, product_df_path, batch_size=100):
    """Yield per-chunk recommendation frames for all customers in ``customer_df_path``.

    Args:
        transactions_path: Not used by this function; kept for interface compatibility.
        customer_df_path: CSV of customers, read in chunks of ``batch_size`` rows.
        product_df_path: CSV of products, read once.
        batch_size: Chunk size for reading customers and batch size for the datasets.

    Yields:
        A DataFrame per customer chunk with columns ``customer_id`` and
        ``prediction`` (12 space-separated article ids).
    """
    # The product side is identical for every customer chunk, so prepare it once.
    product_df = normalize_features(pd.read_csv(product_df_path))
    # shuffle=False: embedding rows must stay in product_df row order because
    # find_recommendations maps indices through product_df['article_id'].unique().
    # NOTE(review): this assumes the product file has one row per article_id — confirm.
    product_ds = df_to_dataset(product_df, shuffle=False, batch_size=batch_size)
    product_embeddings = tf.concat(list(product_model.compute_embeddings(product_ds)), axis=0)

    # Process customers in batches
    for customer_df in pd.read_csv(customer_df_path, chunksize=batch_size):
        # Normalizing features of customers
        customer_df = normalize_features(customer_df)
        # shuffle=False: predictions are paired with customer_df['customer_id'] by position
        customer_ds = df_to_dataset(customer_df, shuffle=False, batch_size=batch_size)

        # Generate embeddings
        user_embeddings = user_model.predict(customer_ds)

        # Find recommendations
        recommended_product_ids = find_recommendations(
            user_embeddings, product_embeddings, product_df=product_df, k=12
        )
        # Plain Python join: np.apply_along_axis sizes its string output from the first row
        joined_product_ids = [' '.join(map(str, ids)) for ids in recommended_product_ids]

        # Prepare the results
        results = pd.DataFrame({
            'customer_id': customer_df['customer_id'],
            'prediction': joined_product_ids
        })

        # Yield the results for this batch
        yield results

In [None]:
# Usage
predictions_path = '../../data/predictions_test_1.csv'
generator = batch_generator('../../data/transactions_train.csv', '../../data/customers.csv', '../../data/articles.csv', batch_size=1_000)
for batch_number, predictions in enumerate(generator):
    # The first batch (re)creates the file and writes the header; later batches append rows only.
    # Appending with header=True on every batch repeated the header line inside the data,
    # and a re-run appended on top of the previous run's output.
    first_batch = batch_number == 0
    predictions.to_csv(
        predictions_path,
        mode='w' if first_batch else 'a',
        header=first_batch,
        index=False,
    )

In [None]:
#Adding 0 for each product id to fix the predictions per VJ proposal
# `results` only exists inside batch_generator, so load the predictions written above.
# NOTE(review): assumes predictions_test_1.csv is the intended input — confirm.
results = pd.read_csv('../../data/predictions_test_1.csv', dtype=str)
# Drop any header lines that were appended into the middle of the file
results = results[results['customer_id'] != 'customer_id'].reset_index(drop=True)
results['prediction'] = results['prediction'].apply(lambda x: ' '.join(['0' + p for p in x.split()]))

In [None]:
# Write the zero-prefixed predictions as the submission file (header row, no index column)
results.to_csv('../../data/tfrs_baseline_submission_fixed.csv', header=True, index=False)