In [1]:
import os

import implicit
import numpy as np
import pandas as pd
import scipy.sparse as sparse

In [2]:
# Folder that holds the CSV exports. Defaults to the original Desktop location;
# set the HM_DATA_DIR environment variable to run the notebook from elsewhere.
desktop_path = os.environ.get("HM_DATA_DIR", "/Users/erikagromule/Desktop")
file_path = os.path.join(desktop_path, "customers.csv")

# Load the customers table
data_cust = pd.read_csv(file_path)

In [3]:
# Folder that holds the CSV exports. Defaults to the original Desktop location;
# set the HM_DATA_DIR environment variable to run the notebook from elsewhere.
desktop_path = os.environ.get("HM_DATA_DIR", "/Users/erikagromule/Desktop")
file_path1 = os.path.join(desktop_path, "articles.csv")

# Load the articles table
data_art = pd.read_csv(file_path1)

In [4]:
# Folder that holds the CSV exports. Defaults to the original Desktop location;
# set the HM_DATA_DIR environment variable to run the notebook from elsewhere.
desktop_path = os.environ.get("HM_DATA_DIR", "/Users/erikagromule/Desktop")
file_path_tr = os.path.join(desktop_path, "transactions_train.csv")

# Load the transactions table
data_tr = pd.read_csv(file_path_tr)

In [5]:
# Earliest and latest values of 't_dat' found in the transactions
min_date, max_date = data_tr['t_dat'].min(), data_tr['t_dat'].max()

# Report the covered date range
print(f"Minimum date in the training data: {min_date}")
print(f"Maximum date in the training data: {max_date}")

Minimum date in the training data: 2018-09-20
Maximum date in the training data: 2020-09-22


In [6]:
# Parse 't_dat' once so date arithmetic and the .dt accessor work below
data_tr['t_dat'] = pd.to_datetime(data_tr['t_dat'])

# Per-customer purchase-history features, computed in a single groupby pass
# over the transactions instead of four separate ones. The result is indexed
# by customer_id and keeps the original column order.
customer_history = data_tr.groupby('customer_id').agg(
    total_purchases=('article_id', 'count'),
    unique_articles_purchased=('article_id', 'nunique'),
    last_purchase_date=('t_dat', 'max'),
    first_purchase_date=('t_dat', 'min'),
)

# Recency in whole days, measured from a fixed reference date
# (2020-09-23, i.e. the day after the latest transaction date printed above)
reference_date = pd.to_datetime('2020-09-23')
customer_history['days_since_last_purchase'] = (
    reference_date - customer_history['last_purchase_date']
).dt.days

# The +1 keeps the denominator strictly positive.
# NOTE(review): this divides the purchase count by *recency*, not by the
# length of the customer's purchase history; 'first_purchase_date' is computed
# above but never used. Confirm whether purchases per day of tenure was meant.
customer_history['purchase_frequency'] = (
    customer_history['total_purchases'] / (customer_history['days_since_last_purchase'] + 1)
)

print(customer_history.head())

                                                    total_purchases  \
customer_id                                                           
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d...               21   
0000423b00ade91418cceaf3b26c6af3dd342b51fd051ee...               86   
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad...               18   
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c...                2   
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe...               13   

                                                    unique_articles_purchased  \
customer_id                                                                     
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d...                         19   
0000423b00ade91418cceaf3b26c6af3dd342b51fd051ee...                         64   
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad...                         14   
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c...                          2   
00006413d8573cd2

In [7]:
# Show the first 20 rows of the per-customer history features
print(customer_history.head(20))

                                                    total_purchases  \
customer_id                                                           
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d...               21   
0000423b00ade91418cceaf3b26c6af3dd342b51fd051ee...               86   
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad...               18   
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c...                2   
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe...               13   
000064249685c11552da43ef22a5030f35a147f723d5b02...                3   
0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d8...                6   
00007d2de826758b65a93dd24ce629ed66842531df66993...              120   
00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef773...                2   
00008469a21b50b3d147c97135e25b4201a8c58997f7878...                4   
0000945f66de1a11d9447609b8b41b1bc987ba185a5496a...                3   
000097d91384a0c14893c09ed047a963c4fc6a5c021044e...               15   
00009c

In [8]:
# Order customers from the most to the fewest total purchases
customer_history_sorted = customer_history.sort_values('total_purchases', ascending=False)

# Show the 20 customers with the highest purchase counts
print(customer_history_sorted.head(20))

                                                    total_purchases  \
customer_id                                                           
be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee98...             1895   
b4db5e5259234574edfff958e170fe3a5e13b6f146752ca...             1441   
49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05b...             1364   
a65f77281a528bf5c1e9f270141d601d116e1df33bf9df5...             1361   
cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed6...             1237   
55d15396193dfd45836af3a6269a079efea339e875eff42...             1208   
c140410d72a41ee5e2e3ba3d7f5a860f337f1b5e41c27cf...             1170   
8df45859ccd71ef1e48e2ee9d1c65d5728c31c46ae957d6...             1169   
03d0011487606c37c1b1ed147fc72f285a50c05f00b9712...             1157   
6cc121e5cc202d2bf344ffe795002bdbf87178054bcda2e...             1143   
e34f8aa5e7c8c258523ea3e5f5f13168b6c21a9e8bffccd...             1117   
3493c55a7fe252c84a9a03db338f5be7afbce1edbca12f3...             1115   
0bf4c6

In [9]:
# Weekday name of each customer's most recent purchase, and the number of
# customers whose last purchase fell on each weekday (not total purchases).
customer_history['last_purchase_day_of_week'] = customer_history['last_purchase_date'].dt.day_name()
day_of_week_counts = customer_history['last_purchase_day_of_week'].value_counts()

# All seven weekdays are printed, so the heading describes the full ranking
# instead of claiming a "top 3".
top_days_of_week = day_of_week_counts.head(7)
print(f'Customers by day of the week of their last purchase:\n{top_days_of_week}')

Top 3 days of the week:
Saturday     212982
Thursday     208024
Friday       205473
Wednesday    199729
Tuesday      190500
Monday       181372
Sunday       164201
Name: last_purchase_day_of_week, dtype: int64


In [10]:
# Month name of each customer's most recent purchase
customer_history['last_purchase_month'] = customer_history['last_purchase_date'].dt.month_name()

# Number of customers per last-purchase month, most common month first
month_counts = customer_history['last_purchase_month'].value_counts()

# Keep (at most) twelve entries, i.e. every calendar month
top_months = month_counts.iloc[:12]
print(f'The months:\n{top_months}')

The months:
September    234465
August       200512
July         161800
June         143984
May           96054
April         91752
November      77858
February      75315
December      74571
March         69443
October       69321
January       67206
Name: last_purchase_month, dtype: int64


In [11]:
import pandas as pd

# Ensure 't_dat' holds datetimes before comparing it with the cutoff
data_tr['t_dat'] = pd.to_datetime(data_tr['t_dat'])

# Time-based split: rows strictly before the cutoff go to training,
# rows on or after the cutoff go to validation.
# NOTE(review): neither split is referenced by the later cells shown here.
cutoff_date = pd.to_datetime('2020-09-15')
train_data = data_tr.loc[data_tr['t_dat'].lt(cutoff_date)]
validation_data = data_tr.loc[data_tr['t_dat'].ge(cutoff_date)]

# Report (rows, columns) for each side of the split
print(f"Training set shape: {train_data.shape}")
print(f"Validation set shape: {validation_data.shape}")

Training set shape: (31521960, 5)
Validation set shape: (266364, 5)


In [12]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix

# One encoder per ID space; they are reused later to map IDs to matrix indices
customer_encoder, article_encoder = LabelEncoder(), LabelEncoder()

# Integer row index per customer and column index per article.
# NOTE(review): the encoders are fitted on the full data_tr, not on train_data.
data_tr['customer_index'] = customer_encoder.fit_transform(data_tr['customer_id'])
data_tr['article_index'] = article_encoder.fit_transform(data_tr['article_id'])

# Every transaction row contributes a weight of 1
data_tr['purchase_count'] = 1

# Sparse customers x articles matrix in COO form (values, (rows, cols))
user_item_matrix = coo_matrix(
    (
        data_tr['purchase_count'],
        (data_tr['customer_index'], data_tr['article_index']),
    )
)

In [13]:
import implicit

# Implicit-feedback ALS model. random_state fixes the random initialisation of
# the factor matrices so that re-running the notebook reproduces the same model.
als_model = implicit.als.AlternatingLeastSquares(
    factors=50,
    regularization=0.01,
    iterations=50,
    random_state=42,
)

# Train on the customers x articles interaction matrix.
# NOTE(review): confirm the installed implicit version expects a user x item
# matrix in fit(); older releases took item x user, which would swap
# user_factors and item_factors.
als_model.fit(user_item_matrix)

  check_blas_config()


  0%|          | 0/50 [00:00<?, ?it/s]

In [15]:
# Distinct customer IDs present in the transactions, and how many there are
unique_customer_ids = pd.unique(data_tr['customer_id'])
sum_of_unique_customers = len(unique_customer_ids)

print("Sum of unique customer IDs:", sum_of_unique_customers)

Sum of unique customer IDs: 1362281


In [15]:
def predict_top_n_for_customer(customer_id, model, user_factors, item_factors, n=12):
    """Return the `n` highest-scoring articles for one customer.

    Parameters
    ----------
    customer_id : raw customer ID, looked up with the module-level
        `customer_encoder`.
    model : unused; kept so existing calls keep working.
    user_factors : array indexed by encoded customer index, one latent
        vector per row.
    item_factors : array with one latent vector per encoded article index.
    n : number of recommendations requested. It is capped at the number of
        available items; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame with columns 'customer_id', 'article_id' and
    'prediction', ordered from highest to lowest score, or None when the
    customer ID is unknown to `customer_encoder`.
    """
    try:
        # Transform customer ID to index using the encoder
        customer_index = customer_encoder.transform([customer_id])[0]
    except (KeyError, ValueError):
        # scikit-learn's LabelEncoder signals unseen labels with ValueError;
        # KeyError is still accepted for encoders that raise it instead.
        print(f"Customer ID '{customer_id}' not found in training data.")
        return None

    # Extract the customer vector from the precomputed user factors
    customer_vector = user_factors[customer_index]

    # Score every item: dot product of each item vector with the customer vector
    predictions = item_factors.dot(customer_vector)

    # Never ask for more rows than there are items, and treat n <= 0 as "none"
    # (the previous `[-n:]` slice returned *all* items for n == 0).
    n = max(0, min(n, len(predictions)))

    # Indices of the top n scores, best first
    top_n_indices = predictions.argsort()[::-1][:n]

    # Map the encoded item indices back to article IDs so the result is usable
    return pd.DataFrame({
        'customer_id': [customer_id] * n,
        'article_id': article_encoder.inverse_transform(top_n_indices),
        'prediction': predictions[top_n_indices],
    })

In [None]:
# Score all customers in mini-batches. Calling the per-customer helper in a
# Python loop means one encoder lookup and one small DataFrame per customer
# (1,362,281 of them); here the IDs are encoded once and ranking is vectorised.
TOP_N = 12
BATCH_SIZE = 256  # customers scored at once; bounds the (batch x n_items) score matrix

# Get the unique customer IDs from the dataset and their encoded row indices
all_customer_ids = data_tr['customer_id'].unique()
all_customer_indices = customer_encoder.transform(all_customer_ids)

# assumes the factor matrices are NumPy arrays (CPU model) — TODO confirm
user_factors = als_model.user_factors
item_factors = als_model.item_factors

# Cannot recommend more items than exist
top_n = min(TOP_N, item_factors.shape[0])

# One DataFrame of predictions per batch
all_predictions = []

for start in range(0, len(all_customer_ids), BATCH_SIZE):
    stop = start + BATCH_SIZE

    # Scores of every item for every customer in this batch
    batch_scores = user_factors[all_customer_indices[start:stop]] @ item_factors.T

    # Select the top_n candidates per row without a full sort, then order
    # just those candidates from highest to lowest score
    candidate_idx = np.argpartition(batch_scores, -top_n, axis=1)[:, -top_n:]
    candidate_scores = np.take_along_axis(batch_scores, candidate_idx, axis=1)
    order = np.argsort(-candidate_scores, axis=1)
    top_idx = np.take_along_axis(candidate_idx, order, axis=1)
    top_scores = np.take_along_axis(candidate_scores, order, axis=1)

    # top_n consecutive rows per customer, with encoded item indices mapped
    # back to article IDs
    all_predictions.append(pd.DataFrame({
        'customer_id': np.repeat(all_customer_ids[start:stop], top_n),
        'article_id': article_encoder.inverse_transform(top_idx.ravel()),
        'prediction': top_scores.ravel(),
    }))

# Concatenate predictions for all customers into a single DataFrame
final_predictions_df = pd.concat(all_predictions, ignore_index=True)

# Display or analyze the final predictions
print(final_predictions_df.head())

Literature review:
Li S. (2022) Building A Recommender System With Implicit Feedback Datasets Using Alternating Least Squares. Retrieved from: https://actsusanli.medium.com/building-a-recommender-system-with-implicit-feedback-datasets-using-alternating-least-squares-64d4f5ba3c57

Kriplani H. (2019) Alternating Least Square for Implicit Dataset with code. Retrieved from: https://towardsdatascience.com/alternating-least-square-for-implicit-dataset-with-code-8e7999277f4b

Victor (2017) ALS Implicit Collaborative Filtering. Retrieved from: https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

Sefidian Academy Source (2023) Implicit Recommender Systems with Alternating Least Squares. Retrieved from: https://www.sefidian.com/2021/02/04/implicit-recommender-systems-with-alternating-least-squares/

Takacs G. (2012) Alternating least squares for personalized ranking. Retrieved from: https://www.researchgate.net/publication/254464370_Alternating_least_squares_for_personalized_ranking

Lundquist E. (2020) Factorization Machines for Item Recommendation with Implicit Feedback Data. Retrieved from: https://towardsdatascience.com/factorization-machines-for-item-recommendation-with-implicit-feedback-data-5655a7c749db