In [2]:
pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=804891 sha256=94fb514e2af21e203116dc13c3725716e6f21075e468f69851aa1f356df0158b
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


# Testing LightFM on test data

In [3]:
import numpy as np
from lightfm.datasets import fetch_movielens

data = fetch_movielens(min_rating=5.0)

print(repr(data['train']))
print(repr(data['test']))

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 19048 stored elements in COOrdinate format>
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 2153 stored elements in COOrdinate format>


In [7]:
from lightfm import LightFM

model = LightFM(loss='warp')
%time model.fit(data['train'], epochs=30, num_threads=4)

CPU times: user 485 ms, sys: 528 µs, total: 486 ms
Wall time: 410 ms


<lightfm.lightfm.LightFM at 0x7be94c0f7be0>

In [9]:
from lightfm.evaluation import precision_at_k

In [10]:
# Measuring precision of train and test data
print("Train precision: %.2f" % precision_at_k(model, data['train'], k=5).mean())
print("Test precision: %.2f" % precision_at_k(model, data['test'], k=5).mean())

Train precision: 0.39
Test precision: 0.05


For an alternative way of judging the model, we can sample a couple of users and get their recommendations. To make predictions for given user, we pass the id of that user and the ids of all products we want predictions for into the predict method.

In [11]:
# Sampling some users and getting their recommendations
def sample_recommendation(model, data, user_ids):

    n_users, n_items = data['train'].shape

    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:3]:
            print("        %s" % x)

# model = LightFM(loss='warp')
# data = fetch_movielens(min_rating=5.0)
# [3, 25, 450]
sample_recommendation(model, data, [3, 25, 450])


User 3
     Known positives:
        C
        A
        I
     Recommended:
        S
        C
        C
User 25
     Known positives:
        F
        G
        L
     Recommended:
        F
        L
        T
User 450
     Known positives:
        E
        S
        C
     Recommended:
        G
        C
        A


# Preprocessing H&M data

In [1]:
import pandas as pd

art_df = pd.read_csv('/content/drive/MyDrive/data/articles.csv')
cus_df = pd.read_csv('/content/drive/MyDrive/data/customers.csv')
transactions  = pd.read_csv('/content/drive/MyDrive/data/transactions_train.csv')

#Harmonizing class names and filling in missing values
cus_df.club_member_status     = cus_df.club_member_status.fillna('NONE')
cus_df.fashion_news_frequency = cus_df.fashion_news_frequency.fillna('NONE')
cus_df.club_member_status     = cus_df.club_member_status.replace('None', 'NONE')
cus_df.fashion_news_frequency = cus_df.fashion_news_frequency.replace('None','NONE')

#Replacing missing age values with average age
cus_df['age'].fillna(int(cus_df['age'].mean()), inplace=True)

#Replacing missing values
cus_df.FN.fillna(0.0, inplace=True)
cus_df.Active.fillna(0.0, inplace=True)
#cus_df.head()

#Filling in missing plaintext descriptions
art_df.detail_desc = art_df.detail_desc.fillna('')

# Transaction data contain price variable
# It is valuable to know how this varies for each article
art_grp_data = transactions.groupby('article_id').agg(
        {
        'price':['mean', 'std', 'sem', 'min', 'max'],
        't_dat':'count'
        }
    )
display(art_grp_data)

#filling in missing std values
art_grp_data[[('price', 'std'), ('price','sem')]].fillna(0.0, inplace=True)

#Adding mean price variable to the articles data
art_price = art_grp_data[('price', 'mean')].reset_index()
art_price.columns = art_price.columns.droplevel(1)
art_price.rename(columns={'price':'avg_price'}, inplace=True)

#adding average price for product that were bought at least 1 time
art_df = art_df.merge(art_price, on='article_id', how='left')
#filling empty average price values with 0 - these products were never purchased
art_df.avg_price.fillna(0.0, inplace=True)
#art_df.head()



Unnamed: 0_level_0,price,price,price,price,price,t_dat
Unnamed: 0_level_1,mean,std,sem,min,max,count
article_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
108775015,0.008142,0.000652,0.000006,0.001339,0.009153,10841
108775044,0.008114,0.000735,0.000009,0.001424,0.008508,7250
108775051,0.004980,0.000544,0.000037,0.003237,0.008458,215
110065001,0.020219,0.006844,0.000212,0.002525,0.025407,1044
110065002,0.018205,0.006004,0.000259,0.004542,0.025407,539
...,...,...,...,...,...,...
952267001,0.014982,0.002385,0.000372,0.010153,0.016932,41
952938001,0.048006,0.004300,0.001433,0.040661,0.050831,9
953450001,0.016836,0.000395,0.000096,0.015305,0.016932,17
953763001,0.021908,0.000251,0.000042,0.021169,0.022017,35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  art_grp_data[[('price', 'std'), ('price','sem')]].fillna(0.0, inplace=True)


In [2]:
# art_df
# cus_df
# Renaming transactions to inter
inter = transactions
del transactions

In [3]:
#Ordering by date
inter = inter.sort_values(by='t_dat').set_index('t_dat')

#Setting index to datetime to filter using date ranges
inter.index = pd.to_datetime(inter.index, format='%Y-%m-%d')

#Calculating dates to narrow-down the dataset
latest_date = inter.index.max()
start_of_week = latest_date - pd.Timedelta(days=7)
prior_6_weeks = latest_date - pd.Timedelta(days=42)

#Keeping test data as last week and train data as last 12 month (excluding last week)
test_inter = inter.loc[start_of_week:]
train_inter = inter.loc[prior_6_weeks:start_of_week]
#del inter # Free RAM

#Grouping by customer-article combinations to keep only unique interactions
train_inter = train_inter.groupby(['customer_id', 'article_id']).count().reset_index()[['customer_id', 'article_id']]
test_inter = test_inter.groupby(['customer_id', 'article_id']).count().reset_index()[['customer_id', 'article_id']]
#full_inter = inter.groupby(['customer_id', 'article_id']).count().reset_index()[['customer_id', 'article_id']]

#Adding customer and product attributes
train_df = train_inter.merge(art_df, left_on='article_id', right_on='article_id', how='left')
test_df = test_inter.merge(art_df, left_on='article_id', right_on='article_id', how='left')
#full_df = full_inter.merge(art_df, left_on='article_id', right_on='article_id', how='left')

train_df = train_df.merge(cus_df, left_on='customer_id', right_on='customer_id', how='left')
test_df = test_df.merge(cus_df, left_on='customer_id', right_on='customer_id', how='left')
#full_df = full_df.merge(cus_df, left_on='customer_id', right_on='customer_id', how='left')

#Free RAM
#del train_inter
#del test_inter
#del art_df
#del cus_df

In [4]:
def normalize_features(df):
    for column in df.columns:
        df[column] = df[column].astype(str)
    return df

#print(train_df.info())
#print(test_df.info())

In [2]:
import pandas as pd

# Specify dtype for article_id column to read it as a string
#dtype_dict = {'article_id': str}
#inter_df = pd.read_csv('/content/drive/MyDrive/data/transactions_train.csv', dtype=dtype_dict)
inter_df = pd.read_csv('/content/drive/MyDrive/data/transactions_train.csv')

#inter_df = inter_df.sort_values(by='t_dat').set_index('t_dat')
#inter_df.index = pd.to_datetime(inter_df.index, format='%Y-%m-%d')
#latest_date = inter_df.index.max()
#start_of_week = latest_date - pd.Timedelta(days=7)
#prior_6_weeks = latest_date - pd.Timedelta(days=42)

inter_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [5]:
# WORKS ON 0.1% OF DATA using ~4 GB RAM and generating results for 30k customers
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Assuming inter_df is your full dataset

# Initialize the LabelEncoders
customer_encoder = LabelEncoder()
article_encoder = LabelEncoder()

# Sample 0.1% of the data for training
train_df_sampled = inter_df.sample(frac=0.001)

# Encoding user and item identifiers on the sampled data
customer_encoder.fit(train_df_sampled['customer_id'])
article_encoder.fit(train_df_sampled['article_id'])

# Fit and transform customer_id and article_id for the sampled data
encoded_customer_id_sampled = customer_encoder.transform(train_df_sampled['customer_id'])
encoded_article_id_sampled = article_encoder.transform(train_df_sampled['article_id'])

# Prepare the dataset with LightFM's format for the sampled data
dataset_sampled = Dataset()
dataset_sampled.fit(encoded_customer_id_sampled, encoded_article_id_sampled)

# Building the interaction matrix for the sampled data
(interactions_sampled, weights_sampled) = dataset_sampled.build_interactions(zip(encoded_customer_id_sampled, encoded_article_id_sampled))

# Fit the model on the sampled data
model = LightFM(loss='warp')
model.fit(interactions_sampled, sample_weight=weights_sampled, epochs=10)

# Define the batched_sample_recommendation function
def batched_sample_recommendation(model, dataset, customer_encoder, article_encoder, batch_size=1000):
    n_users, n_items = dataset.interactions_shape()
    all_predictions = []

    for start_index in range(0, n_users, batch_size):
        end_index = min(start_index + batch_size, n_users)
        batch_customer_idxs = np.arange(start_index, end_index)

        all_item_idxs = np.tile(np.arange(n_items), (len(batch_customer_idxs), 1))
        batch_customer_idxs_expanded = np.repeat(batch_customer_idxs[:, np.newaxis], n_items, axis=1)

        # Predict scores for the batch
        batch_scores = model.predict(batch_customer_idxs_expanded.flatten(), all_item_idxs.flatten()).reshape(len(batch_customer_idxs), n_items)

        # Get top 12 items for each user in the batch
        top_items_idxs = np.argpartition(-batch_scores, 11, axis=1)[:, :12]

        # Flatten, inverse transform, and reshape
        top_items_flat = article_encoder.inverse_transform(top_items_idxs.flatten())
        top_items_reshaped = top_items_flat.reshape(-1, 12)

        all_predictions.extend(top_items_reshaped)

    return all_predictions


# Define the create_predictions_df function
def create_predictions_df(customer_encoder, predictions):
    customer_ids = customer_encoder.classes_
    formatted_predictions = []

    for pred in predictions:
        # Format each article_id with leading zeros to make it 10 digits long
        formatted_pred = ["{:010d}".format(int(article)) for article in pred]
        formatted_predictions_str = ' '.join(formatted_pred)
        formatted_predictions.append(formatted_predictions_str)

    return pd.DataFrame({'customer_id': customer_ids, 'prediction': formatted_predictions})

# Use the batched_sample_recommendation function for prediction
top_items_predictions = batched_sample_recommendation(model, dataset_sampled, customer_encoder, article_encoder, batch_size=1000)

# Create the final DataFrame for all customers
predictions_df = create_predictions_df(customer_encoder, top_items_predictions)

# Now you can use predictions_df as needed
predictions_df.head()

Unnamed: 0,customer_id,prediction
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0316657001 0504154022 0580684001 0307239001 06...
1,000608ab13228c9d4f90f2e7e7dfd3b0c280d84ae817bf...,0664949002 0504154022 0571197007 0659460001 05...
2,00079f6287599e8e3f3558e6787e8a2a241e6536b15a7a...,0541308022 0300024063 0108775051 0599580072 06...
3,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0675139002 0456163028 0741342005 0579541035 05...
4,0008d30a148478dc88c69af6c51230ad5802590afc8488...,0581588009 0300024063 0316657001 0619884020 05...


In [6]:
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30501 entries, 0 to 30500
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  30501 non-null  object
 1   prediction   30501 non-null  object
dtypes: object(2)
memory usage: 476.7+ KB


In [7]:
import pandas as pd

# Specify dtype for article_id column to read it as a string
#dtype_dict = {'article_id': str}
#inter_df = pd.read_csv('/content/drive/MyDrive/data/transactions_train.csv', dtype=dtype_dict)
inter_df = pd.read_csv('/content/drive/MyDrive/data/transactions_train.csv')

import numpy as np
#import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Assuming inter_df is your full dataset

# Initialize the LabelEncoders
customer_encoder = LabelEncoder()
article_encoder = LabelEncoder()

# Encode all customer_ids and article_ids
encoded_customer_ids = customer_encoder.fit_transform(inter_df['customer_id'])
encoded_article_ids = article_encoder.fit_transform(inter_df['article_id'])

# Initialize the LightFM dataset
dataset = Dataset()
dataset.fit(encoded_customer_ids, encoded_article_ids)

# Define the create_predictions_df function
def create_predictions_df(customer_encoder, predictions):
    customer_ids = customer_encoder.classes_
    formatted_predictions = []

    for pred in predictions:
        formatted_pred = ["{:010d}".format(int(article)) for article in pred]
        formatted_predictions_str = ' '.join(formatted_pred)
        formatted_predictions.append(formatted_predictions_str)

    return pd.DataFrame({'customer_id': customer_ids, 'prediction': formatted_predictions})

# Initialize the LightFM model
model = LightFM(loss='warp')

# Split data into chunks and train the model iteratively
chunk_size = int(0.001 * len(inter_df))  # 0.1% of the dataset size
for start in range(0, len(inter_df), chunk_size):
    end = min(start + chunk_size, len(inter_df))
    chunk = inter_df.iloc[start:end]

    # Transform customer_id and article_id for the chunk
    chunk_encoded_customer_id = customer_encoder.transform(chunk['customer_id'])
    chunk_encoded_article_id = article_encoder.transform(chunk['article_id'])

    # Building the interaction matrix for the chunk
    interactions, weights = dataset.build_interactions(zip(chunk_encoded_customer_id, chunk_encoded_article_id))

    # Fit the model on the chunk
    model.fit(interactions, sample_weight=weights, epochs=10)

# Use the model to make predictions for all customers
top_items_predictions = batched_sample_recommendation(model, dataset, customer_encoder, article_encoder, batch_size=1000)

# Create the final DataFrame for all customers
predictions_df = create_predictions_df(customer_encoder, top_items_predictions)

# Now you can use predictions_df as needed
predictions_df.head()


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0759814012 0902864003 0875950005 0891050001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0902864003 0902069001 0657497007 0911564001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0902864003 0875950005 0904995002 0889816001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0903518002 0902069001 0875950005 0926387001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0903864001 0889816001 0911564001 0875950005 06...


In [8]:
# Save predictions_df as a CSV file
predictions_df.to_csv('/content/drive/MyDrive/data/predictions_jv.csv', index=False)


In [None]:
from google.colab import files

# Trigger a download to your local machine
files.download('/content/drive/MyDrive/data/predictions_jv.csv')


In [5]:
# Batch computing/fitting
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Assuming inter_df is your full dataset

# Initialize the LabelEncoders
customer_encoder = LabelEncoder()
article_encoder = LabelEncoder()

# Fit encoders on the full dataset
customer_encoder.fit(inter_df['customer_id'])
article_encoder.fit(inter_df['article_id'])

# Prepare the dataset with LightFM's format for the full data
dataset = Dataset()
dataset.fit(
    customer_encoder.transform(inter_df['customer_id'].unique()),
    article_encoder.transform(inter_df['article_id'].unique())
)

# Initialize the model
model = LightFM(loss='warp')

# Function to process each chunk
def process_chunk(chunk, model, dataset, customer_encoder, article_encoder):
    encoded_customer_id_chunk = customer_encoder.transform(chunk['customer_id'])
    encoded_article_id_chunk = article_encoder.transform(chunk['article_id'])

    interactions_chunk, weights_chunk = dataset.build_interactions(zip(encoded_customer_id_chunk, encoded_article_id_chunk))

    model.fit(interactions_chunk, sample_weight=weights_chunk, epochs=10)

    return batched_sample_recommendation(model, dataset, customer_encoder, article_encoder)

# Define batched_sample_recommendation
def batched_sample_recommendation(model, dataset, customer_encoder, article_encoder, batch_size=1000):
    n_users, n_items = dataset.interactions_shape()
    all_predictions = []

    for start_index in range(0, n_users, batch_size):
        end_index = min(start_index + batch_size, n_users)
        batch_customer_idxs = np.arange(start_index, end_index)

        all_item_idxs = np.tile(np.arange(n_items), (len(batch_customer_idxs), 1))
        batch_customer_idxs_expanded = np.repeat(batch_customer_idxs[:, np.newaxis], n_items, axis=1)

        # Predict scores for the batch
        batch_scores = model.predict(batch_customer_idxs_expanded.flatten(), all_item_idxs.flatten()).reshape(len(batch_customer_idxs), n_items)

        # Get top 12 items for each user in the batch
        top_items_idxs = np.argpartition(-batch_scores, 11, axis=1)[:, :12]

        # Flatten, inverse transform, and reshape
        top_items_flat = article_encoder.inverse_transform(top_items_idxs.flatten())
        top_items_reshaped = top_items_flat.reshape(-1, 12)

        all_predictions.extend(top_items_reshaped)

    return all_predictions

# Define create_predictions_df as before
def create_predictions_df(customer_encoder, predictions):
    customer_ids = customer_encoder.classes_
    formatted_predictions = []

    for pred in predictions:
        # Format each article_id with leading zeros to make it 10 digits long
        formatted_pred = ["{:010d}".format(int(article)) for article in pred]
        formatted_predictions_str = ' '.join(formatted_pred)
        formatted_predictions.append(formatted_predictions_str)

    return pd.DataFrame({'customer_id': customer_ids, 'prediction': formatted_predictions})

# Fit encoders on the full dataset
customer_encoder.fit(inter_df['customer_id'])
article_encoder.fit(inter_df['article_id'])

# Prepare the dataset with LightFM's format for the full data
dataset = Dataset()
dataset.fit(
    (customer_id for customer_id in inter_df['customer_id']),
    (article_id for article_id in inter_df['article_id'])
)

# Process each chunk
chunk_size = 10000
all_predictions = []

for start in range(0, len(inter_df), chunk_size):
    end = min(start + chunk_size, len(inter_df))
    chunk = inter_df.iloc[start:end]
    predictions = process_chunk(chunk, model, dataset, customer_encoder, article_encoder)
    all_predictions.extend(predictions)

# Compile all predictions
predictions_df = create_predictions_df(customer_encoder, all_predictions)

# Use predictions_df as needed
predictions_df.head()

ValueError: User id 2 not in user id mapping. Make sure you call the fit method.

In [None]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoders
customer_encoder = LabelEncoder()
article_encoder = LabelEncoder()

# Preprocess the entire dataset
inter_df['encoded_customer_id'] = customer_encoder.fit_transform(inter_df['customer_id'])
inter_df['encoded_article_id'] = article_encoder.fit_transform(inter_df['article_id'])

# Prepare the dataset with LightFM's format for the preprocessed data
dataset = Dataset()
dataset.fit(
    (row['encoded_customer_id'] for _, row in inter_df.iterrows()),
    (row['encoded_article_id'] for _, row in inter_df.iterrows())
)

# Initialize the model
model = LightFM(loss='warp')

# Function to process each chunk
def process_chunk(chunk, model, dataset):
    # Building the interaction matrix for the chunk
    interactions_chunk, weights_chunk = dataset.build_interactions(
        zip(chunk['encoded_customer_id'], chunk['encoded_article_id'])
    )

    # Fit the model on the chunk
    model.fit(interactions_chunk, sample_weight=weights_chunk, epochs=10)

    return batched_sample_recommendation(model, dataset)

# Define batched_sample_recommendation
def batched_sample_recommendation(model, dataset, customer_encoder, article_encoder, batch_size=1000):
    n_users, n_items = dataset.interactions_shape()
    all_predictions = []

    for start_index in range(0, n_users, batch_size):
        end_index = min(start_index + batch_size, n_users)
        batch_customer_idxs = np.arange(start_index, end_index)

        all_item_idxs = np.tile(np.arange(n_items), (len(batch_customer_idxs), 1))
        batch_customer_idxs_expanded = np.repeat(batch_customer_idxs[:, np.newaxis], n_items, axis=1)

        # Predict scores for the batch
        batch_scores = model.predict(batch_customer_idxs_expanded.flatten(), all_item_idxs.flatten()).reshape(len(batch_customer_idxs), n_items)

        # Get top 12 items for each user in the batch
        top_items_idxs = np.argpartition(-batch_scores, 11, axis=1)[:, :12]

        # Flatten, inverse transform, and reshape
        top_items_flat = article_encoder.inverse_transform(top_items_idxs.flatten())
        top_items_reshaped = top_items_flat.reshape(-1, 12)

        all_predictions.extend(top_items_reshaped)

    return all_predictions

# Define create_predictions_df as before
def create_predictions_df(customer_encoder, predictions):
    customer_ids = customer_encoder.classes_
    formatted_predictions = []

    for pred in predictions:
        # Format each article_id with leading zeros to make it 10 digits long
        formatted_pred = ["{:010d}".format(int(article)) for article in pred]
        formatted_predictions_str = ' '.join(formatted_pred)
        formatted_predictions.append(formatted_predictions_str)

    return pd.DataFrame({'customer_id': customer_ids, 'prediction': formatted_predictions})

# Process each chunk
chunk_size = 10000
all_predictions = []

for start in range(0, len(inter_df), chunk_size):
    end = min(start + chunk_size, len(inter_df))
    chunk = inter_df.iloc[start:end]
    predictions = process_chunk(chunk, model, dataset)
    all_predictions.extend(predictions)

# Compile all predictions
predictions_df = create_predictions_df(customer_encoder, all_predictions)

# Use predictions_df as needed

In [1]:
predictions_df.head

NameError: name 'predictions_df' is not defined

In [11]:
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30498 entries, 0 to 30497
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  30498 non-null  object
 1   prediction   30498 non-null  object
dtypes: object(2)
memory usage: 476.7+ KB


# Prepearing train and test datasets as sparse matrix usable by lightfm

In [5]:
pd.set_option('display.max_colwidth', None)

import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Assuming train_df is loaded and ready

# Encoding user and item identifiers
customer_encoder = LabelEncoder()
article_encoder = LabelEncoder()

# Fit and transform customer_id and article_id
encoded_customer_id = customer_encoder.fit_transform(train_df['customer_id'])
encoded_article_id = article_encoder.fit_transform(train_df['article_id'])

# Prepare the dataset with LightFM's format
dataset = Dataset()
dataset.fit(encoded_customer_id, encoded_article_id)

# Building the interaction matrix
(interactions, weights) = dataset.build_interactions(zip(encoded_customer_id, encoded_article_id))

# Fit the model
model = LightFM(loss='warp')
model.fit(interactions, sample_weight=weights, epochs=10)

# Adjusted sample_recommendation function
def sample_recommendation(model, dataset, customer_encoder, article_encoder, customer_ids):
    n_users, n_items = dataset.interactions_shape()

    for customer_id in customer_ids:
        # Check if the customer_id is in the training set
        if customer_id in customer_encoder.classes_:
            customer_idx = customer_encoder.transform([customer_id])[0]
            # Repeat customer_idx to match the length of item array
            customer_idx_array = np.full(n_items, customer_idx)
            item_indices = np.arange(n_items)
            scores = model.predict(customer_idx_array, item_indices)
            # Select the top 12 items
            top_items = article_encoder.inverse_transform(np.argsort(-scores)[:12])

            print("Customer %s" % customer_id)
            print("     Recommended articles:")

            for x in top_items:
                print("        %s" % x)
        else:
            print(f"Customer ID {customer_id} not found in training data.")

# Test the model for a few customer IDs
# Ensure these are actual IDs from your train_df
sample_customer_ids = ['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318']
sample_recommendation(model, dataset, customer_encoder, article_encoder, sample_customer_ids)

Customer 000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318
     Recommended articles:
        156231002
        224521007
        309864012
        379360053
        516900010
        461327008
        523488001
        233091021
        489435015
        533022039
        622141005
        499243001


In [6]:
sample_customer_ids = ['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318']
sample_recommendation(model, dataset, customer_encoder, article_encoder, sample_customer_ids)

Customer 000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318
     Recommended articles:
        156231002
        224521007
        309864012
        379360053
        516900010
        461327008
        523488001
        233091021
        489435015
        533022039
        622141005
        499243001


In [1]:
import pandas as pd



def optimized_sample_recommendation(model, dataset, customer_encoder, article_encoder):
    n_users, n_items = dataset.interactions_shape()
    all_customer_idxs = np.arange(n_users)
    all_item_idxs = np.tile(np.arange(n_items), (n_users, 1))
    all_customer_idxs_expanded = np.repeat(all_customer_idxs[:, np.newaxis], n_items, axis=1)

    # Predict scores for all user-item pairs
    all_scores = model.predict(all_customer_idxs_expanded.flatten(), all_item_idxs.flatten()).reshape(n_users, n_items)

    # Get top 12 items for each user
    top_items_idxs = np.argpartition(-all_scores, 11, axis=1)[:, :12]
    top_items = article_encoder.inverse_transform(top_items_idxs)

    return top_items

def create_predictions_df(customer_encoder, predictions):
    customer_ids = customer_encoder.classes_
    predictions_str = [' '.join(map(str, pred)) for pred in predictions]
    return pd.DataFrame({'customer_id': customer_ids, 'prediction': predictions_str})

# Generate predictions
top_items_predictions = optimized_sample_recommendation(model, dataset, customer_encoder, article_encoder)

# Create the final DataFrame
predictions_df = create_predictions_df(customer_encoder, top_items_predictions)

# Now you can use predictions_df as needed
predictions_df.head()

NameError: name 'model' is not defined