# Installing libraries

In [None]:
#pip install lightfm

# Testing LightFM on test data

In [None]:
import numpy as np
from lightfm.datasets import fetch_movielens

data = fetch_movielens(min_rating=5.0)

print(repr(data['train']))
print(repr(data['test']))

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 19048 stored elements in COOrdinate format>
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 2153 stored elements in COOrdinate format>


In [None]:
from lightfm import LightFM

model = LightFM(loss='warp')
%time model.fit(data['train'], epochs=30, num_threads=2)

CPU times: user 740 ms, sys: 2.96 ms, total: 743 ms
Wall time: 382 ms


<lightfm.lightfm.LightFM at 0x7c4a585d8640>

In [None]:
from lightfm.evaluation import precision_at_k

In [None]:
# Measuring precision of train and test data
print("Train precision: %.2f" % precision_at_k(model, data['train'], k=5).mean())
print("Test precision: %.2f" % precision_at_k(model, data['test'], k=5).mean())

Train precision: 0.39
Test precision: 0.05


For an alternative way of judging the model, we can sample a couple of users and get their recommendations. To make predictions for given user, we pass the id of that user and the ids of all products we want predictions for into the predict method.

In [None]:
# Sampling some users and getting their recommendations
def sample_recommendation(model, data, user_ids):

    n_users, n_items = data['train'].shape

    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:3]:
            print("        %s" % x)

# model = LightFM(loss='warp')
# data = fetch_movielens(min_rating=5.0)
# [3, 25, 450]
sample_recommendation(model, data, [3, 25, 450])


User 3
     Known positives:
        C
        A
        I
     Recommended:
        S
        K
        C
User 25
     Known positives:
        F
        G
        L
     Recommended:
        E
        F
        T
User 450
     Known positives:
        E
        S
        C
     Recommended:
        C
        S
        T


# Preprocessing H&M data

In [1]:
import pandas as pd

art_df = pd.read_csv('/content/drive/MyDrive/data/articles.csv')
cus_df = pd.read_csv('/content/drive/MyDrive/data/customers.csv')
transactions  = pd.read_csv('/content/drive/MyDrive/data/transactions_train.csv')


In [30]:
sample_submission = pd.read_csv('/content/drive/MyDrive/data/sample_submission.csv')
sample_submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001


In [2]:
art_df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [3]:
cus_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [4]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


## Customer data preprocessing

In [5]:
#Harmonizing class names and filling in missing values
cus_df.club_member_status     = cus_df.club_member_status.fillna('NONE')
cus_df.fashion_news_frequency = cus_df.fashion_news_frequency.fillna('NONE')
cus_df.club_member_status     = cus_df.club_member_status.replace('None', 'NONE')
cus_df.fashion_news_frequency = cus_df.fashion_news_frequency.replace('None','NONE')

#Replacing missing age values with average age
cus_df['age'].fillna(int(cus_df['age'].mean()), inplace=True)

#Replacing missing values
cus_df.FN.fillna(0.0, inplace=True)
cus_df.Active.fillna(0.0, inplace=True)
cus_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


## Article data preprocessing

In [6]:
#Filling in missing plaintext descriptions
art_df.detail_desc = art_df.detail_desc.fillna('')

## Transaction data

In [7]:
# Transaction data contain price variable
# It is valuable to know how this varies for each article
art_grp_data = transactions.groupby('article_id').agg(
        {
        'price':['mean', 'std', 'sem', 'min', 'max'],
        't_dat':'count'
        }
    )
display(art_grp_data)

Unnamed: 0_level_0,price,price,price,price,price,t_dat
Unnamed: 0_level_1,mean,std,sem,min,max,count
article_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
108775015,0.008142,0.000652,0.000006,0.001339,0.009153,10841
108775044,0.008114,0.000735,0.000009,0.001424,0.008508,7250
108775051,0.004980,0.000544,0.000037,0.003237,0.008458,215
110065001,0.020219,0.006844,0.000212,0.002525,0.025407,1044
110065002,0.018205,0.006004,0.000259,0.004542,0.025407,539
...,...,...,...,...,...,...
952267001,0.014982,0.002385,0.000372,0.010153,0.016932,41
952938001,0.048006,0.004300,0.001433,0.040661,0.050831,9
953450001,0.016836,0.000395,0.000096,0.015305,0.016932,17
953763001,0.021908,0.000251,0.000042,0.021169,0.022017,35


In [8]:
#filling in missing std values
art_grp_data[[('price', 'std'), ('price','sem')]].fillna(0.0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  art_grp_data[[('price', 'std'), ('price','sem')]].fillna(0.0, inplace=True)


In [9]:
#Adding mean price variable to the articles data
art_price = art_grp_data[('price', 'mean')].reset_index()
art_price.columns = art_price.columns.droplevel(1)
art_price.rename(columns={'price':'avg_price'}, inplace=True)

#adding average price for product that were bought at least 1 time
art_df = art_df.merge(art_price, on='article_id', how='left')
#filling empty average price values with 0 - these products were never purchased
art_df.avg_price.fillna(0.0, inplace=True)
art_df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,avg_price
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0.008142
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0.008114
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0.00498
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0.020219
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0.018205


# Creating train_inter / test_inter datasets

In [10]:
# art_df
# cus_df
# Renaming transactions to inter
inter = transactions
del transactions

In [11]:
#Ordering by date
inter = inter.sort_values(by='t_dat').set_index('t_dat')

In [12]:
#Setting index to datetime to filter using date ranges
inter.index = pd.to_datetime(inter.index, format='%Y-%m-%d')

In [13]:
#Calculating dates to narrow-down the dataset
latest_date = inter.index.max()
start_of_week = latest_date - pd.Timedelta(days=7)
prior_6_weeks = latest_date - pd.Timedelta(days=42)

In [14]:
#Keeping test data as last week and train data as last 12 month (excluding last week)
test_inter = inter.loc[start_of_week:]
train_inter = inter.loc[prior_6_weeks:start_of_week]
del inter # Free RAM

In [15]:
print(test_inter.info())
print(train_inter.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 266364 entries, 2020-09-15 to 2020-09-22
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   customer_id       266364 non-null  object 
 1   article_id        266364 non-null  int64  
 2   price             266364 non-null  float64
 3   sales_channel_id  266364 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 10.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1367647 entries, 2020-08-11 to 2020-09-15
Data columns (total 4 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   customer_id       1367647 non-null  object 
 1   article_id        1367647 non-null  int64  
 2   price             1367647 non-null  float64
 3   sales_channel_id  1367647 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 52.2+ MB
None


In [16]:
#Grouping by customer-article combinations to keep only unique interactions
train_inter = train_inter.groupby(['customer_id', 'article_id']).count().reset_index()[['customer_id', 'article_id']]
test_inter = test_inter.groupby(['customer_id', 'article_id']).count().reset_index()[['customer_id', 'article_id']]
#Adding customer and product attributes
train_df = train_inter.merge(art_df, left_on='article_id', right_on='article_id', how='left')
test_df = test_inter.merge(art_df, left_on='article_id', right_on='article_id', how='left')
train_df = train_df.merge(cus_df, left_on='customer_id', right_on='customer_id', how='left')
test_df = test_df.merge(cus_df, left_on='customer_id', right_on='customer_id', how='left')
#Free RAM
del train_inter
del test_inter
del art_df
del cus_df

In [17]:
def normalize_features(df):
    for column in df.columns:
        df[column] = df[column].astype(str)
    return df

print(train_df.info())
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1209646 entries, 0 to 1209645
Data columns (total 33 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   customer_id                   1209646 non-null  object 
 1   article_id                    1209646 non-null  int64  
 2   product_code                  1209646 non-null  int64  
 3   prod_name                     1209646 non-null  object 
 4   product_type_no               1209646 non-null  int64  
 5   product_type_name             1209646 non-null  object 
 6   product_group_name            1209646 non-null  object 
 7   graphical_appearance_no       1209646 non-null  int64  
 8   graphical_appearance_name     1209646 non-null  object 
 9   colour_group_code             1209646 non-null  int64  
 10  colour_group_name             1209646 non-null  object 
 11  perceived_colour_value_id     1209646 non-null  int64  
 12  perceived_colour_value_name 

# Prepearing train and test datasets as sparse matrix usable by lightfm

In [None]:
# NOT WORKING
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Step 1: Prepare the dataset for LightFM
# Encoding user and item identifiers
customer_encoder = LabelEncoder()
article_encoder = LabelEncoder()

train_df['customer_id'] = customer_encoder.fit_transform(train_df['customer_id'])
train_df['article_id'] = article_encoder.fit_transform(train_df['article_id'])

# Prepare the dataset with LightFM's format
dataset = Dataset()
dataset.fit(train_df['customer_id'], train_df['article_id'])

# Building the interaction matrix
(interactions, weights) = dataset.build_interactions([(x['customer_id'], x['article_id']) for index, x in train_df.iterrows()])

# Step 2: Fit the model
model = LightFM(loss='warp')
model.fit(interactions, sample_weight=weights, epochs=10)

# Step 3: Adjust the sample_recommendation function
def sample_recommendation(model, dataset, customer_encoder, article_encoder, customer_ids):
    n_users, n_items = dataset.interactions_shape()

    for customer_id in customer_ids:
        customer_idx = customer_encoder.transform([customer_id])[0]
        scores = model.predict(customer_idx, np.arange(n_items))
        top_items = article_encoder.inverse_transform(np.argsort(-scores)[:10])

        print("Customer %s" % customer_id)
        print("     Recommended articles:")

        for x in top_items:
            print("        %s" % x)

In [None]:
# NOT WORKING
def sample_recommendation(model, dataset, customer_encoder, article_encoder, customer_ids):
    n_users, n_items = dataset.interactions_shape()

    for customer_id in customer_ids:
        # Check if the customer_id is in the training set
        if customer_id in customer_encoder.classes_:
            customer_idx = customer_encoder.transform([customer_id])[0]
            scores = model.predict(customer_idx, np.arange(n_items))
            top_items = article_encoder.inverse_transform(np.argsort(-scores)[:10])

            print("Customer %s" % customer_id)
            print("     Recommended articles:")

            for x in top_items:
                print("        %s" % x)
        else:
            print(f"Customer {customer_id} not found in training data.")


In [None]:
# Test the model for a few customer ID's
sample_customer_ids = [0, 1, 2]  # Replace with actual customer IDs from your dataset
sample_recommendation(model, train_df, customer_encoder, article_encoder, sample_customer_ids)

AttributeError: ignored

In [32]:
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Assuming train_df is loaded and ready

# Encoding user and item identifiers
customer_encoder = LabelEncoder()
article_encoder = LabelEncoder()

# Fit and transform customer_id and article_id
encoded_customer_id = customer_encoder.fit_transform(train_df['customer_id'])
encoded_article_id = article_encoder.fit_transform(train_df['article_id'])

# Prepare the dataset with LightFM's format
dataset = Dataset()
dataset.fit(encoded_customer_id, encoded_article_id)

# Building the interaction matrix
(interactions, weights) = dataset.build_interactions(zip(encoded_customer_id, encoded_article_id))

# Fit the model
model = LightFM(loss='warp')
model.fit(interactions, sample_weight=weights, epochs=10)

# Adjusted sample_recommendation function
def sample_recommendation(model, dataset, customer_encoder, article_encoder, customer_ids):
    n_users, n_items = dataset.interactions_shape()

    for customer_id in customer_ids:
        # Check if the customer_id is in the training set
        if customer_id in customer_encoder.classes_:
            customer_idx = customer_encoder.transform([customer_id])[0]
            # Repeat customer_idx to match the length of item array
            customer_idx_array = np.full(n_items, customer_idx)
            item_indices = np.arange(n_items)
            scores = model.predict(customer_idx_array, item_indices)
            # Select the top 12 items
            top_items = article_encoder.inverse_transform(np.argsort(-scores)[:12])

            print("Customer %s" % customer_id)
            print("     Recommended articles:")

            for x in top_items:
                print("        %s" % x)
        else:
            print(f"Customer ID {customer_id} not found in training data.")

# Test the model for a few customer IDs
# Ensure these are actual IDs from your train_df
sample_customer_ids = ['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318']
sample_recommendation(model, dataset, customer_encoder, article_encoder, sample_customer_ids)


Customer 000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318
     Recommended articles:
        309864012
        156231002
        307239004
        524825012
        503569001
        111565001
        516000087
        516000086
        401044004
        574109035
        543054014
        499243001


In [23]:
pd.set_option('display.max_colwidth', None)
train_df.head()

Unnamed: 0,customer_id,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,...,garment_group_no,garment_group_name,detail_desc,avg_price,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,568601043,568601,Mariette Blazer,264,Blazer,Garment Upper body,1010026,Other structure,93,...,1008,Dressed,"Fitted jacket in woven fabric with notch lapels, jetted front pockets, a decorative button at the cuffs and a single back vent. Lined.",0.049197,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a91f8ca0d4b6efa8100
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,794321007,794321,SULIMA jkt,262,Jacket,Garment Upper body,1010016,Solid,15,...,1001,Unknown,"Outdoor jacket in woven fabric with a double-layered hood, stand up collar and zip down the front with a chin guard. Padded front and back sections, zipped side pockets, long raglan sleeves with thumbholes at the cuffs and inset side panels in sturdy jersey lined with thermal fleece. Longer and rounded at the back. Partly lined.",0.066389,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6c9090f7dd3e38380dc
2,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,730683050,730683,HAVANA HW tights,-1,Unknown,Unknown,1010028,Mesh,9,...,1005,Jersey Fancy,Sports tights in fast-drying functional fabric with a high waist and wide waistband to hold in and shape the waist. Sculpting seams at the back that showcase the body’s physique. Concealed key pocket in the waistband.,0.041424,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd4564743b005a805b1d
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,791587015,791587,Speedy conscious tee,255,T-shirt,Garment Upper body,1010016,Solid,92,...,1005,Jersey Fancy,Straight-cut sports top in fast-drying mesh with short sleeves and a rounded hem. Slightly longer at the back. The polyester content of the top is recycled.,0.02476,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd4564743b005a805b1d
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,896152002,896152,Amelie,255,T-shirt,Garment Upper body,1010016,Solid,9,...,1003,Knitwear,"Top in a soft, fine knit containing some wool with a collar and rhinestone-decorated buttons at the top. Gently dropped shoulders, short sleeves with ribbed trims and a ribbed hem. The polyester content of the top is recycled.",0.032986,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd4564743b005a805b1d


In [28]:
# Test the model for a few customer IDs
# Ensure these are actual IDs from your train_df
sample_customer_ids = ['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318']
sample_recommendation(model, dataset, customer_encoder, article_encoder, sample_customer_ids)

Customer 000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318
     Recommended articles:
        587320001
        297791001
        470361001
        662773003
        456163060
        214844003
        310091042
        571041001
        547429008
        469039034


# Applying LightFM model on test data (train_df, test_df)

In [None]:
# Sampling some users and getting their recommendations
def sample_recommendation(model, data, user_ids):

    n_users, n_items = data['train'].shape

    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:3]:
            print("        %s" % x)

# model = LightFM(loss='warp')
# data = fetch_movielens(min_rating=5.0)
# sample user_ids [3, 25, 450]
sample_recommendation(model, data, [3, 25, 450])

In [None]:
dataset

<lightfm.data.Dataset at 0x7c4a2797b940>

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1209646 entries, 0 to 1209645
Data columns (total 33 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   customer_id                   1209646 non-null  int64  
 1   article_id                    1209646 non-null  int64  
 2   product_code                  1209646 non-null  int64  
 3   prod_name                     1209646 non-null  object 
 4   product_type_no               1209646 non-null  int64  
 5   product_type_name             1209646 non-null  object 
 6   product_group_name            1209646 non-null  object 
 7   graphical_appearance_no       1209646 non-null  int64  
 8   graphical_appearance_name     1209646 non-null  object 
 9   colour_group_code             1209646 non-null  int64  
 10  colour_group_name             1209646 non-null  object 
 11  perceived_colour_value_id     1209646 non-null  int64  
 12  perceived_colour_value_name 