Source: https://www.kaggle.com/carrie1/ecommerce-data

In [2]:
import datetime as dt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, lil_matrix
from scipy import spatial
from scipy import stats, sparse
%matplotlib inline

# EDA

## Data cleaning

In [None]:
# NOTE(review): absolute, machine-specific path; consider a configurable DATA_DIR.
data = pd.read_csv("/Users/maximmigutin/Documents/My_projects/cv_projects/Dockerized_Recommender_system/data/data_raw.csv", encoding = "ISO-8859-1")

data["InvoiceDate"] = pd.to_datetime(data["InvoiceDate"], format="%m/%d/%Y %H:%M")

# Build every row filter as one boolean mask so the cell reads top-down:
#   - non-negative unit price and strictly positive quantity
#   - invoice numbers not starting with "C" or "A"
#     (presumably cancellations / adjustments in this dataset -- TODO confirm)
#   - keeping only authorized orders, i.e. rows that carry a CustomerID
invoice_prefix = data["InvoiceNo"].astype(str).str[0]
is_valid_sale = (
    (data["UnitPrice"] >= 0)
    & ~invoice_prefix.isin(["C", "A"])
    & (data["Quantity"] > 0)
    & data["CustomerID"].notna()
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the raw data (avoids SettingWithCopyWarning).
data = data[is_valid_sale].copy()

# NOTE(review): the placeholder is spelled "Unkown"; kept as-is because it is
# persisted to the cleaned CSV further down.
data["Description"] = data["Description"].fillna("Unkown")
data['Year'] = data["InvoiceDate"].dt.year
data['Revenue'] = data['Quantity']*data['UnitPrice']

In [None]:
# Keep only sales made in the United Kingdom during 2011
is_uk_2011 = (data['Country'] == 'United Kingdom') & (data['Year'] == 2011)
data = data[is_uk_2011]

In [None]:
# Persist the cleaned frame under data/ -- the same location the "Recommenders"
# section later reads data_clean.csv from (the raw file lives there as well).
data.to_csv('/Users/maximmigutin/Documents/My_projects/cv_projects/Dockerized_Recommender_system/data/data_clean.csv', index=False)

In [None]:
data.tail()

In [None]:
data.info()

In [None]:
# Quick size / completeness summary of the cleaned frame.
n_rows = data.shape[0]
n_missing = data.isnull().sum().sum()  # missing cells summed over all columns

print("Total data points: ", n_rows)
# Note: the percentage relates missing *cells* to the number of *rows*.
print("Total missing values: {} - which is {:.2f}% of our total data".format(
    n_missing, (n_missing*100)/n_rows))
print("Total unique Countries: ", data.Country.nunique())
print("Total unique description: ", data.Description.nunique())

# Recommenders

In [21]:
data = pd.read_csv(
    '/Users/maximmigutin/Documents/My_projects/cv_projects/Dockerized_Recommender_system/data/data_clean.csv')

In [22]:
data['CustomerID'].nunique()

3814

In [23]:
data['StockCode'].nunique()

3575

In [25]:
def to_sparse(data):
    '''
    Build a binary customer x item interaction matrix from transaction rows.
    Arguments:
        - data: dataframe with at least 'StockCode', 'CustomerID' and 'Revenue' columns
    Output:
        - piv: scipy lil_matrix (float) with one row per customer and one column per item
        - cols: the item labels (StockCode) of the matrix columns
        - interactions: the same matrix as a dataframe (index = CustomerID, columns = StockCode)
    '''
    # Number of transaction rows per (item, customer) pair.
    pair_counts = (
        data[['StockCode', 'CustomerID', 'Revenue']]
        .groupby(['StockCode', 'CustomerID'])
        .count()
        .reset_index()
    )
    table = pd.pivot_table(pair_counts, index='CustomerID',
                           columns='StockCode', values='Revenue')
    # Any count of one or more collapses to a plain "bought it" flag.
    table = table.mask(table >= 1, 1)
    table = table.dropna(axis=1, how='all')
    cols = table.columns
    # Pairs that never occurred become explicit zeros.
    interactions = table.fillna(0)
    piv = lil_matrix(interactions, dtype='float')
    return piv, cols, interactions
piv,cols,interactions = to_sparse(data)

In [8]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score



### Aggarwal's suggestions

https://towardsdatascience.com/solving-business-usecases-by-recommender-system-using-lightfm-4ba7b3ac8e62

#### Items2user

In [9]:
def create_user_dict(interactions):
    '''
    Map every user id found in the interactions index to its row position.
    Arguments:
        - interactions: dataframe of shape n_users*m_items containing transactional history
    Output:
        - user_dict - standard python dict of type {user_id: row position},
          where user_id is the index label passed through round()
    '''
    return {round(uid): position
            for position, uid in enumerate(interactions.index)}


def create_item_dict(df, id_col, name_col):
    '''
    Create an item dictionary based on their item_id and item_name
    Arguments:
        - df: dataframe with items data
        - id_col: column name containing unique identifier for an item
        - name_col: column name containing name of the item
    Output:
        - item_dict = standard python dict of type {item_id: item_name};
          when an id occurs in several rows, the name from the last row wins
    '''
    # Pair the two columns positionally instead of looking each row up by its
    # index label: this is a single pass over the data and also works when the
    # dataframe index contains duplicate labels.
    return dict(zip(df[id_col], df[name_col]))


def fit_mf_model(interactions, n_components, loss='warp', epoch=3, n_jobs=6,
                 random_state=None):
    '''
    Create csr matrix out of interactions df, create and fit Matrix Factorization model.
    Find more about parameters on LightFM's official docs page:
    https://making.lyst.com/lightfm/docs/lightfm.html
    Arguments:
        - interactions: dataframe of shape n_users*m_items containing transactional history
        - n_components:  the dimensionality of the feature latent embeddings
        - loss:  one of ('logistic', 'bpr', 'warp', 'warp-kos'): the loss function
        - epoch: number of epochs to run
        - n_jobs: number of cores used for running the training process
        - random_state: optional seed forwarded to LightFM; the default None keeps
          the previous unseeded behaviour.
          NOTE(review): training with n_jobs > 1 may still not be bit-for-bit
          reproducible -- confirm against the LightFM docs.
    Output:
        - model object which is an instance of LightFM class
    '''
    X = csr_matrix(interactions.values)
    model = LightFM(no_components=n_components, loss=loss,
                    random_state=random_state)
    model.fit(X, epochs=epoch, num_threads=n_jobs)
    return model


def items_to_user(model, interactions, user_id, user_dict,
                  item_dict, threshold=0, nrec_items=10, show=True):
    '''
    Create recommendations for 1 user.
    Arguments:
        - model: model object of LightFM class
        - interactions: dataframe of shape n_users*m_items containing transactional history
        - user_id: label of the user in interactions.index
        - user_dict: dict {user_id: row position} used to address the model
        - item_dict: dict {item_id: item_name}; only consulted when show is true
        - threshold: value above which the rating is favorable in new interaction matrix
        - nrec_items: number of items to recommend
        - show: when true, print the user's known items and the recommendations by name
    Output:
        - List of at most nrec_items item ids the user is likely to be interested in,
          best first, excluding items the user has already interacted with
    '''
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]

    # Score every item for this user and rank the item ids, best score first.
    scores = pd.Series(np.asarray(model.predict(user_x, np.arange(n_items))),
                       index=interactions.columns)
    ranked_items = list(scores.sort_values(ascending=False).index)

    # Items the user already has, in descending item-id order (display order).
    user_row = interactions.loc[user_id, :]
    known_items = list(pd.Series(user_row[user_row > threshold].index)
                       .sort_values(ascending=False))

    # A set keeps the exclusion check O(1) per candidate item.
    already_known = set(known_items)
    return_score_list = [item for item in ranked_items
                         if item not in already_known][:nrec_items]

    if show:
        # Names are resolved only for display, and before anything is printed,
        # so a missing item_dict entry cannot leave half-written output.
        known_names = [item_dict[item] for item in known_items]
        recommended_names = [item_dict[item] for item in return_score_list]

        print("Known Likes:")
        for counter, name in enumerate(known_names, start=1):
            print(str(counter) + '- ' + name)

        print("\n Recommended Items:")
        for counter, name in enumerate(recommended_names, start=1):
            print(str(counter) + '- ' + name)
    return return_score_list


def users_to_item(model, interactions, item_id, user_dict, item_dict, len_users):
    '''
    Create a list of top N interested users for a given item
    Arguments:
        - model: model object of LightFM class
        - interactions: dataframe of shape n_users*m_items containing transactional history
        - item_id: label of the item in interactions.columns
        - user_dict: unused, kept for interface compatibility
        - item_dict: unused, kept for interface compatibility
        - len_users: number of users needed as an output
    Output:
        - user_list: list of recommended users (labels from interactions.index),
          highest predicted score first
    Raises:
        - KeyError: if item_id is not one of interactions.columns
        - ValueError: if item_id matches more than one column
    '''
    n_users = interactions.shape[0]
    # Exact label lookup: unlike a binary search it does not depend on the
    # columns being sorted and fails loudly for an unknown item.
    item_x = interactions.columns.get_loc(item_id)
    if not isinstance(item_x, (int, np.integer)):
        raise ValueError("item_id {!r} matches more than one column".format(item_id))

    # Score the same item for every user (row positions 0..n_users-1).
    scores = pd.Series(model.predict(np.arange(n_users),
                                     np.repeat(item_x, n_users)))
    top_positions = scores.sort_values(ascending=False).head(len_users).index
    user_list = list(interactions.index[top_positions])
    return user_list


def create_item_emdedding_distance_matrix(model, interactions):
    '''
    Create the item-item matrix of pair-wise cosine similarities between item embeddings.
    Despite the "distance" in the name, the values come from cosine_similarity,
    so a larger value means the two items are more alike.
    Arguments:
        - model: model object of LightFM class
        - interactions: dataframe of shape n_users*m_items containing transactional history
    Output:
        - item_emdedding_distance_matrix: dataframe of pair-wise cosine similarities,
          with interactions.columns as both index and columns
    '''
    # The embeddings are passed as they are; wrapping them in a sparse matrix
    # first adds a conversion without changing the similarities.
    similarities = cosine_similarity(model.item_embeddings)
    item_emdedding_distance_matrix = pd.DataFrame(similarities,
                                                  index=interactions.columns,
                                                  columns=interactions.columns)
    return item_emdedding_distance_matrix


def items_to_item(item_emdedding_distance_matrix, item_id,
                  item_dict, n_items=10, show=True):
    '''
    Function to create item-item recommendation
    Arguments:
        - item_emdedding_distance_matrix: dataframe of pair-wise item scores where a
          larger value means more alike (rows are sorted in descending order)
        - item_id: label of the item of interest in the matrix
        - item_dict: dict {item_id: item_name}; only consulted when show is true
        - n_items: number of items needed as an output
        - show: when true, print the item of interest and the recommendations by name
    Output:
        - recommended_items: list of at most n_items item ids, most similar first,
          never containing item_id itself
    '''
    similarities = item_emdedding_distance_matrix.loc[item_id, :]
    # Remove the item of interest by label rather than assuming it sorts first:
    # an item with an equal (or, through rounding, slightly larger) score would
    # otherwise push it into its own recommendations.
    ranked = (similarities
              .drop(labels=item_id, errors='ignore')
              .sort_values(ascending=False))
    recommended_items = list(ranked.head(n_items).index)

    if show:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the stated item:")
        for counter, item in enumerate(recommended_items, start=1):
            print(str(counter) + '- ' + item_dict[item])
    return recommended_items

In [10]:
# Map each CustomerID from the interactions index (rounded to int) to its row position
user_dict = create_user_dict(interactions=interactions)
# Map each StockCode to a Description taken from the transaction rows
# (if a code appears with several descriptions, the last row wins)
item_dict = create_item_dict(df = data,
                               id_col = 'StockCode',
                               name_col = 'Description')

In [11]:
user_dict

{12346: 0,
 12747: 1,
 12748: 2,
 12749: 3,
 12820: 4,
 12821: 5,
 12822: 6,
 12823: 7,
 12824: 8,
 12826: 9,
 12827: 10,
 12828: 11,
 12829: 12,
 12830: 13,
 12831: 14,
 12832: 15,
 12833: 16,
 12834: 17,
 12836: 18,
 12837: 19,
 12838: 20,
 12839: 21,
 12840: 22,
 12841: 23,
 12842: 24,
 12843: 25,
 12844: 26,
 12845: 27,
 12847: 28,
 12849: 29,
 12851: 30,
 12852: 31,
 12853: 32,
 12854: 33,
 12856: 34,
 12857: 35,
 12863: 36,
 12864: 37,
 12867: 38,
 12868: 39,
 12871: 40,
 12872: 41,
 12873: 42,
 12875: 43,
 12877: 44,
 12878: 45,
 12879: 46,
 12881: 47,
 12882: 48,
 12883: 49,
 12884: 50,
 12885: 51,
 12886: 52,
 12888: 53,
 12890: 54,
 12891: 55,
 12893: 56,
 12895: 57,
 12897: 58,
 12901: 59,
 12902: 60,
 12904: 61,
 12906: 62,
 12908: 63,
 12909: 64,
 12910: 65,
 12912: 66,
 12913: 67,
 12915: 68,
 12916: 69,
 12917: 70,
 12919: 71,
 12920: 72,
 12921: 73,
 12922: 74,
 12923: 75,
 12924: 76,
 12925: 77,
 12928: 78,
 12929: 79,
 12930: 80,
 12931: 81,
 12933: 82,
 12935: 83,
 1

In [12]:
item_dict

{'22386': 'JUMBO BAG PINK POLKADOT',
 '21499': 'BLUE POLKADOT WRAP',
 '21498': 'RED RETROSPOT WRAP ',
 '22379': 'RECYCLING BAG RETROSPOT ',
 '20718': 'RED RETROSPOT SHOPPER BAG',
 '85099B': 'JUMBO BAG RED RETROSPOT',
 '20682': 'RED RETROSPOT CHILDRENS UMBRELLA',
 '22961': 'JAM MAKING SET PRINTED',
 '22667': 'RECIPE BOX RETROSPOT ',
 '22898': 'CHILDRENS APRON APPLES DESIGN',
 '22896': 'PEG BAG APPLES DESIGN',
 '22303': 'COFFEE MUG APPLES DESIGN',
 '22302': 'COFFEE MUG PEARS  DESIGN',
 '85123A': 'CREAM HANGING HEART T-LIGHT HOLDER',
 '22808': 'SET OF 6 T-LIGHTS EASTER CHICKS',
 '22458': 'CAST IRON HOOK GARDEN FORK',
 '22862': 'LOVE HEART NAPKIN BOX ',
 '21733': 'RED HANGING HEART T-LIGHT HOLDER',
 '22062': 'CERAMIC BOWL WITH LOVE HEART DESIGN',
 '22060': 'LARGE CAKE STAND HANGING HEARTS',
 '22151': 'PLACE SETTING WHITE HEART',
 '22508': 'DOORSTOP RETROSPOT HEART',
 '21411': 'GINGHAM HEART  DOORSTOP RED',
 '22644': 'CERAMIC CHERRY CAKE MONEY BANK',
 '70006': 'LOVE HEART POCKET WARMER',
 '

In [13]:
# Fit the matrix-factorization model on the full interactions frame:
# 140 latent components, WARP loss, 10 epochs, 6 training threads.
# NOTE(review): no random seed is passed to the model, so the embeddings and
# the numbers reported below will presumably differ from run to run.
mf_model = fit_mf_model(interactions = interactions,
                 n_components = 140,
                 loss = 'warp',
                 epoch = 10,
                 n_jobs = 6)

In [14]:
# NOTE(review): `piv` holds the same interactions the model was fitted on
# (both come from to_sparse(data)), so these are training-set metrics rather
# than an estimate of performance on unseen purchases.
train_metrics = (
    ('Precision @k: ', precision_at_k(mf_model, piv, k=3)),
    ('Recall @k: ', recall_at_k(mf_model, piv, k=3)),
    ('AUC score: ', auc_score(mf_model, piv)),
)
for label, per_user in train_metrics:
    # Mean over users, shown as a rounded percentage.
    print(label + str(round(per_user.mean()*100)) + '%')

Precision @k: 74.0%
Recall @k: 13.0%
AUC score: 99.0%


In [20]:
interactions

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12747.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12748.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
12749.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18281.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18282.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18283.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
# Top 10 recommendations for customer 12947; show=True also prints the
# customer's known items and the recommended items by description.
rec_list = items_to_user(
    model=mf_model,
    interactions=interactions,
    user_id=12947,
    user_dict=user_dict,
    item_dict=item_dict,
    threshold=0,
    nrec_items=10,
    show=True,
)

Known Likes:
1- WORLD WAR 2 GLIDERS ASSTD DESIGNS
2- PARTY BUNTING
3- SPOTTY BUNTING
4- WRAP ALPHABET DESIGN
5- LUNCH BAG VINTAGE DOILY 
6- LUNCH BAG APPLE DESIGN
7- SCHOOL DESK AND CHAIR 
8- BLUE COAT RACK PARIS FASHION
9- RED COAT RACK PARIS FASHION
10- BREAD BIN DINER STYLE RED 
11- 3 TIER CAKE TIN GREEN AND CREAM
12- CREAM WALL PLANTER HEART SHAPED
13- CARD PARTY GAMES 
14- SWEETHEART WIRE MAGAZINE RACK
15- CARD CIRCUS PARADE
16- WRAP CIRCUS PARADE
17- FELTCRAFT CUSHION RABBIT
18- FELTCRAFT CUSHION BUTTERFLY
19- FELTCRAFT CUSHION OWL
20- HAPPY STENCIL CRAFT
21- MONSTERS STENCIL CRAFT
22- PLASTERS IN TIN WOODLAND ANIMALS
23- PLASTERS IN TIN SKULLS
24- CHILDS GARDEN FORK PINK
25- CHILDS GARDEN FORK BLUE 
26- CHILDS GARDEN TROWEL PINK
27- CHILDS GARDEN TROWEL BLUE 
28- SEWING BOX RETROSPOT DESIGN 
29- NATURAL SLATE CHALKBOARD LARGE 
30- WATERING CAN PINK BUNNY
31- WATERING CAN BLUE ELEPHANT
32- LUNCH BAG PINK POLKADOT
33- LUNCH BAG SUKI DESIGN 
34- LUNCH BAG SPACEBOY DESIGN 
35- 3 STR

In [None]:
rec_list

#### Users2item

In [34]:
## Top 15 users predicted to be most interested in item '22386'
users_to_item(model = mf_model,
                           interactions = interactions,
                           item_id = '22386',
                           user_dict = user_dict,
                           item_dict = item_dict,
                           len_users = 15)

[15417.0,
 14489.0,
 14495.0,
 16986.0,
 16997.0,
 15043.0,
 15032.0,
 13461.0,
 12990.0,
 13791.0,
 17174.0,
 17669.0,
 16748.0,
 13577.0,
 18177.0]

#### Item2Item

In [35]:
## Build the item-item matrix from the model's item embeddings
## (values are cosine similarities, so higher means more alike)
item_item_dist = create_item_emdedding_distance_matrix(model = mf_model,
                                                       interactions = interactions)
## Preview the first rows of the item-item matrix
item_item_dist.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.0,-0.041341,0.104004,-0.035129,-0.015941,0.104587,0.14832,0.147324,-0.038987,0.1847,...,-0.106904,-0.013555,-0.181364,-0.0168,-0.215513,-0.258153,-0.030889,-0.094237,-0.253222,-0.193939
10080,-0.041341,1.0,0.200686,-0.006859,-0.111714,0.080097,0.165078,0.145528,0.084138,-0.080982,...,0.085508,0.09111,0.031713,0.100228,-0.142846,-0.074533,-0.004838,0.047166,-0.098251,0.048045
10120,0.104004,0.200686,1.0,0.193294,0.134058,0.259699,0.253011,0.094998,-0.06316,0.183261,...,0.405297,0.500357,0.352359,0.452071,0.100078,-0.300729,0.266385,0.149217,0.035784,0.088592
10123C,-0.035129,-0.006859,0.193294,1.0,0.394563,0.318157,0.419325,0.039738,-0.126786,-0.13429,...,0.325619,0.310878,0.358885,0.310871,0.339037,-0.174229,0.357655,0.071845,0.145382,-0.145592
10124A,-0.015941,-0.111714,0.134058,0.394563,1.0,0.573723,0.115395,0.049351,-0.215765,-0.160462,...,0.334712,0.288365,0.314377,0.325796,0.350654,0.051836,0.305813,-0.025112,0.071197,-0.180514


In [36]:
## Top 10 items most similar to item '22466' (also printed by description)
rec_list = items_to_item(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = '22466',
                                    item_dict = item_dict,
                                    n_items = 10)

Item of interest :FAIRY TALE COTTAGE NIGHT LIGHT
Item similar to the stated item:
1- MINI LIGHTS WOODLAND MUSHROOMS
2- RED TOADSTOOL LED NIGHT LIGHT
3- RABBIT NIGHT LIGHT
4- GUMBALL COAT RACK
5- ROUND SNACK BOXES SET OF4 WOODLAND 
6- WOODLAND SMALL RED FELT HEART
7- CABIN BAG VINTAGE PAISLEY
8- AIRLINE BAG VINTAGE JET SET WHITE
9- TOADSTOOL BEDSIDE LIGHT 
10- DOLLY GIRL LUNCH BOX


In [37]:
rec_list

['23480',
 '21731',
 '23084',
 '22467',
 '22326',
 '23473',
 '22503',
 '22376',
 '23079',
 '22630']

In [38]:
item_dict[rec_list[0]]

'MINI LIGHTS WOODLAND MUSHROOMS'