# Sistema de Recomendação para site de E-commerce

In [5]:
import pandas as pd

In [6]:
# Load the transactions from the zipped CSV; the file is ISO-8859-1 encoded.
df = pd.read_csv(
    'ecommerce_data.zip',
    encoding='ISO-8859-1',
)

In [7]:
# Overview of the raw frame: size, column names, total count of missing
# cells, and number of distinct values per column.
print("Rows     : " , len(df))
print("Columns  : " , len(df.columns))
print("\nFeatures : \n" , list(df.columns))
print("\nMissing values :  ", df.isna().to_numpy().sum())
print("\nUnique values :  \n", df.nunique())

Rows     :  541909
Columns  :  8

Features : 
 ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

Missing values :   136534

Unique values :  
 InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


>Removendo os registros com valores negativos ou iguais a zero nas colunas preço e quantidade.

In [9]:
# Keep only rows where both the quantity and the unit price are strictly
# positive; rows with a zero or negative value in either column are dropped.
df = df.loc[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]

In [11]:
# Number of missing values remaining in each column.
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     132220
Country             0
dtype: int64

In [13]:
# (rows, columns) of the current frame.
df.shape

(530104, 8)

Vamos dar uma olhada nos registros que estão vazios.


In [14]:
# Show the first rows that have no CustomerID.
df[df['CustomerID'].isnull()].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,12/1/2010 14:32,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,12/1/2010 14:32,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,12/1/2010 14:32,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,12/1/2010 14:32,1.66,,United Kingdom
1447,536544,21790,VINTAGE SNAP CARDS,9,12/1/2010 14:32,1.66,,United Kingdom


> Agora iremos dropar os valores vazios que estão dentro de CustomerID, pois precisamos desse campo definido para fazer nossas matrizes de recomendação.

In [15]:
# Discard every row whose CustomerID is missing.
df = df[df['CustomerID'].notna()]

Vejamos se está tudo ok.

In [16]:
# Re-check the number of missing values per column after dropping rows.
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

# Criando uma matriz cliente-item

A matriz segue o seguinte padrão:

<div>
<img src="https://images.slideplayer.com/18/6109706/slides/slide_9.jpg" width="500"/>
</div>


Precisamos criar uma matriz que contenha os IDs do cliente como índice e cada item individual como uma coluna.
Usamos a função `pivot_table` com `CustomerID` como índice e `StockCode` como colunas.
Em seguida, usamos `Quantity` como os valores exibidos e, finalmente, `aggfunc='sum'` para somar esses valores.

In [17]:
# One row per customer, one column per stock code; each cell holds the total
# quantity of that item bought by that customer (NaN when never bought).
customer_item_matrix = pd.pivot_table(
    df,
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum',
)
customer_item_matrix.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,,,,,,,,,,,...,,,,,,,,,,
12347.0,,,,,,,,,,,...,,,,,,,,,,
12348.0,,,,,,,,,,,...,,,,,,,,,,9.0
12349.0,,,,,,,,,,,...,,,,,,,,,,1.0
12350.0,,,,,,,,,,,...,,,,,,,,,,1.0


In [20]:
# Number of distinct stock codes left in the cleaned data.
df['StockCode'].unique().size

3665

Agora é hora de transformar nossa tabela em matriz. Nas células onde existem ocorrências vamos substituir por 1 e nas células onde tem NaN vamos substituir por 0

In [21]:
# Binarise the matrix: 1 where the customer bought a positive total quantity
# of the item, 0 otherwise. A vectorised comparison replaces the element-wise
# `applymap` (deprecated in recent pandas and slow on a matrix this size).
# NaN > 0 evaluates to False, so never-bought cells become 0 as before.
customer_item_matrix = (customer_item_matrix > 0).astype(int)
customer_item_matrix.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12349.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Criando um filtro colaborativo
***
Vamos calcular a similaridade do cosseno entre os vetores de cada par de clientes

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the customer rows of the binary
# customer-item matrix. The result is positional: rows/columns are 0..n-1.
user_user_sim_matrix = pd.DataFrame(
    data=cosine_similarity(customer_item_matrix),
)
user_user_sim_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4328,4329,4330,4331,4332,4333,4334,4335,4336,4337
0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,1.000000,0.063022,0.046130,0.047795,0.038484,0.000000,0.025876,0.136641,0.094742,...,0.000000,0.029709,0.052668,0.000000,0.032844,0.062318,0.000000,0.113776,0.109364,0.012828
2,0.000000,0.063022,1.000000,0.024953,0.051709,0.027756,0.000000,0.027995,0.118262,0.146427,...,0.000000,0.064282,0.113961,0.000000,0.000000,0.000000,0.000000,0.000000,0.170905,0.083269
3,0.000000,0.046130,0.024953,1.000000,0.056773,0.137137,0.000000,0.030737,0.032461,0.144692,...,0.000000,0.105868,0.000000,0.000000,0.039014,0.000000,0.000000,0.067574,0.137124,0.030475
4,0.000000,0.047795,0.051709,0.056773,1.000000,0.031575,0.000000,0.000000,0.000000,0.033315,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.044866,0.000000
5,0.000000,0.038484,0.027756,0.137137,0.031575,1.000000,0.000000,0.102568,0.036108,0.089414,...,0.000000,0.157014,0.000000,0.000000,0.000000,0.000000,0.000000,0.037582,0.080278,0.033898
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.138675,0.068680,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.025876,0.027995,0.030737,0.000000,0.102568,0.000000,1.000000,0.000000,0.054109,...,0.000000,0.039590,0.000000,0.000000,0.000000,0.041523,0.049629,0.000000,0.113354,0.000000
8,0.000000,0.136641,0.118262,0.032461,0.000000,0.036108,0.138675,0.000000,1.000000,0.152388,...,0.000000,0.083624,0.148250,0.000000,0.000000,0.000000,0.000000,0.160128,0.034204,0.108324
9,0.000000,0.094742,0.146427,0.144692,0.033315,0.089414,0.068680,0.054109,0.152388,1.000000,...,0.000000,0.082832,0.000000,0.000000,0.000000,0.000000,0.000000,0.079305,0.093170,0.000000


Note que as colunas e índices mudaram, pois estamos comparando a similaridade entre os padrões de compra entre clientes x clientes
***
Agora renomearemos a matriz com os respectivos nomes dos clientes

In [23]:
# Label both axes of the similarity matrix with the customer IDs. The matrix
# was computed from customer_item_matrix, so its row/column order matches
# that frame's index. Assigning the labels directly avoids adding and then
# removing a temporary 'CustomerID' column.
customer_ids = customer_item_matrix.index
user_user_sim_matrix.index = customer_ids
user_user_sim_matrix.columns = customer_ids
user_user_sim_matrix.head()

CustomerID,12346.0,12347.0,12348.0,12349.0,12350.0,12352.0,12353.0,12354.0,12355.0,12356.0,...,18273.0,18274.0,18276.0,18277.0,18278.0,18280.0,18281.0,18282.0,18283.0,18287.0
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347.0,0.0,1.0,0.063022,0.04613,0.047795,0.038484,0.0,0.025876,0.136641,0.094742,...,0.0,0.029709,0.052668,0.0,0.032844,0.062318,0.0,0.113776,0.109364,0.012828
12348.0,0.0,0.063022,1.0,0.024953,0.051709,0.027756,0.0,0.027995,0.118262,0.146427,...,0.0,0.064282,0.113961,0.0,0.0,0.0,0.0,0.0,0.170905,0.083269
12349.0,0.0,0.04613,0.024953,1.0,0.056773,0.137137,0.0,0.030737,0.032461,0.144692,...,0.0,0.105868,0.0,0.0,0.039014,0.0,0.0,0.067574,0.137124,0.030475
12350.0,0.0,0.047795,0.051709,0.056773,1.0,0.031575,0.0,0.0,0.0,0.033315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044866,0.0


Para exemplificar a aplicabilidade dessa metodologia, vamos encontrar os clientes mais parecidos com o nosso cliente de teste, '12358'

In [25]:
# Ten highest similarity scores for customer 12358 (the customer itself
# appears in the list with similarity 1.0).
(
    user_user_sim_matrix
    .loc[12358]
    .sort_values(ascending=False)
    .head(10)
)

CustomerID
12358.0    1.000000
14155.0    0.452911
14145.0    0.452911
18240.0    0.452911
13551.0    0.416025
15442.0    0.392232
12917.0    0.392232
16565.0    0.320256
12665.0    0.320256
15355.0    0.277350
Name: 12358.0, dtype: float64

> Quais itens o cliente 12358 comprou?

In [26]:
# Stock codes whose cell is non-zero in customer 12358's row.
items_bought_by_12358 = set(
    customer_item_matrix.columns[customer_item_matrix.loc[12358].to_numpy() != 0]
)
items_bought_by_12358

{'15056BL',
 '15056N',
 '15056P',
 '15060B',
 '20679',
 '21232',
 '22059',
 '22063',
 '22646',
 '37447',
 '37449',
 '48185',
 'POST'}

>  Agora vejamos os itens que o cliente 14145 comprou.

In [27]:
# Stock codes whose cell is non-zero in customer 14145's row.
items_bought_by_14145 = set(
    customer_item_matrix.columns[customer_item_matrix.loc[14145.0].to_numpy() != 0]
)
items_bought_by_14145

{'15056BL', '15056N', '15056P', '20679', '85014A', '85014B'}

>Quais itens o cliente 12358 comprou, mas o 14145 não comprou?

Eles seriam bons itens para recomendar ao 14145, pois são muito semelhantes

In [28]:
# Items bought by 12358 that 14145 has not bought yet.
items_to_recommend_to_14145 = items_bought_by_12358.difference(items_bought_by_14145)
items_to_recommend_to_14145

{'15060B',
 '21232',
 '22059',
 '22063',
 '22646',
 '37447',
 '37449',
 '48185',
 'POST'}

>Vamos ver as descrições dos itens recomendados

In [29]:
# Look up the distinct (StockCode, Description) pairs of the recommended
# items and index the result by StockCode.
(
    df.loc[
        df['StockCode'].isin(items_to_recommend_to_14145),
        ['StockCode', 'Description'],
    ]
    .drop_duplicates()
    .set_index('StockCode')
)

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
POST,POSTAGE
22646,CERAMIC STRAWBERRY CAKE MONEY BANK
48185,DOORMAT FAIRY CAKE
21232,STRAWBERRY CERAMIC TRINKET BOX
22059,CERAMIC STRAWBERRY DESIGN MUG
37449,CERAMIC CAKE STAND + HANGING CAKES
15060B,FAIRY CAKE DESIGN UMBRELLA
37447,CERAMIC CAKE DESIGN SPOTTED PLATE
22063,CERAMIC BOWL WITH STRAWBERRY DESIGN
21232,STRAWBERRY CERAMIC TRINKET POT


##  Encontrando itens para recomendar a um cliente
***
Vamos criar uma função para executar essa tarefa:

In [30]:
def get_items_to_recommend_cust(cust_a, sim_matrix=None, item_matrix=None, transactions=None):
  '''Recommend items to a customer based on the most similar other customer.

  Picks the customer with the highest similarity to ``cust_a`` (never
  ``cust_a`` itself) and returns the items that customer bought which
  ``cust_a`` has not bought.

  Args:
    cust_a: customer label present in both ``sim_matrix`` and ``item_matrix``.
    sim_matrix: customer x customer similarity frame; defaults to the
      notebook-level ``user_user_sim_matrix``.
    item_matrix: customer x item purchase frame (non-zero = bought);
      defaults to the notebook-level ``customer_item_matrix``.
    transactions: frame with ``StockCode`` and ``Description`` columns;
      defaults to the notebook-level ``df``.

  Returns:
    DataFrame indexed by ``StockCode`` with a ``Description`` column, one row
    per recommended item. Empty when there is nothing to recommend.

  Raises:
    KeyError: if ``cust_a`` is not a label of the matrices.
  '''
  if sim_matrix is None:
    sim_matrix = user_user_sim_matrix
  if item_matrix is None:
    item_matrix = customer_item_matrix
  if transactions is None:
    transactions = df

  ranked = sim_matrix.loc[cust_a].sort_values(ascending=False)
  # Remove the customer explicitly instead of assuming it sorts first: another
  # customer with identical purchases also scores 1.0 and may be ranked ahead.
  ranked = ranked[ranked.index != cust_a]

  if ranked.empty:
    items_to_recommend_to_a = []
  else:
    most_similar_user = ranked.index[0]
    bought_by_a = item_matrix.loc[cust_a].to_numpy() != 0
    bought_by_b = item_matrix.loc[most_similar_user].to_numpy() != 0
    # Items the neighbour bought that cust_a has not.
    items_to_recommend_to_a = item_matrix.columns[bought_by_b & ~bought_by_a]

  # De-duplicate on StockCode only: a code can appear with more than one
  # description, and each recommended item should be listed once.
  items_description = (
    transactions.loc[transactions['StockCode'].isin(items_to_recommend_to_a), ['StockCode', 'Description']]
    .drop_duplicates(subset='StockCode')
    .set_index('StockCode')
  )
  return items_description

Agora é hora de testar.

In [31]:
# Recommendations for customer 12358, taken from their most similar customer.
get_items_to_recommend_cust(12358.0)

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
85014B,RED RETROSPOT UMBRELLA
15044D,RED PAPER PARASOL


In [32]:
# Recommendations for customer 12348, taken from their most similar customer.
get_items_to_recommend_cust(12348.0)

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
21212,PACK OF 72 RETROSPOT CAKE CASES
21975,PACK OF 60 DINOSAUR CAKE CASES


# Filtragem colaborativa baseada em item
***
>Vamos transpor nossa customer_item_matrix para desta vez criar um filtro baseado nos produtos e não mais nos clientes.

In [33]:
# Item-item cosine similarity: transposing the customer-item matrix makes each
# row an item vector over customers. The result is positional (0..n_items-1).
item_item_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix.T))
# Fixed typo: the original referenced the undefined name `pitem_item_sim_matrix`.
item_item_sim_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3655,3656,3657,3658,3659,3660,3661,3662,3663,3664
0,1.0,0.0,0.094868,0.091287,0.0,0.0,0.090351,0.062932,0.098907,0.095346,...,0.0,0.0,0.0,0.0,0.0,0.029361,0.0,0.067591,0.0,0.078217
1,0.0,1.0,0.0,0.0,0.0,0.0,0.032774,0.045655,0.047836,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016345,0.0,0.0
2,0.094868,0.0,1.0,0.11547,0.0,0.0,0.057143,0.059702,0.041703,0.060302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071247,0.0,0.010993
3,0.091287,0.0,0.11547,1.0,0.0,0.0,0.164957,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.447214,0.063888,0.044499,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Compare the sizes of the customer-based and the item-based similarity matrices.
print('Shape da matriz baseada nos clientes: {}'.format(user_user_sim_matrix.shape))
print('Shape da matriz baseada nos produtos: {}'.format(item_item_sim_matrix.shape))

Shape da matriz baseada nos clientes: (4338, 4338)
Shape da matriz baseada nos produtos: (3665, 3665)


A quantidade de clientes tende a escalar mais que a quantidade de produtos oferecidos por uma loja, e por isso criar um sistema de recomendação baseado na comparação dos produtos acaba sendo mais vantajoso computacionalmente.
***
Vamos agora rotular novamente as colunas para facilitar a compreensão e também vamos trocar o índice numérico (0 a 3664) pelo StockCode.

In [36]:
# Label both axes of the item similarity matrix with the stock codes. The
# matrix was computed from customer_item_matrix.T, so its row/column order
# matches customer_item_matrix.columns. Reading the labels from `.columns`
# avoids transposing the whole matrix twice just to obtain them, and direct
# assignment avoids a temporary 'StockCode' column.
stock_codes = customer_item_matrix.columns
item_item_sim_matrix.index = stock_codes
item_item_sim_matrix.columns = stock_codes
item_item_sim_matrix.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.0,0.0,0.094868,0.091287,0.0,0.0,0.090351,0.062932,0.098907,0.095346,...,0.0,0.0,0.0,0.0,0.0,0.029361,0.0,0.067591,0.0,0.078217
10080,0.0,1.0,0.0,0.0,0.0,0.0,0.032774,0.045655,0.047836,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016345,0.0,0.0
10120,0.094868,0.0,1.0,0.11547,0.0,0.0,0.057143,0.059702,0.041703,0.060302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071247,0.0,0.010993
10123C,0.091287,0.0,0.11547,1.0,0.0,0.0,0.164957,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,1.0,0.447214,0.063888,0.044499,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


> Itens mais similares com  o produto 10080

In [38]:
# Highest similarity scores for item 10080 (the item itself comes first).
(
    item_item_sim_matrix
    .loc['10080']
    .sort_values(ascending=False)
    .head()
)

StockCode
10080     1.000000
23694     0.191346
22039     0.187317
47504H    0.166924
21650     0.165567
Name: 10080, dtype: float64

Calculando  os 10 itens mais semelhantes

In [39]:
# Stock codes of the ten entries with the highest similarity to item 10080,
# in descending order of similarity.
top_10_similar_items = (
    item_item_sim_matrix
    .loc['10080']
    .sort_values(ascending=False)
    .iloc[:10]
    .index
    .tolist()
)
top_10_similar_items

['10080',
 '23694',
 '22039',
 '47504H',
 '21650',
 '90214F',
 '79157B',
 '90206A',
 '84012',
 '22043']

> Descrição dos 10 itens mais similares.

In [40]:
# Descriptions of the top-10 items, re-ordered to follow the similarity ranking.
(
    df.loc[
        df['StockCode'].isin(top_10_similar_items),
        ['StockCode', 'Description'],
    ]
    .drop_duplicates()
    .set_index('StockCode')
    .loc[top_10_similar_items]
)

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
10080,GROOVY CACTUS INFLATABLE
23694,PAISLEY PARK CARD
22039,BOTANICAL LILY GIFT WRAP
47504H,ENGLISH ROSE SPIRIT LEVEL
21650,ASSORTED TUTTI FRUTTI BRACELET
90214F,"LETTER ""F"" BLING KEY RING"
79157B,UBO-LIGHT TRIOBASE BLUE
90206A,GOLD DIAMANTE STAR BROOCH
84012,MAGIC SHEEP WOOL GROWING FROM PAPER
22043,CHRISTMAS CARD SCREEN PRINT


> Agora vamos criar uma função para facilitar nossa vida.

In [41]:
def get_top_similar_items(item, n=10, sim_matrix=None, transactions=None):
  '''Return the descriptions of the items most similar to ``item``.

  Args:
    item: stock code present in ``sim_matrix``.
    n: number of items to return, including ``item`` itself when it ranks
      among the top entries (default 10).
    sim_matrix: item x item similarity frame; defaults to the notebook-level
      ``item_item_sim_matrix``.
    transactions: frame with ``StockCode`` and ``Description`` columns;
      defaults to the notebook-level ``df``.

  Returns:
    DataFrame indexed by ``StockCode`` with a ``Description`` column, one row
    per item, ordered by descending similarity.

  Raises:
    KeyError: if ``item`` is not in ``sim_matrix`` or a ranked stock code is
      missing from ``transactions``.
  '''
  if sim_matrix is None:
    sim_matrix = item_item_sim_matrix
  if transactions is None:
    transactions = df

  top_similar_items = list(sim_matrix.loc[item].sort_values(ascending=False).iloc[:n].index)
  # De-duplicate on StockCode only: a code can appear with more than one
  # description, which would otherwise yield several rows for a single item.
  descriptions = (
    transactions.loc[transactions['StockCode'].isin(top_similar_items), ['StockCode', 'Description']]
    .drop_duplicates(subset='StockCode')
    .set_index('StockCode')
  )
  # Re-order the rows to follow the similarity ranking.
  return descriptions.loc[top_similar_items]

In [42]:
# Most similar items to stock code 84029E, with their descriptions.
get_top_similar_items('84029E')

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
84029E,RED WOOLLY HOTTIE WHITE HEART.
84029G,KNITTED UNION FLAG HOT WATER BOTTLE
21479,WHITE SKULL HOT WATER BOTTLE
21485,RETROSPOT HEART HOT WATER BOTTLE
22111,SCOTTIE DOG HOT WATER BOTTLE
22112,CHOCOLATE HOT WATER BOTTLE
22114,HOT WATER BOTTLE TEA AND SYMPATHY
23355,HOT WATER BOTTLE KEEP CALM
84030E,ENGLISH ROSE HOT WATER BOTTLE
22632,HAND WARMER RED POLKA DOT
