# Sistema de Recomendação Personalizado
## Utilizando o método de filtragem colaborativa para recomendar perguntas aos usuários

A filtragem colaborativa é considerada a técnica mais popular e amplamente implementada, a implementação mais simples e original dessa abordagem, recomenda aos usuários os itens que outros usuários com gostos apreciados no passado. A semelhança de gosto de dois usuários é calculada com base na semelhança no histórico de classificação dos usuários. “Item” é o termo geral usado para representar o que o sistema recomenda aos usuários, sendo neste projeto as perguntas (relatórios)

In [119]:
# Importandos as bibliotecas  

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor

In [120]:
# Importando os dados e armazenando em DataFrames
# logs1 = 1 semestre de 2021
# logs2 = 2 semestre de 2022

logs1 = pd.read_csv("/Users/lucasmartins/Documents/LucasMartins/Projetos/Recomendacao/Recomendacao_filtragem_colaborativa/Looqbox/logs_1semestre2021.csv")
logs2 = pd.read_csv("/Users/lucasmartins/Documents/LucasMartins/Projetos/Recomendacao/Recomendacao_filtragem_colaborativa/Looqbox/logs_1semestre2021.csv")

In [121]:
# Concateando os DataFrames
user_ratings = pd.concat([logs1,logs2]).reset_index(drop=True)

# Inspecionando o DataFrame
user_ratings

Unnamed: 0,Id,user_group_id,response_name
0,219,7,"{livre, metaLoja}"
1,219,7,"{livre, metaLoja}"
2,219,7,"{livre, metaLoja}"
3,219,7,"{venda, VendaNovaclusterLojas}"
4,219,7,"{venda, vendaNovaLoja}"
...,...,...,...
1431873,245,22,"{auditoria, auditoriaBaseVendaRestaurante}"
1431874,482,6,"{compras, LancamentosEcommerceTransferencia}"
1431875,482,6,"{compras, LancamentosEcommerceTransferencia}"
1431876,478,6,"{compras, LancamentosEcommerceTransferencia}"


In [122]:
# Renomeando as colunas 
user_ratings.rename(columns={'Id': 'UserId', 'response_name': 'Pergunta'}, inplace=True)

# Inspecionando o DataFrame
user_ratings

Unnamed: 0,UserId,user_group_id,Pergunta
0,219,7,"{livre, metaLoja}"
1,219,7,"{livre, metaLoja}"
2,219,7,"{livre, metaLoja}"
3,219,7,"{venda, VendaNovaclusterLojas}"
4,219,7,"{venda, vendaNovaLoja}"
...,...,...,...
1431873,245,22,"{auditoria, auditoriaBaseVendaRestaurante}"
1431874,482,6,"{compras, LancamentosEcommerceTransferencia}"
1431875,482,6,"{compras, LancamentosEcommerceTransferencia}"
1431876,478,6,"{compras, LancamentosEcommerceTransferencia}"


In [123]:
# Selecionando apenas as colunas necessarias
user_ratings = user_ratings[["UserId","Pergunta"]]

# Inspecionando o DataFrame
user_ratings

Unnamed: 0,UserId,Pergunta
0,219,"{livre, metaLoja}"
1,219,"{livre, metaLoja}"
2,219,"{livre, metaLoja}"
3,219,"{venda, VendaNovaclusterLojas}"
4,219,"{venda, vendaNovaLoja}"
...,...,...
1431873,245,"{auditoria, auditoriaBaseVendaRestaurante}"
1431874,482,"{compras, LancamentosEcommerceTransferencia}"
1431875,482,"{compras, LancamentosEcommerceTransferencia}"
1431876,478,"{compras, LancamentosEcommerceTransferencia}"


In [124]:
# Analisando os dados
# Descobrimos que o DataFrame possui mais de  1,4 milhoes de linhas 
# 3 colunas, sendo “UserId”  do tipo inteiro e “Pergunta” do tipo object.

# Resumo dos dados
user_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1431878 entries, 0 to 1431877
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   UserId    1431878 non-null  int64 
 1   Pergunta  1431878 non-null  object
dtypes: int64(1), object(1)
memory usage: 21.8+ MB


In [125]:
# Gerando a coluna de Avaliação e classifcaidno do maior para o menor

# Agrupando as colunas "UserId" e "Pergunta, contando as ocorrências e armazenando na coluna Avaliação
user_ratings = user_ratings.groupby(['UserId','Pergunta']).size().reset_index().rename(columns={0:'Avaliacao'})

# Classificando de forma decrescente
user_ratings = user_ratings.sort_values(by="Avaliacao", ascending=False)

# Inspecionando o DataFrame
user_ratings

Unnamed: 0,UserId,Pergunta,Avaliacao
8755,541,"{compras, Price_VencimentoProduto}",21516
3690,172,"{compras, LojaAVencer}",15886
8354,520,"{compras, Price_VencimentoProduto}",14120
3695,172,"{compras, Price_VencimentoProduto}",12580
2296,103,"{livre, estoqueLojaProduto}",12388
...,...,...,...
5626,331,"{compras, LancamentosEcommerceVendas}",2
5627,331,"{compras, LancamentosEcommerce}",2
5635,331,"{compras, Price_VencimentoAprovaReprova}",2
5638,331,"{compras, RupturaDigital}",2


In [126]:
# Transformando o DataFrame em uma matriz de classificação de usuários onde cada linha representa uma pergunta e cada coluna representa um usuário.

# Transformando o DataFrame
user_ratings_pivot = user_ratings.pivot_table(index="Pergunta", columns="UserId", values="Avaliacao")

# Inspecionando a tabela transformada
user_ratings_pivot

UserId,3,8,9,10,12,13,14,15,19,20,...,618,619,620,621,622,627,629,630,637,638
Pergunta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"{ExclusivoContasReceber, ConsultaClientePDV}",,,,,,,,,,,...,,,,,,,,,,
"{ExclusivoRH, ConsultaFuncionarioPDV}",,,,,,,,,,,...,,,,,,,,,,
"{InteligenciaVarejo, CheckListLojaInput}",,,,,,,,,,,...,,,,,,,,,,
"{InteligenciaVarejo, CheckListLoja}",,,,,,,,,,,...,,,,,,,,,,
"{InteligenciaVarejo, CriticaForaLinhaSemOferta}",,12.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"{venda, vendaRestauranteSecao}",,,,,,,,,,,...,,,,,,,,,,
"{venda, vendaRestaurante}",,,,,,,,,,,...,,,,,,,,,,
"{venda, vendaSecProduto}",,,,58.0,6.0,,8.0,4.0,,,...,,,,2.0,2.0,,,,,
"{{_looqbox}, noInterpretation}",,124.0,84.0,48.0,128.0,18.0,126.0,70.0,22.0,48.0,...,,28.0,,,,32.0,10.0,,2.0,


In [127]:
# Preenchendo os dados ausentes com informações que não devem influenciar os dados da análise.
# Será obtido a pontuação média que cada usuário deu em todas as classificaçoes e, em seguida, 
# utilizaremos essa média para centralizar as pontuações dos usuários em torno de zero, que será
# uma pontuação neutra, minimizando o impacto em seu perfil geral, permitindo a comparação dos usuários.

# Obtendo a classificação média para cada usuário 
avg_ratings = user_ratings_pivot.mean(axis=1)

# Centralizando as avaliações de cada usuário em torno de 0
user_ratings_table_centered = user_ratings_pivot.sub(avg_ratings, axis=0)

# Preenchendo os dados com 0s
user_ratings_table_normed = user_ratings_table_centered.fillna(0)

# Inspecionando a tabela normalizada
user_ratings_table_normed




UserId,3,8,9,10,12,13,14,15,19,20,...,618,619,620,621,622,627,629,630,637,638
Pergunta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"{ExclusivoContasReceber, ConsultaClientePDV}",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
"{ExclusivoRH, ConsultaFuncionarioPDV}",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
"{InteligenciaVarejo, CheckListLojaInput}",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
"{InteligenciaVarejo, CheckListLoja}",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
"{InteligenciaVarejo, CriticaForaLinhaSemOferta}",0.0,5.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"{venda, vendaRestauranteSecao}",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
"{venda, vendaRestaurante}",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
"{venda, vendaSecProduto}",0.0,0.000000,0.000000,40.350515,-11.649485,0.000000,-9.649485,-13.649485,0.000000,0.000000,...,0.0,0.000000,0.0,-15.649485,-15.649485,0.000000,0.000000,0.0,0.000000,0.0
"{{_looqbox}, noInterpretation}",0.0,13.817568,-26.182432,-62.182432,17.817568,-92.182432,15.817568,-40.182432,-88.182432,-62.182432,...,0.0,-82.182432,0.0,0.000000,0.000000,-78.182432,-100.182432,0.0,-108.182432,0.0


In [128]:
# Calculando as semelhanças e distâncias entre itens no conjunto de dados.
# Será utilizado a distância cosseno, os dados já foram centralizados em torno
# de zero, os valores do cosseno agora podem variar de -1 a 1, sendo 1 o mais semelhante e -1 o mínimo.
# A similaridade de cosseno compara duas matrizes NumPy.

# Cosseno de similaridade é a medida do ângulo entre dois documentos no espaço métrico de alta dimensão 

# Obtendo as linhas que serão comparadas
perg_1 = user_ratings_table_normed.loc['{compras, BuscaVouchers}', :].values.reshape(1, -1)
perg_2 = user_ratings_table_normed.loc['{livre, painelVenda}', :].values.reshape(1, -1)

# Transformando em uma matriz NumPy com os valores de pontos
similarity_A = cosine_similarity(perg_1, perg_2)
print(similarity_A)

[[0.45020237]]


In [129]:
# Fazendo as recomendações e encontrando itens mais semelhantes em geral.
# Para isso, é preciso encontrar as semalhanças entre todas as perguntas de uma vez.
# Resultando em uma matriz de similaridade entre todas as perguntas.
# Com essam atriz calculada, será possível fazer recomendações.

# Gerando a matriz de similaridade
similarities = cosine_similarity(user_ratings_table_normed)

# Envolvendo as semelhanças em um DataFrame
cosine_similarity_df = pd.DataFrame(similarities, index=user_ratings_table_normed.index, columns=user_ratings_table_normed.index)

# Encontrando os valores de similaridade para uma pergunta específica
cosine_similarity_series = cosine_similarity_df.loc['{compras, BuscaVouchers}']

# Ordenando os valores do maior para o menor
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Inspecionando as recomendações
ordered_similarities.head(5)




Pergunta
{compras, BuscaVouchers}        1.000000
{compras, BuscaDescontosCPF}    0.457505
{livre, painelVenda}            0.450202
{compras, ClienteCampanha}      0.440650
{venda, vendaNovaGrupo}         0.424849
Name: {compras, BuscaVouchers}, dtype: float64