In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler

In [2]:
data = pd.read_csv('01_General_Shopping_Clean.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  240 non-null    int64  
 1   App Name              240 non-null    object 
 2   App Package Name      240 non-null    object 
 3   Subgenre              240 non-null    object 
 4   Downloads             240 non-null    int64  
 5   Cumulative Downloads  240 non-null    int64  
 6   Paid Downloads        240 non-null    int64  
 7   Organic Downloads     240 non-null    int64  
 8   Active Users          240 non-null    int64  
 9   Install Base          240 non-null    int64  
 10  Avg Session per User  240 non-null    float64
 11  Avg Session Duration  240 non-null    object 
 12  Avg time per User     240 non-null    object 
 13  Avg Active Days       240 non-null    float64
 14  gender                240 non-null    object 
 15  Tier                  2

In [4]:
data.columns

Index(['Rank', 'App Name', 'App Package Name', 'Subgenre', 'Downloads',
       'Cumulative Downloads', 'Paid Downloads', 'Organic Downloads',
       'Active Users', 'Install Base', 'Avg Session per User',
       'Avg Session Duration', 'Avg time per User', 'Avg Active Days',
       'gender', 'Tier', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19'],
      dtype='object')

In [5]:
#data = data[data['Rank'] > 10]

In [9]:
data['Genre_Subgenre'] = data['gender'] + ' ' + data['Subgenre']+' ' + data['Tier']

# Crie a matriz TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Genre_Subgenre'])

In [10]:
# Converter a matriz TF-IDF em um DataFrame do pandas
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names())

# Adicionar a coluna 'App Name' ao DataFrame
tfidf_df['App Name'] = data['App Name']

# Reordenar as colunas para colocar 'App Name' como a primeira coluna
column_order = ['App Name'] + [col for col in tfidf_df.columns if col != 'App Name']
tfidf_df = tfidf_df[column_order]

# Exibir as primeiras linhas do DataFrame
print(tfidf_df.head())

                        App Name       b2c  bnpl       c2c  commerce  coupons  \
0  SHEIN-Fashion Shopping Online  0.000000   0.0  0.000000  0.498089      0.0   
1  Shopee: Compre de Tudo Online  0.000000   0.0  0.864375  0.341912      0.0   
2  Mercado Libre: Compras Online  0.000000   0.0  0.820421  0.324525      0.0   
3  Magalu: loja e compras online  0.796303   0.0  0.000000  0.411301      0.0   
4     Americanas: compras online  0.796303   0.0  0.000000  0.411301      0.0   

   drugstore    female      male  pharmacy  resell  retailer  rewards  \
0        0.0  0.537137  0.000000       0.0     0.0  0.680728      0.0   
1        0.0  0.368716  0.000000       0.0     0.0  0.000000      0.0   
2        0.0  0.000000  0.470736       0.0     0.0  0.000000      0.0   
3        0.0  0.443546  0.000000       0.0     0.0  0.000000      0.0   
4        0.0  0.443546  0.000000       0.0     0.0  0.000000      0.0   

   shopping  
0       0.0  
1       0.0  
2       0.0  
3       0.0  
4   

In [11]:
# Normalizar os valores numéricos entre 0 e 1
scaler = MinMaxScaler()
numeric_data = scaler.fit_transform(data[['Active Users']])

# Combine os recursos numéricos e de texto
combined_features = tfidf_matrix.toarray() * 0.8 + numeric_data * 0.2

# Crie o DataFrame de similaridade usando o linear_kernel
similarity_matrix = linear_kernel(combined_features, combined_features)


In [12]:
similarity_dataframe = pd.DataFrame(similarity_matrix, index=data['App Name'], columns=data['App Name'])

In [13]:
similarity_dataframe

App Name,SHEIN-Fashion Shopping Online,Shopee: Compre de Tudo Online,Mercado Libre: Compras Online,Magalu: loja e compras online,Americanas: compras online,Amazon Shopping,Casas Bahia: Compras Online,AliExpress,OLX - Venda e Compra Online,O Boticário,...,StockX- Access the Now,Lista de Compras - List,Babba - Baby Cry Translator,Food List Tracking & Shopping,That Shopping List,Shopping list voice input PRO,Our Groceries Shopping List,Louis Vuitton,Shopping list — Lister,Menores Preços Dudu Rocha
App Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SHEIN-Fashion Shopping Online,0.971902,0.841980,0.674015,0.633764,0.662830,0.449291,0.519772,0.468145,0.299508,0.789966,...,0.214538,0.227025,0.195618,0.226835,0.227005,0.226842,0.227043,0.757974,0.226954,0.236992
Shopee: Compre de Tudo Online,0.841980,1.664001,1.495746,0.825768,0.870829,0.680875,0.649044,0.710104,0.534935,0.559922,...,0.783413,0.304339,0.276989,0.304044,0.304308,0.304054,0.304367,0.510324,0.304228,0.351744
Mercado Libre: Compras Online,0.674015,1.495746,1.559847,0.680119,0.723215,0.808503,0.511101,0.836458,0.627689,0.404255,...,0.878572,0.194416,0.186774,0.194133,0.194386,0.194144,0.194442,0.356820,0.194309,0.493722
Magalu: loja e compras online,0.633764,0.825768,0.680119,1.008583,1.038429,0.822933,0.891532,0.842293,0.314757,0.446946,...,0.208311,0.212858,0.185924,0.212662,0.212837,0.212669,0.212876,0.414095,0.212784,0.604936
Americanas: compras online,0.662830,0.870829,0.723215,1.038429,1.070046,0.851524,0.914429,0.872033,0.339817,0.464922,...,0.223401,0.225149,0.197737,0.224942,0.225127,0.224949,0.225168,0.430122,0.225071,0.620587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Shopping list voice input PRO,0.226842,0.304054,0.194144,0.212669,0.224949,0.089714,0.164509,0.097680,0.088168,0.149976,...,0.000020,0.640095,0.080301,0.640015,0.640087,0.640018,0.640103,0.136460,0.640065,0.000020
Our Groceries Shopping List,0.227043,0.304367,0.194442,0.212876,0.225168,0.089912,0.164668,0.097886,0.088342,0.150100,...,0.000125,0.640181,0.080383,0.640100,0.640172,0.640103,0.640188,0.136571,0.640150,0.000129
Louis Vuitton,0.757974,0.510324,0.356820,0.414095,0.430122,0.238856,0.351242,0.249252,0.115062,0.657658,...,0.103471,0.136561,0.108668,0.136456,0.136550,0.136460,0.136571,0.640018,0.136521,0.121797
Shopping list — Lister,0.226954,0.304228,0.194309,0.212784,0.225071,0.089824,0.164597,0.097794,0.088264,0.150045,...,0.000078,0.640143,0.080346,0.640062,0.640134,0.640065,0.640150,0.136521,0.640112,0.000080


In [14]:
numeric_data

array([[4.29637680e-01],
       [1.00000000e+00],
       [9.22753486e-01],
       [4.75276848e-01],
       [5.33645849e-01],
       [4.26375873e-01],
       [2.46360674e-01],
       [4.64237499e-01],
       [4.19038472e-01],
       [6.42778955e-02],
       [6.60648392e-03],
       [1.02557147e-01],
       [1.23218173e-01],
       [2.32128640e-02],
       [5.39045569e-02],
       [4.18354430e-02],
       [6.28867098e-02],
       [1.15997396e-01],
       [2.72544123e-02],
       [9.43099695e-03],
       [4.89702321e-02],
       [5.04469568e-02],
       [1.12891753e-01],
       [6.28788153e-02],
       [6.79068203e-03],
       [8.39854110e-03],
       [2.98814052e-02],
       [6.00699343e-02],
       [8.79510642e-03],
       [7.16943981e-02],
       [5.37970837e-02],
       [1.73363321e-03],
       [4.90014289e-02],
       [4.80390818e-03],
       [3.90183086e-02],
       [6.68045024e-03],
       [3.55301845e-02],
       [7.36284446e-02],
       [4.25310807e-03],
       [7.66679489e-03],


In [15]:
combined_features

array([[8.59275359e-02, 8.59275359e-02, 8.59275359e-02, ...,
        6.30510275e-01, 8.59275359e-02, 8.59275359e-02],
       [2.00000000e-01, 2.00000000e-01, 8.91500209e-01, ...,
        2.00000000e-01, 2.00000000e-01, 2.00000000e-01],
       [1.84550697e-01, 1.84550697e-01, 8.40887737e-01, ...,
        1.84550697e-01, 1.84550697e-01, 1.84550697e-01],
       ...,
       [6.49055551e-06, 6.49055551e-06, 6.49055551e-06, ...,
        5.44589230e-01, 6.49055551e-06, 6.49055551e-06],
       [5.34540083e-05, 5.34540083e-05, 5.34540083e-05, ...,
        5.34540083e-05, 5.34540083e-05, 7.34343214e-01],
       [5.91689174e-01, 8.20402628e-06, 8.20402628e-06, ...,
        8.20402628e-06, 8.20402628e-06, 8.20402628e-06]])

In [16]:
similarity_matrix

array([[9.71902414e-01, 8.41979590e-01, 6.74014763e-01, ...,
        7.57974348e-01, 2.26953612e-01, 2.36991587e-01],
       [8.41979590e-01, 1.66400108e+00, 1.49574637e+00, ...,
        5.10324122e-01, 3.04227825e-01, 3.51743954e-01],
       [6.74014763e-01, 1.49574637e+00, 1.55984699e+00, ...,
        3.56819609e-01, 1.94309444e-01, 4.93722466e-01],
       ...,
       [7.57974348e-01, 5.10324122e-01, 3.56819609e-01, ...,
        6.40017821e-01, 1.36521366e-01, 1.21797284e-01],
       [2.26953612e-01, 3.04227825e-01, 1.94309444e-01, ...,
        1.36521366e-01, 6.40112484e-01, 8.02948318e-05],
       [2.36991587e-01, 3.51743954e-01, 4.93722466e-01, ...,
        1.21797284e-01, 8.02948318e-05, 6.40021997e-01]])

In [23]:
# Função de recomendação
def recommend_app(app_name):
    app_index = data[data['App Name'] == app_name].index[0]
    similarity_scores = list(enumerate(similarity_matrix[app_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Obtenha os índices dos aplicativos mais similares (excluindo o próprio aplicativo)
    similar_app_indices = [score[0] for score in similarity_scores if data.iloc[score[0]]['Rank'] > 10 and data.iloc[score[0]]['App Name'] != app_name]
    
    # Obtenha os índices dos 5 aplicativos mais similares
    similar_app_indices = similar_app_indices[:5]

    # Calcule os índices de afinidade
    affinity_indices = [score[1] for score in similarity_scores if data.iloc[score[0]]['Rank'] > 10][1:6]

    # Mostre os aplicativos recomendados com seus índices de afinidade
    recommended_apps = data.iloc[similar_app_indices]
    recommended_apps['Affinity Index'] = affinity_indices
    return recommended_apps[['App Name', 'App Package Name', 'gender', 'Subgenre', 'Affinity Index']]

In [31]:
# Teste a função de recomendação
app_name = "Lojas Renner - Comprar Roupas"
print(recommend_app(app_name))

                          App Name           App Package Name  gender  \
11  enjoei: comprar e vender roupa          br.com.enjoei.app  female   
21     Riachuelo – Compre pelo app       br.com.riachuelo.app  female   
32   Loja C&A: Roupas, Sapatos e +          br.com.cea.appb2c  female   
34   Marisa: Moda e Comprar Roupas      br.com.marisa.android  female   
18   Natura: perfumes e cosméticos  net.natura.semprepresente  female   

                 Subgenre  Affinity Index  
11  E-Commerce (Retailer)        0.706191  
21  E-Commerce (Retailer)        0.688741  
32  E-Commerce (Retailer)        0.688257  
34  E-Commerce (Retailer)        0.684914  
18  E-Commerce (Retailer)        0.680974  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_apps['Affinity Index'] = affinity_indices


O código fornecido não está utilizando Naive Bayes para recomendação. Em vez disso, ele usa uma abordagem baseada em similaridade de conteúdo, utilizando a matriz TF-IDF e o cálculo do cosseno da similaridade (linear_kernel).

Vou explicar novamente o código e como ele funciona:

O código combina as colunas 'Genre' e 'Subgenre' para criar uma nova coluna chamada 'Genre_Subgenre'.
Em seguida, cria uma matriz TF-IDF usando a coluna 'Genre_Subgenre'.
Normaliza a coluna 'Avg Session per User' para ter valores entre 0 e 1.
Combina os recursos numéricos e de texto em uma matriz chamada 'combined_features'.
Calcula a matriz de similaridade usando o cosseno da similaridade (linear_kernel) com base nos recursos combinados.
Define a função recommend_app() que recebe o nome de um aplicativo e retorna uma lista de aplicativos recomendados com base na similaridade do conteúdo.
A matriz TF-IDF é utilizada para calcular a similaridade de texto entre aplicativos, e a coluna 'Avg Session per User' é usada como um recurso numérico adicional. O cálculo da similaridade é realizado com base nos recursos combinados, e os aplicativos mais similares são selecionados como recomendações.

No entanto, se você ainda estiver interessado em utilizar Naive Bayes para recomendação, por favor, deixe-me saber e eu posso tentar fornecer uma solução alternativa. Mas, tenha em mente que Naive Bayes é um algoritmo de classificação supervisionado e pode não ser a melhor opção para sistemas de recomendação baseados em conteúdo. A abordagem atual usando similaridade de conteúdo é mais adequada para esse caso.



In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer

In [20]:
num_features = ['Avg Session per User']
cat_features = ['gender']
target = 'Subgenre'

In [21]:
# Normalizar os valores numéricos entre 0 e 1
scaler = MinMaxScaler()
data[num_features] = scaler.fit_transform(data[num_features])

In [22]:
# One-hot encoding para características categóricas
column_transformer = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), cat_features)],
    remainder='passthrough'
)
X = column_transformer.fit_transform(data[num_features + cat_features])
y = data[target]

In [23]:
# Divida os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Treine o classificador Naive Bayes
clf = GaussianNB()
clf.fit(X_train, y_train)

# Teste o classificador e verifique a precisão
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


                       precision    recall  f1-score   support

                 BNPL       0.00      0.00      0.00         1
    Coupons & Rewards       0.33      1.00      0.50         1
     E-Commerce (B2C)       0.00      0.00      0.00        12
     E-Commerce (C2C)       0.50      0.20      0.29         5
E-Commerce (Retailer)       0.49      0.91      0.63        22
       Other Shopping       0.00      0.00      0.00         3
 Pharmacy & Drugstore       0.00      0.00      0.00         3
               Resell       0.00      0.00      0.00         1

             accuracy                           0.46        48
            macro avg       0.17      0.26      0.18        48
         weighted avg       0.28      0.46      0.33        48



  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
def recommend_app(app_name):
    app_data = data[data['App Name'] == app_name].iloc[0]
    app_features = column_transformer.transform(app_data[num_features + cat_features].to_frame().T)

    # Prever o subgênero do aplicativo de entrada
    predicted_subgenre = clf.predict(app_features)[0]

    # Filtrar aplicativos com o mesmo subgênero previsto e não no top 10
    similar_apps = data[(data['Subgenre'] == predicted_subgenre) & (data['Rank'] > 10)]

    # Retornar os 5 aplicativos mais similares
    return similar_apps[['App Name', 'App Package Name', 'Subgenre']].head(5)

In [26]:
# Teste a função de recomendação
app_name = "Decathlon: Loja de Esportes"
print(recommend_app(app_name))

                          App Name           App Package Name  \
10  MadeiraMadeira: compras online      br.com.madeiramadeira   
11  enjoei: comprar e vender roupa          br.com.enjoei.app   
12      Netshoes: Loja de Esportes        br.com.netshoes.app   
17   Lojas Renner - Comprar Roupas         br.com.lojasrenner   
18   Natura: perfumes e cosméticos  net.natura.semprepresente   

                 Subgenre  
10  E-Commerce (Retailer)  
11  E-Commerce (Retailer)  
12  E-Commerce (Retailer)  
17  E-Commerce (Retailer)  
18  E-Commerce (Retailer)  
