### Import the data

In [1]:
import pandas as pd 

# Read the sales sample workbook into a DataFrame, then preview its first rows.
df = pd.read_excel(io="../data/data_sample.xlsx")
df.head(n=5)

Unnamed: 0,id_vente,nom_produit,nom_magasin,date_vente,prix_vente
0,176558,USB-C Charging Cable,"917 1st St, Dallas, TX 75001",04/19/19 08:46,5.975
1,176559,Bose SoundSport Headphones,"682 Chestnut St, Boston, MA 02215",04/07/19 22:30,99.99
2,176560,Google Phone,"669 Spruce St, Los Angeles, CA 90001",04/12/19 14:38,600.0
3,176560,Wired Headphones,"669 Spruce St, Los Angeles, CA 90001",04/12/19 14:38,11.99
4,176561,Wired Headphones,"333 8th St, Los Angeles, CA 90001",04/30/19 09:27,11.99


### Calculate the similarity between products

In [107]:
from fuzzywuzzy import fuzz

# Distinct product labels found in the sales data.
list_products = df.nom_produit.unique()

# Lower-case every label once instead of once per compared pair.
lowered_products = [produit.lower() for produit in list_products]

# One record per unordered pair (A listed before B in `list_products`).
# The records are gathered in a plain list and turned into a DataFrame in a
# single call: enlarging a frame with `.loc[len(frame)]` inside the loop gets
# slower with every added row.
comparison_records = [
    {
        'produitA': produitA,
        'produitB': produitB,
        'comparison_score': fuzz.ratio(lowered_products[idx], lowered_b),
    }
    for idx, produitA in enumerate(list_products)
    for produitB, lowered_b in zip(list_products[idx+1:], lowered_products[idx+1:])
]

# Passing `columns` keeps the three expected columns even when there is no pair.
df_comparaison_produit = pd.DataFrame(
    comparison_records,
    columns=['produitA', 'produitB', 'comparison_score'],
)

# Show the 20 most similar pairs first.
df_comparaison_produit.sort_values('comparison_score', ascending=False).head(20)


Unnamed: 0,produitA,produitB,comparison_score
106,AA Batteries (4-pack),AAA Batteries (4-pack),98
150,27in FHD Monitor,20in Monitor,79
55,Wired Headphones,Apple Airpods Headphones,70
4,USB-C Charging Cable,Lightning Charging Cable,68
98,27in 4K Gaming Monitor,27in FHD Monitor,68
99,27in 4K Gaming Monitor,20in Monitor,65
24,Bose SoundSport Headphones,Apple Airpods Headphones,64
155,27in FHD Monitor,34in Ultrawide Monitor,63
19,Bose SoundSport Headphones,Wired Headphones,62
104,27in 4K Gaming Monitor,34in Ultrawide Monitor,59


### Identification of product groups 

In [102]:
# Minimum fuzz.ratio score for two product labels to be placed in the same group.
SIMILARITY_THRESHOLD = 43

list_products = df.nom_produit.unique()
list_grp_products = []
produit_already_used = []

# Greedy single pass: each label A is compared with every label B that comes
# after it, and any of the two not yet assigned to a group joins A's group.
# NOTE(review): the result depends on the order of `list_products`, and a label
# whose matches were all claimed by earlier groups ends up alone in its group.
for idx, produitA in enumerate(list_products):
    grpe_product = []
    produitA_lower = produitA.lower()  # invariant for the whole inner loop
    for produitB in list_products[idx+1:]:
        if fuzz.ratio(produitA_lower, produitB.lower()) < SIMILARITY_THRESHOLD:
            continue
        for product in (produitA, produitB):
            if product not in produit_already_used:
                grpe_product.append(product)
                produit_already_used.append(product)

    # Only keep (and print) groups that received at least one label.
    if grpe_product:
        list_grp_products.append(grpe_product)
        print(grpe_product)

['USB-C Charging Cable', 'Lightning Charging Cable']
['Bose SoundSport Headphones', 'Wired Headphones', 'Apple Airpods Headphones']
['Google Phone', 'iPhone', 'Vareebadd Phone']
['Macbook Pro Laptop', 'ThinkPad Laptop']
['LG Washing Machine']
['27in 4K Gaming Monitor', '27in FHD Monitor', '20in Monitor', '34in Ultrawide Monitor']
['AA Batteries (4-pack)', 'AAA Batteries (4-pack)']


### Identify the product within each group

In [103]:
from collections import Counter
from nltk import ngrams

def identify_common_words(strings, threshold=2):
    """Return the word or bigram that the given labels share most often.

    Every label is lower-cased and split on whitespace. Single words and
    bigrams (two adjacent words) are counted over the whole collection, and
    only those seen at least ``threshold`` times are kept as candidates.

    Bigrams are built inside each label separately, so the last word of one
    label and the first word of the next one never form a spurious pair.

    Args:
        strings: iterable of label strings.
        threshold: minimum number of occurrences for a word or bigram to be
            considered common.

    Returns:
        The most frequent candidate as a string (the two words of a bigram are
        joined by a space). When the best bigram occurs at least as often as
        the best word, the bigram is returned. An empty list is returned when
        no word or bigram reaches ``threshold``; this return value is kept
        as-is for backward compatibility with existing callers.
    """
    # One lower-cased token list per label.
    tokenized = [[token.lower() for token in label.split()] for label in strings]

    # Candidate single words.
    words_counts = Counter(token for tokens in tokenized for token in tokens)
    common_words = [(word, count) for word, count in words_counts.items() if count >= threshold]

    # Candidate bigrams; zip(tokens, tokens[1:]) pairs each token with its successor
    # within the same label only.
    bigrams_counts = Counter(
        bigram for tokens in tokenized for bigram in zip(tokens, tokens[1:])
    )
    common_bigrams = [(bigram, count) for bigram, count in bigrams_counts.items() if count >= threshold]

    if not common_bigrams and not common_words:
        return []

    # max() keeps the first candidate among equally frequent ones.
    best_bigram, best_bigram_count = max(common_bigrams, key=lambda item: item[1], default=(None, 0))
    best_word, best_word_count = max(common_words, key=lambda item: item[1], default=(None, 0))

    # A bigram is more specific than a single word, so it wins ties.
    if best_bigram is not None and best_bigram_count >= best_word_count:
        return ' '.join(best_bigram)
    return best_word

# Print every product group next to the label inferred for it.
for grp_product in list_grp_products:
    print(grp_product, ' -> ', identify_common_words(grp_product))



['USB-C Charging Cable', 'Lightning Charging Cable']  ->  charging cable
['Bose SoundSport Headphones', 'Wired Headphones', 'Apple Airpods Headphones']  ->  headphones
['Google Phone', 'iPhone', 'Vareebadd Phone']  ->  phone
['Macbook Pro Laptop', 'ThinkPad Laptop']  ->  laptop
['LG Washing Machine']  ->  []
['27in 4K Gaming Monitor', '27in FHD Monitor', '20in Monitor', '34in Ultrawide Monitor']  ->  monitor
['AA Batteries (4-pack)', 'AAA Batteries (4-pack)']  ->  batteries (4-pack)


### Identify the brand within each product label

In [108]:
import spacy

nlp = spacy.load('en_core_web_sm')  # Load the spaCy pipeline named 'en_core_web_sm' (an English model, not a French one)


def identify_brand_within_string(txt):
    """Return the texts of the ORG entities found in the words of ``txt``.

    ``txt`` is split on single spaces and each piece is run through the
    module-level ``nlp`` pipeline on its own; every entity labelled "ORG" is
    collected, in order of appearance.

    Args:
        txt: the string to scan (for example a product label).

    Returns:
        A list of entity texts; empty when no ORG entity is detected.
    """
    return [
        entity.text
        for word in txt.split(' ')
        for entity in nlp(word).ents
        if entity.label_ == "ORG"  # keep organisation entities only
    ]

# Usage example.
# NOTE(review): the sample sentence is French while the pipeline loaded above is
# 'en_core_web_sm', and the returned list is not displayed because this call is
# not the last expression of the cell.
texte = "J'adore boire du Coca-Cola et porter des chaussures Nike."
identify_brand_within_string(texte)

# Print, for every distinct product label, the ORG entities detected in it.
for produit in df.nom_produit.unique():
    print(produit, ' -> ', identify_brand_within_string(produit))



USB-C Charging Cable  ->  []
Bose SoundSport Headphones  ->  ['SoundSport']
Google Phone  ->  ['Google']
Wired Headphones  ->  []
Macbook Pro Laptop  ->  []
Lightning Charging Cable  ->  []
27in 4K Gaming Monitor  ->  []
AA Batteries (4-pack)  ->  []
Apple Airpods Headphones  ->  ['Apple']
AAA Batteries (4-pack)  ->  ['AAA']
iPhone  ->  []
Flatscreen TV  ->  []
27in FHD Monitor  ->  ['FHD']
20in Monitor  ->  []
LG Dryer  ->  ['LG']
ThinkPad Laptop  ->  ['ThinkPad']
Vareebadd Phone  ->  []
LG Washing Machine  ->  ['LG']
34in Ultrawide Monitor  ->  []
