In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
# this is an implementation to match a product (entered in free text by the user) with a list of actual products

In [3]:
alertfr = pd.read_csv('alertsfr.csv', sep=';',encoding='utf-8')
productfr = pd.read_csv('productsfr.csv',sep=';',encoding="latin-1")

In [4]:
alertfr

Unnamed: 0,product
0,IPAD MINI 4 32 GO WIFI + 4g Or Débloqué
1,EXPRESSO À CAPSULES Dolce Gusto Krups Yy1786fd
2,ACER SWIFT Sf314-51-39zj 14 Core I3-6006u 2 Gh...
3,IPAD 4 128 GO WIFI NOIR
4,COMPTEUR DE FANS Facebook 5 Chiffres
5,ASUS ZENFONE 2 SELFIE 32 Go Bleu Débloqué
6,DORO 8031 8 GO NOIR Débloqué
7,VENTILATEUR DYSON Cool Am07
8,LG G6 32 GO NOIR Débloqué
9,"IPAD MINI 4 7,9'' 32 GO Wifi Argent"


In [5]:
productfr.head()

Unnamed: 0,product,description
0,"ADVANCE PC Tour 19 CORE-I3-4160 3,6 GHz - HD...",ADVANCE core I5 + Ecran SYNCMASTER SA450 SAMSU...
1,"ALIENWARE 18 17,3 Intel Core i7 2,4 GHz GHz ...","Avec cet Alienware 18, régnez en maître sur v..."
2,"ALIENWARE 18-7312 18,4 Core i7 2,7 GHz - SSD...",<div><strong><br>Ordinateur portable ALIENWARE...
3,"ARCHOS 101 Cesium 10,1 Atom Z3735F 1,33 GHz ...",<div><br></div><div><br></div><h3><strong>ARCH...
4,"ARCHOS 140 Cesium 14 Atom Z3735F 1,33 GHz - ...",<div><br></div><h3><strong>ARCHOS 140 Cesium 1...


In [6]:
# create a set of punctuations and regex to use later
exclude = set(string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))

In [7]:
def stripp(x, regex):
    r = x.lower()
    r = re.sub('\s+',' ',re.sub(regex,'',re.sub('<\w+>|</\w+>',' ',r)).strip(' '))
    r = re.sub('é','e', r)
    r = re.sub('è','e', r)
    r = re.sub('à', 'a', r)
    if re.match('((\w+)(\d))',r):
        r = re.search('((\w+)(\d))',r).group(2)+' '+re.search('((\w+)(\d))',r).group(3)
    r = r.split(' ')
    r = [i for i in r if i not in exclude]
    r = ' '.join(r)
    return r

In [8]:
# clean text for productfr dataset
productfr['clean_product'] = productfr['product'].apply(lambda x: stripp(x, regex))

In [9]:
productfr['clean_product'].head()

0    advance pc tour 19 corei34160 36 ghz hdd 500 g...
1    alienware 18 173 intel core i7 24 ghz ghz ssd ...
2    alienware 187312 184 core i7 27 ghz ssd 64 go ...
3    archos 101 cesium 101 atom z3735f 133 ghz ssd ...
4    archos 140 cesium 14 atom z3735f 133 ghz ssd 3...
Name: clean_product, dtype: object

In [10]:
# clean text for alertfr dataset
alertfr['clean_product'] = alertfr['product'].apply(lambda x: stripp(x, regex))

In [11]:
alertfr['clean_product']

0                 ipad mini 4 32 go wifi 4g or debloque
1        expresso a capsules dolce gusto krups yy1786fd
2     acer swift sf3145139zj 14 core i36006u 2 ghz s...
3                               ipad 4 128 go wifi noir
4                  compteur de fans facebook 5 chiffres
5             asus zenfone 2 selfie 32 go bleu debloque
6                          doro 8031 8 go noir debloque
7                           ventilateur dyson cool am07
8                             lg g6 32 go noir debloque
9                      ipad mini 4 79 32 go wifi argent
10              robot pâtissier kitchenaid 5ksm150psecr
11               ecouteurs samsung gear iconx 4 go noir
12       expresso a capsules dolce gusto krups yy1786fd
13                   galaxy a5 2017 32 go noir debloque
14                                    xperia zx premium
15                                            iphone 6s
16           robot multifonctions russell hobbs rh19006
17                                             i

In [12]:
# create a function that looks up intersection. this could have been done using set.intersection(), however, 
# we can notice that there are gramar mistakes. this function deals with simple grammar mistakes that concern mostly a reference
# one example is the Sony Experia XZ, the alert had ZX and the product had XZ
def intersect(alert,product):
    inter = set()
    for i in alert:
        if i in product:
            inter.add(i)
        elif i[::-1] in product:
            inter.add(i[::-1])
    return inter

In [13]:
# create a function that calculate the proportion of similarity between an alert and a product
def sim(alert, df_product):
    t2 = df_product.apply(lambda x: set(x.split(' ')))
    prop = t2.apply(lambda x: len(intersect(alert,x))/(len(alert)+len(x))).sort_values(ascending=False)
    ind_top_7 = prop[:7].index
    return ind_top_7

#### Test all alerts

In [14]:
for row in alertfr.itertuples():
    t1 = set(row[2].split(' '))
    inx = sim(t1, productfr['clean_product'])
    print('-------------------------- \n')
    print(row[1])
    print(productfr['product'].loc[inx])
    print('\n-------------------------- \n')

-------------------------- 

IPAD MINI 4 32 GO WIFI + 4g Or Débloqué
12586            IPAD MINI 4 32 GO WIFI + 4g Or - Débloqué
20610        iPad mini 4 16 Go - Wifi + 4G - Or - Débloqué
20617        iPad mini 4 64 Go - Wifi + 4G - Or - Débloqué
20604       iPad mini 4 128 Go - Wifi + 4G - Or - Débloqué
20599    iPad mini 4 - 7,9'' 32 Go - Wifi + 4G - Argent...
20614    iPad mini 4 32 Go - Wifi + 4G - Gris sidéral -...
20601                iPad mini 4 - 7,9'' 32 Go - Wifi - Or
Name: product, dtype: object

-------------------------- 

-------------------------- 

EXPRESSO À CAPSULES Dolce Gusto Krups Yy1786fd
9725       Expresso à capsules Dolce Gusto Krups YY1786FD
9722       Expresso à capsules Dolce Gusto Krups KP100BIB
9723       Expresso à capsules Dolce Gusto Krups KP1105ES
9720       Expresso à capsules Dolce Gusto KRUPS YY1500FD
9721    Expresso à capsules Dolce Gusto Krups KP1008 Y...
9724    Expresso à capsules Dolce Gusto Krups KP110H Oblo
9717         Expresso à capsule Dol

Name: product, dtype: object

-------------------------- 

-------------------------- 

ASUS NEXUS 7 7 32 GO  Noir
2158                     Asus Nexus 7 - 7 32 Go -  - Noir
2157                     Asus Nexus 7 - 7 32 Go -  - Noir
2156                Asus Nexus 7 - 7 16  Go - Wifi - Noir
1889    Asus Asus Google Nexus 7 32Go 2ème Génération ...
2775                      Asus ZenPad - 7 16 Go -  - Noir
1874             Asus 90NP01Z1-M00510 - 7 16 Go -  - Noir
2820              Asus Zenfone Go 32 Go - Noir - Débloqué
Name: product, dtype: object

-------------------------- 

-------------------------- 

MacBook Pro Retina 154
15008    MacBook Pro Retina 13 Core i7 3 GHz  - SSD 256...
15009    MacBook Pro retina 13 Core i5 2 GHz  - SSD 256...
19726                                      Tomtom PRO 7100
14743    MacBook Pro 13 Core 2 duo 2 GHz  - HDD 160 Go ...
14834    MacBook Pro 13 Core i5 2.9 GHz  - SSD 512 Go -...
14845    MacBook Pro 13 Core i7 2.7 GHz  - HDD 750 Go -...
14844    MacBo

### Suggested improvements:

- improve the misspelled words detection function
- implement the model with other languages
- find a common representation of different products for different languages
- use the description of every product as a weight in case of equal proportion
- use of ngrams and TD-IDF for refined lookup