In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import FloatProgress
from IPython.display import display
import time

In [91]:
#Utility functions
#launch a progress bar
def initProgBar(maxIter , desc = 'loading...', stepPB = 1):
    """Create and display a progress bar graduated from 0 to 100.

    Parameters:
        maxIter: number of iterations the bar is expected to cover.
        desc: text passed to the widget as its description.
        stepPB: number of iterations represented by one call to updateProgBar.

    Returns:
        The FloatProgress widget, after it has been displayed.
    """
    if maxIter > 0:
        step = stepPB * 100 / maxIter
    else:
        # Nothing to iterate over: avoid dividing by zero and let a single
        # update fill the whole bar.
        step = 100
    f = FloatProgress(min=0, max=100, value=0, step=step , description = desc)
    display(f)
    return f

#update at every iteration the progress bar
def updateProgBar(f):
    """Advance the progress bar `f` by one step and hand it back.

    The increment is the bar's own `step` attribute; the same object that was
    passed in is returned.
    """
    advanced = f.value + f.step
    f.value = advanced
    return f

#example of use:
#import time
#f=initProgBar(50)
#for i in range(50):
#    time.sleep(0.1)
#    updateProgBar(f)

#export a list in external .txt file
def exportList(filename, listProd):
    """Write the items of `listProd` to the text file `filename`, one per line.

    Each item is converted with str() and followed by a newline. An existing
    file is overwritten.

    Parameters:
        filename: path of the file to create.
        listProd: iterable of items to export.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(filename, "w") as out:
        for item in listProd:
            out.write(str(item) + '\n')

In [2]:
# Load the training set from a path relative to the notebook's working
# directory; dtype='unicode' asks pandas to read the columns as text.
train_df = pd.read_csv("data/training/data_train.csv", dtype='unicode')

In [3]:
# Distinct level-1 categories, kept in order of first appearance.
# Every value goes through str() first, so a missing value is listed as 'nan'.
categories = train_df['category_lvl_1'].map(str).unique().tolist()

print(categories)

['Fashion', 'Health & Beauty', 'TV, Audio / Video, Gaming & Wearables', 'Computers & Laptops', 'Cameras', 'Home & Living', 'Watches Sunglasses Jewellery', 'Mobiles & Tablets', 'Home Appliances']


In [4]:
def genSeq(string):
    """List every run of consecutive words found in `string`.

    The string is split on whitespace. Runs are ordered by the position of
    their first word, then by increasing length. Each run is a list of words.
    """
    words = string.split()
    total = len(words)
    return [
        words[first:stop]
        for first in range(total)
        for stop in range(first + 1, total + 1)
    ]

def lisToString(liste):
    """Concatenate the items of `liste`, each converted with str() and
    followed by one space (so a non-empty result ends with a space)."""
    return ''.join(str(element) + ' ' for element in liste)
    
def simString(string1, string2):
    """Map every word run shared by both strings to its length in words.

    Keys are the shared runs rendered by lisToString (hence the trailing
    space); values are the number of words in the run.
    """
    runs_first = genSeq(string1)
    runs_second = genSeq(string2)

    shared = {}
    for run in runs_first:
        # A run is shared when the very same word list also occurs in string2.
        if run in runs_second:
            shared[lisToString(run)] = len(run)
    return shared
    
def similarity(string1, string2):
    """Find the longest run of consecutive words common to both strings.

    Returns:
        A tuple (length, run): `length` is the number of words of the longest
        shared run and `run` is its text as produced by simString (with a
        trailing space). When the strings share no word the result is (0, '').
        If several runs share the maximal length, the first one met while
        iterating over simString's result is returned.
    """
    simseqs = simString(string1, string2)
    if not simseqs:
        return 0, ''

    # max() keeps the first maximal entry, like a strict '>' comparison loop.
    seqM, lenM = max(simseqs.items(), key=lambda entry: entry[1])
    return lenM, seqM


In [5]:
# Quick manual check of the sequence helpers on two short sentences.
string1="my name is James Bond"
string2="James Bond is a spy"
# every run of consecutive words of the first sentence
print(genSeq(string1))
# runs present in both sentences, with their length in words
print(simString(string1, string2))
# longest shared run as (length, text)
print(similarity(string1, string2))

[['my'], ['my', 'name'], ['my', 'name', 'is'], ['my', 'name', 'is', 'James'], ['my', 'name', 'is', 'James', 'Bond'], ['name'], ['name', 'is'], ['name', 'is', 'James'], ['name', 'is', 'James', 'Bond'], ['is'], ['is', 'James'], ['is', 'James', 'Bond'], ['James'], ['James', 'Bond'], ['Bond']]
{'James Bond ': 2, 'James ': 1, 'Bond ': 1, 'is ': 1}
(2, 'James Bond ')


In [87]:
#extract product names from title : "usb charger for iphone" => "usb charger"
def extractProdNames(train_def_cat):
    """Extract candidate product names from the titles of a data frame.

    Titles are clustered with groupLongSeq. For each cluster, the longest
    word run shared by its first two titles is kept as the product name.
    A cluster holding a single title contributes that title unchanged.

    Parameters:
        train_def_cat: data frame with a 'title' column (for instance the
            rows of one category).

    Returns:
        A list with one entry per cluster. The elapsed time is printed.
    """
    start = time.time()

    # One string per title. Iterating over the column values (instead of
    # looking rows up by label) also works on frames filtered from a larger
    # one, whose index no longer starts at 0.
    liste = [str(title) for title in train_def_cat['title']]

    #group by longest common sequence
    clusters = groupLongSeq(liste, seq_avg_lgt = 2)

    #extract the longest common sequence string : supposed to be product name
    listProd = []
    for cluster in clusters:
        if len(cluster) > 1:
            listProd.append(similarity(cluster[0], cluster[1])[1])
        else:
            listProd.append(cluster[0])
    end = time.time()
    print("Total time for extracting Product Names: "+str(end - start))
    return listProd
    
#group products by longest seq
def groupLongSeq(liste, seq_avg_lgt = 2):
    """Cluster titles by the length of their longest common word run.

    A symmetric matrix is filled with similarity(a, b)[0] for every pair of
    titles. The diagonal (a title compared with itself) is set to
    `seq_avg_lgt`. The matrix is then handed to groupSimMatrix.

    Parameters:
        liste: list of title strings.
        seq_avg_lgt: value stored on the diagonal of the matrix.

    Returns:
        A list of clusters, each cluster being a list of titles. An empty
        input gives an empty list.
    """
    count = len(liste)
    if count == 0:
        # No title: nothing to compare, and a progress bar sized for zero
        # iterations cannot be built.
        return []

    simMatrix = np.zeros((count, count))
    # The loops below run once per diagonal cell and once per pair above it.
    f2 = initProgBar(count * (count + 1) / 2, desc = "Building matrix of similarities:")

    for i in range(count):
        simMatrix[i, i] = seq_avg_lgt
        updateProgBar(f2)
        for j in range(i + 1, count):
            # similarity is symmetric: fill both halves from one computation
            simMatrix[i, j] = similarity(liste[i], liste[j])[0]
            simMatrix[j, i] = simMatrix[i, j]
            updateProgBar(f2)
    pairList = []
    return groupSimMatrix(simMatrix, pairList, liste)

#process the matrix of similarities to cluster the titles
def groupSimMatrix(simMatrix, pairList,liste):
    """Cluster titles from their matrix of pairwise similarities.

    The first remaining title always opens a new cluster. A title j joins it
    when its similarity with that first title equals the best similarity found
    on the first row, and no remaining row other than the first has a higher
    value in column j. Clustered titles are then removed from both the matrix
    and the list, and the process repeats until the matrix is empty.

    Parameters:
        simMatrix: square numpy array; simMatrix[i][j] is the similarity
            between liste[i] and liste[j].
        pairList: list to which the clusters are appended (modified in place).
        liste: titles, in the same order as the rows of simMatrix.

    Returns:
        `pairList`, holding one list of titles per cluster.
    """
    matrix = simMatrix
    # Work on a copy so the caller's list is left untouched.
    remaining = list(liste)

    # A loop instead of one recursive call per cluster, so that long inputs
    # cannot hit the interpreter's recursion limit.
    while matrix.shape[0] > 0:
        length = matrix.shape[0]

        #let s find the maximal similarity between the first item and the others
        valueMax = 0
        for j in range(1, length):
            if matrix[0, j] > valueMax:
                valueMax = matrix[0, j]

        #let's now examine the possible candidates for similarity:
        pair = [0]
        for j in range(1, length):
            if matrix[0, j] != valueMax:
                continue
            #is there another item for which similarity with the candidate is higher ?
            beaten = any(matrix[i, j] > valueMax for i in range(1, length))
            if not beaten:
                pair.append(j)

        #find strings corresponding to matrix indexes
        pairList.append([remaining[index] for index in pair])

        #shrink matrix and list together, by position: removing titles by
        #value would also drop duplicate titles that are still in the matrix
        matrix = np.delete(matrix, pair, axis=0)
        matrix = np.delete(matrix, pair, axis=1)
        grouped = set(pair)
        remaining = [title for index, title in enumerate(remaining)
                     if index not in grouped]

    return pairList

In [86]:
# Small hand-made check of the clustering and of the name extraction.
liste=['hello you there', 'you are a nuts', 'you are hello', 'you there are a nuts', 'you there come here']
clust = groupLongSeq(liste, 2)
print(clust)
# One name per cluster: the longest run shared by its first two titles, or
# the title itself when the cluster holds a single one.
liste2 = [
    similarity(group[0], group[1])[1] if len(group) > 1 else group[0]
    for group in clust
]
print(liste2)

[['hello you there', 'you there come here'], ['you are a nuts', 'you there are a nuts'], ['you are hello']]
['you there ', 'are a nuts ', 'you are hello']


In [None]:
# Keep only the rows whose level-1 category is 'Fashion', extract one product
# name per cluster of titles, and save the names one per line.
fashion_df = train_df[train_df['category_lvl_1'] == 'Fashion']
listProd = extractProdNames(fashion_df)
exportList('fashion_product_names.txt', listProd)

In [None]:
#TODO: building the matrix of similarities is a problem (far too slow)!
#- rebuild the common-sequence check so that it is iterative instead of generating every sub-sequence
#- try parallelisation