In [99]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzysearch import find_near_matches


In [100]:
train = pd.read_csv('ttrain.csv')
manufacturers = pd.read_csv("manufacturer_list.csv")
print(train.head())
print(manufacturers.head())

                                    article_name  \
0                    Chicken Burger Patty (250G)   
1              BRITANNIA DAILY FAMILY LONG BREAD   
2  POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g   
3                            FEVIBOND TUBE 20 ML   
4             KITKAT DESSERT DELIGHT TRUFFLE 50g   

                            manufacturer_description  Unnamed: 2  Unnamed: 3  \
0  Company Name - VENKYS INDIA LTD<br />Membershi...         NaN         NaN   
1  Company Name - BRITANNIA INDUSTRIES LTD<br />M...         NaN         NaN   
2  Company Name - FUTURE CONSUMER ENTERPRISE LIMI...         NaN         NaN   
3  Membership Discount - Applicable<br />Identifi...         NaN         NaN   
4  Company Name - NESTLE INDIA LTD<br />Membershi...         NaN         NaN   

   Unnamed: 4  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  
         brand_name
0     ORGANIC INDIA
1  head & shoulders
2          Sunfeast
3            Unibic
4         

In [101]:
train.manufacturer_description = train['manufacturer_description'].str.lower()

In [102]:
train["company_name"] = train.manufacturer_description.str.extract(r"company name - ([a-zA-Z-09\. \&]+)")

In [103]:
train.head()

Unnamed: 0,article_name,manufacturer_description,Unnamed: 2,Unnamed: 3,Unnamed: 4,company_name
0,Chicken Burger Patty (250G),company name - venkys india ltd<br />membershi...,,,,venkys india ltd
1,BRITANNIA DAILY FAMILY LONG BREAD,company name - britannia industries ltd<br />m...,,,,britannia industries ltd
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,company name - future consumer enterprise limi...,,,,future consumer enterprise limited
3,FEVIBOND TUBE 20 ML,membership discount - applicable<br />identifi...,,,,
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,company name - nestle india ltd<br />membershi...,,,,nestle india ltd


In [104]:
train['transformed'] = train['article_name'].str.lower()
train['transformed'] = train['transformed'].str.replace("'","")
train['transformed'] = train['transformed'].str.replace("(","")
train['transformed'] = train['transformed'].str.replace(")","")
train['transformed'] = train['transformed'].str.replace(")","")
train['transformed'] = train['transformed'].str.replace("/"," ")
train = train.filter(["article_name", "company_name", "transformed"])

In [105]:
train.head(10)

Unnamed: 0,article_name,company_name,transformed
0,Chicken Burger Patty (250G),venkys india ltd,chicken burger patty 250g
1,BRITANNIA DAILY FAMILY LONG BREAD,britannia industries ltd,britannia daily family long bread
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,future consumer enterprise limited,pooof potato kettle chips zesty masala pp 30g
3,FEVIBOND TUBE 20 ML,,fevibond tube 20 ml
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,nestle india ltd,kitkat dessert delight truffle 50g
5,BLUE BIRD SPAGHETTI PASTA 200G,blue bird foods india pvt ltd,blue bird spaghetti pasta 200g
6,NIRAPARA AVALOSE PODI 500GM,k k r food products,nirapara avalose podi 500gm
7,NIVEA ROLL ON FRESH NATURAL 50ml,,nivea roll on fresh natural 50ml
8,FIAMA DW SOAP EXOTIC DREAM BX 115g,itc india ltd,fiama dw soap exotic dream bx 115g
9,SANGIS KTCN PERI PERI MAYONNAISE BT 200g,future consumer ltd,sangis ktcn peri peri mayonnaise bt 200g


In [106]:
train['quantity'] = train['transformed'].str.extract(r'(\d.{0,4}(ml|g|kg|l|s|p))')[0]
#train['quantity'] = train['transformed'].str.extract(r'([0-9].*(ml|g|kg|l|s|p))')

In [97]:
train.head(50)

Unnamed: 0,article_name,company_name,transformed,quantity
0,Chicken Burger Patty (250G),venkys india ltd,chicken burger patty 250g,250g
1,BRITANNIA DAILY FAMILY LONG BREAD,britannia industries ltd,britannia daily family long bread,
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,future consumer enterprise limited,pooof potato kettle chips zesty masala pp 30g,30g
3,FEVIBOND TUBE 20 ML,,fevibond tube 20 ml,20 ml
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,nestle india ltd,kitkat dessert delight truffle 50g,50g
5,BLUE BIRD SPAGHETTI PASTA 200G,blue bird foods india pvt ltd,blue bird spaghetti pasta 200g,200g
6,NIRAPARA AVALOSE PODI 500GM,k k r food products,nirapara avalose podi 500gm,500g
7,NIVEA ROLL ON FRESH NATURAL 50ml,,nivea roll on fresh natural 50ml,50ml
8,FIAMA DW SOAP EXOTIC DREAM BX 115g,itc india ltd,fiama dw soap exotic dream bx 115g,115g
9,SANGIS KTCN PERI PERI MAYONNAISE BT 200g,future consumer ltd,sangis ktcn peri peri mayonnaise bt 200g,200g


In [206]:
train_small = train.head(100)
manufacturers['brand_name'] = manufacturers['brand_name'].str.lower()
manufacturers['brand_name'] = manufacturers['brand_name'].str.replace("'","")
manufacturers['brand_name'] = manufacturers['brand_name'].str.replace("(","")
manufacturers['brand_name'] = manufacturers['brand_name'].str.replace(")","")
manufacturers['brand_name'] = manufacturers['brand_name'].str.replace(")","")
manufacturers['brand_name'] = manufacturers['brand_name'].str.replace("/"," ")
manufacturers.head()


Unnamed: 0,brand_name
0,organic india
1,head & shoulders
2,sunfeast
3,unibic
4,cremica


In [207]:
def get_brand_name(article_name, choices):
    maxScore = 0
    maxChoice = ""
    for choice in choices:
        score = fuzz.token_set_ratio(" " + article_name + " ", " " + choice + " ")
        if score > maxScore or (score == maxScore and len(choice) > len(maxChoice)):
            maxScore = score
            maxChoice = choice
    return maxChoice, maxScore

def get_strict(article_name, choices):
    matches = []
    for choice in choices:
        choice1 = choice + " "
        choice2 = " " + choice + " "
        choice3 = " " + choice
        if article_name.startswith(choice1) or article_name.endswith(choice3) or choice2 in article_name:
            matches.append(choice)
    return matches
            

In [208]:
train_small["company_name"] = train_small["company_name"].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [209]:
train_small.head()

Unnamed: 0,article_name,company_name,transformed,quantity
0,Chicken Burger Patty (250G),venkys india ltd,chicken burger patty 250g,250g
1,BRITANNIA DAILY FAMILY LONG BREAD,britannia industries ltd,britannia daily family long bread,
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,future consumer enterprise limited,pooof potato kettle chips zesty masala pp 30g,30g
3,FEVIBOND TUBE 20 ML,,fevibond tube 20 ml,20 ml
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,nestle india ltd,kitkat dessert delight truffle 50g,50g


In [210]:
train_small['strict_transformed'] = train_small['transformed'].apply(lambda x: get_strict(x, manufacturers['brand_name']))
train_small['strict_company'] = train_small['company_name'].apply(lambda x: get_strict(x, manufacturers['brand_name']))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [211]:
train_small.head()


Unnamed: 0,article_name,company_name,transformed,quantity,strict_transformed,strict_company
0,Chicken Burger Patty (250G),venkys india ltd,chicken burger patty 250g,250g,[],"[venkys, venkys]"
1,BRITANNIA DAILY FAMILY LONG BREAD,britannia industries ltd,britannia daily family long bread,,[britannia],[britannia]
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,future consumer enterprise limited,pooof potato kettle chips zesty masala pp 30g,30g,[],[]
3,FEVIBOND TUBE 20 ML,,fevibond tube 20 ml,20 ml,[fevibond],[]
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,nestle india ltd,kitkat dessert delight truffle 50g,50g,[],[nestle]


In [212]:
train_small['fuzzy_transformed'] = train_small['transformed'].apply(lambda x: get_brand_name(x, manufacturers['brand_name']))
train_small['fuzzy_company'] = train_small['company_name'].apply(lambda x: get_brand_name(x, manufacturers['brand_name']))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [213]:
train_small.head()

Unnamed: 0,article_name,company_name,transformed,quantity,strict_transformed,strict_company,fuzzy_transformed,fuzzy_company
0,Chicken Burger Patty (250G),venkys india ltd,chicken burger patty 250g,250g,[],"[venkys, venkys]","(easy choice, 44)","(venkys, 100)"
1,BRITANNIA DAILY FAMILY LONG BREAD,britannia industries ltd,britannia daily family long bread,,[britannia],[britannia],"(britannia, 100)","(britannia, 100)"
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,future consumer enterprise limited,pooof potato kettle chips zesty masala pp 30g,30g,[],[],"(pooof!, 100)","(center fruit, 48)"
3,FEVIBOND TUBE 20 ML,,fevibond tube 20 ml,20 ml,[fevibond],[],"(fevibond, 100)","(emami fair and handsome, 0)"
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,nestle india ltd,kitkat dessert delight truffle 50g,50g,[],[nestle],"(scrub & bright, 35)","(nestle, 100)"


In [214]:
train_small = train_small.filter(['article_name','company_name','transformed', 'quantity', 'strict_transformed', 'strict_company', 'fuzzy_transformed', 'fuzzy_company']).join(train_small.fuzzy_transformed.apply(lambda loc: pd.Series(loc, index=['fuzzy_brand', 'brand_score']))).join(train_small.fuzzy_company.apply(lambda loc: pd.Series(loc, index=['f_company', 'company_score'])))


In [215]:
train_small

Unnamed: 0,article_name,company_name,transformed,quantity,strict_transformed,strict_company,fuzzy_transformed,fuzzy_company,fuzzy_brand,brand_score,f_company,company_score
0,Chicken Burger Patty (250G),venkys india ltd,chicken burger patty 250g,250g,[],"[venkys, venkys]","(easy choice, 44)","(venkys, 100)",easy choice,44,venkys,100
1,BRITANNIA DAILY FAMILY LONG BREAD,britannia industries ltd,britannia daily family long bread,,[britannia],[britannia],"(britannia, 100)","(britannia, 100)",britannia,100,britannia,100
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,future consumer enterprise limited,pooof potato kettle chips zesty masala pp 30g,30g,[],[],"(pooof!, 100)","(center fruit, 48)",pooof!,100,center fruit,48
3,FEVIBOND TUBE 20 ML,,fevibond tube 20 ml,20 ml,[fevibond],[],"(fevibond, 100)","(emami fair and handsome, 0)",fevibond,100,emami fair and handsome,0
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,nestle india ltd,kitkat dessert delight truffle 50g,50g,[],[nestle],"(scrub & bright, 35)","(nestle, 100)",scrub & bright,35,nestle,100
5,BLUE BIRD SPAGHETTI PASTA 200G,blue bird foods india pvt ltd,blue bird spaghetti pasta 200g,200g,[blue bird],[blue bird],"(blue bird, 100)","(blue bird, 100)",blue bird,100,blue bird,100
6,NIRAPARA AVALOSE PODI 500GM,k k r food products,nirapara avalose podi 500gm,500g,[nirapara],[],"(nirapara, 100)","(hr foods, 56)",nirapara,100,hr foods,56
7,NIVEA ROLL ON FRESH NATURAL 50ml,,nivea roll on fresh natural 50ml,50ml,[nivea],[],"(nivea, 100)","(emami fair and handsome, 0)",nivea,100,emami fair and handsome,0
8,FIAMA DW SOAP EXOTIC DREAM BX 115g,itc india ltd,fiama dw soap exotic dream bx 115g,115g,[],[],"(fiama di wills, 53)","(kitchens of india, 67)",fiama di wills,53,kitchens of india,67
9,SANGIS KTCN PERI PERI MAYONNAISE BT 200g,future consumer ltd,sangis ktcn peri peri mayonnaise bt 200g,200g,[],[],"(sangis kitchen, 60)","(center fruit, 52)",sangis kitchen,60,center fruit,52


In [216]:
del train_small['fuzzy_transformed']
del train_small['fuzzy_company']

In [217]:
train_small

Unnamed: 0,article_name,company_name,transformed,quantity,strict_transformed,strict_company,fuzzy_brand,brand_score,f_company,company_score
0,Chicken Burger Patty (250G),venkys india ltd,chicken burger patty 250g,250g,[],"[venkys, venkys]",easy choice,44,venkys,100
1,BRITANNIA DAILY FAMILY LONG BREAD,britannia industries ltd,britannia daily family long bread,,[britannia],[britannia],britannia,100,britannia,100
2,POOOF POTATO KETTLE CHIPS ZESTY MASALA PP 30g,future consumer enterprise limited,pooof potato kettle chips zesty masala pp 30g,30g,[],[],pooof!,100,center fruit,48
3,FEVIBOND TUBE 20 ML,,fevibond tube 20 ml,20 ml,[fevibond],[],fevibond,100,emami fair and handsome,0
4,KITKAT DESSERT DELIGHT TRUFFLE 50g,nestle india ltd,kitkat dessert delight truffle 50g,50g,[],[nestle],scrub & bright,35,nestle,100
5,BLUE BIRD SPAGHETTI PASTA 200G,blue bird foods india pvt ltd,blue bird spaghetti pasta 200g,200g,[blue bird],[blue bird],blue bird,100,blue bird,100
6,NIRAPARA AVALOSE PODI 500GM,k k r food products,nirapara avalose podi 500gm,500g,[nirapara],[],nirapara,100,hr foods,56
7,NIVEA ROLL ON FRESH NATURAL 50ml,,nivea roll on fresh natural 50ml,50ml,[nivea],[],nivea,100,emami fair and handsome,0
8,FIAMA DW SOAP EXOTIC DREAM BX 115g,itc india ltd,fiama dw soap exotic dream bx 115g,115g,[],[],fiama di wills,53,kitchens of india,67
9,SANGIS KTCN PERI PERI MAYONNAISE BT 200g,future consumer ltd,sangis ktcn peri peri mayonnaise bt 200g,200g,[],[],sangis kitchen,60,center fruit,52


In [224]:
train_small["final_brand"] = train_small.apply(lambda x: x.fuzzy_brand if x.brand_score > 50 else x.strict_company[0] if len(x.strict_company) > 0 else None, axis=1)

In [228]:
train_final = train_small.filter(["transformed", "quantity", "final_brand"])

In [229]:
train_final

Unnamed: 0,transformed,quantity,final_brand
0,chicken burger patty 250g,250g,venkys
1,britannia daily family long bread,,britannia
2,pooof potato kettle chips zesty masala pp 30g,30g,pooof!
3,fevibond tube 20 ml,20 ml,fevibond
4,kitkat dessert delight truffle 50g,50g,nestle
5,blue bird spaghetti pasta 200g,200g,blue bird
6,nirapara avalose podi 500gm,500g,nirapara
7,nivea roll on fresh natural 50ml,50ml,nivea
8,fiama dw soap exotic dream bx 115g,115g,fiama di wills
9,sangis ktcn peri peri mayonnaise bt 200g,200g,sangis kitchen


In [216]:
import difflib

In [236]:
def matches(large_string, query_string, threshold):
    words = large_string.split()
    for word in words:
        s = difflib.SequenceMatcher(None, word, query_string)
        match = ''.join(word[i:i+n] for i, j, n in s.get_matching_blocks() if n)
        if len(match) / float(len(query_string)) >= threshold:
            yield match

# def better_matches(large_string, query_string, threshold):
#     large_string_tokens = large_string.strip().split(" ")
#     query_string_tokens = query_string.strip().split(" ")
#     print(large_string_tokens)
#     print(query_string_tokens)
#     for large_token in large_string_tokens:
#         for query_token in query_string_tokens:
#             score = fuzz.token_set_ratio(query_token, large_token)
#             if score > threshold:
#                 large_string_tokens.remove()
#             print(query_token + " " + large_token + " " + str(score))

In [240]:
large_string = "kwalit wa ice crm tuti frty 700m 1+1 op"
query_string = " kwality walls "
print(list(matches(large_string, query_string, 0.3)))
# print(list(better_matches(large_string, query_string, 70)))

['kwalit']


In [219]:
def get_cleaned_article(large, query_string):
    match_list = matches(large, " " + query_string + " ", 0.3)
    for match in match_list:
        large = large.replace(match, "")
    return large.strip()
get_cleaned_article(large_string, query_string)

'wa ice crm tuti frty 700m 1+1 op'

In [301]:
train_final["filtered"] = train_final.apply(lambda x: x.transformed if x.final_brand is None else get_cleaned_article(x.transformed, x.final_brand), axis=1)

In [302]:
train_filtered = train_final.filter(['filtered', 'quantity', 'final_brand'])

In [303]:
train_filtered

Unnamed: 0,filtered,quantity,final_brand
0,chicken burger patty 250g,250g,venkys
1,daily family long bread,,britannia
2,potato kettle chips zesty masala pp 30g,30g,pooof!
3,tube 20 ml,20 ml,fevibond
4,kitkat dessert delight truffle 50g,50g,nestle
5,spaghetti pasta 200g,200g,blue bird
6,avalose podi 500gm,500g,nirapara
7,roll on fresh natural 50ml,50ml,nivea
8,dw soap exotic dream bx 115g,115g,fiama di wills
9,ktcn peri peri mayonnaise bt 200g,200g,sangis kitchen


In [304]:
train_filtered["quantity"] = train_filtered["quantity"].fillna("")

In [305]:
train_filtered

Unnamed: 0,filtered,quantity,final_brand
0,chicken burger patty 250g,250g,venkys
1,daily family long bread,,britannia
2,potato kettle chips zesty masala pp 30g,30g,pooof!
3,tube 20 ml,20 ml,fevibond
4,kitkat dessert delight truffle 50g,50g,nestle
5,spaghetti pasta 200g,200g,blue bird
6,avalose podi 500gm,500g,nirapara
7,roll on fresh natural 50ml,50ml,nivea
8,dw soap exotic dream bx 115g,115g,fiama di wills
9,ktcn peri peri mayonnaise bt 200g,200g,sangis kitchen


In [306]:
train_filtered["filtered"] = train_filtered.apply(lambda x: get_cleaned_article(x.filtered, str(x.quantity)), axis=1)

In [307]:
train_filtered

Unnamed: 0,filtered,quantity,final_brand
0,chicken burger patty,250g,venkys
1,daily family long bread,,britannia
2,potato kettle chips zesty masala pp,30g,pooof!
3,tube 20 ml,20 ml,fevibond
4,kitkat dessert delight truffle,50g,nestle
5,spaghetti pasta,200g,blue bird
6,avalose podi m,500g,nirapara
7,roll on fresh natural,50ml,nivea
8,dw soap exotic dream bx,115g,fiama di wills
9,ktcn peri peri mayonnaise bt,200g,sangis kitchen


In [308]:
train_filtered.to_csv("filtered.csv", index=False)

In [238]:
function_filtered = pd.read_csv("filtered_with_fn.csv")

In [239]:
function_filtered.head()

Unnamed: 0.1,Unnamed: 0,filtered,quantity,final_brand,class1,class2
0,0,chicken burger patty,250g,venkys,Ready To Eat,Branded Food
1,1,daily family long bread,,britannia,"Biscuits, Cookies & Crackers",Branded Food
2,2,potato kettle chips zesty masala pp,30g,pooof!,"Chips, Namkeens & Snacks",Branded Food
3,3,tube 20 ml,20 ml,fevibond,OTC,Beauty &Personal Care
4,4,kitkat dessert delight truffle,50g,nestle,"Chocolates, Cakes & Confectioneries",Branded Food


In [251]:
def clean_column(column, delimitter_list):
    column = column.str.lower()
    for delimitter in delimitter_list: 
        column = column.str.replace(delimitter," ")
    column = column.str.replace("  "," ")
    column = column.str.replace("  "," ")
    column = column.str.replace("  "," ")
    column = column.str.replace("  "," ")
    column = column.str.replace("  "," ")
    column = column.str.replace("  "," ")
    return column

#clean_column(function_filtered['class1'], ["'", "(", ")", "/", "&", ","])

In [253]:
function_filtered['class1'] = clean_column(function_filtered['class1'], ["'", "(", ")", "/", "&", ","])

In [255]:
function_filtered['class2'] = clean_column(function_filtered['class2'], ["'", "(", ")", "/", "&", ","])

In [259]:
function_filtered['appended_class'] = function_filtered.apply(lambda x: x.class1 + " " + x.class2, axis=1)

Unnamed: 0.1,Unnamed: 0,filtered,quantity,final_brand,class1,class2,appended_class
0,0,chicken burger patty,250g,venkys,ready to eat,branded food,ready to eat branded food
1,1,daily family long bread,,britannia,biscuits cookies crackers,branded food,biscuits cookies crackers branded food
2,2,potato kettle chips zesty masala pp,30g,pooof!,chips namkeens snacks,branded food,chips namkeens snacks branded food
3,3,tube 20 ml,20 ml,fevibond,otc,beauty personal care,otc beauty personal care
4,4,kitkat dessert delight truffle,50g,nestle,chocolates cakes confectioneries,branded food,chocolates cakes confectioneries branded food
5,5,spaghetti pasta,200g,blue bird,noodles pastas vermicelli,branded food,noodles pastas vermicelli branded food
6,6,avalose podi m,500g,nirapara,toys,toys stationery,toys toys stationery
7,7,roll on fresh natural,50ml,nivea,deos perfumes,beauty personal care,deos perfumes beauty personal care
8,8,dw soap exotic dream bx,115g,fiama di wills,skin care,beauty personal care,skin care beauty personal care
9,9,ktcn peri peri mayonnaise bt,200g,sangis kitchen,jams spreads honey,branded food,jams spreads honey branded food


In [267]:
def remove_string(large, queries):
    for query in queries.split(" "):
        large = large.replace(query, "")
    large = large.replace("  ", " ")
    large = large.replace("  ", " ")
    return large
remove_string("potato kettle chips zesty masala pp", "chips namkeens snacks branded food")

'potato kettle zesty masala pp'

In [262]:
??get_cleaned_article