In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
#import plotly.express as px
from sklearn.linear_model import LinearRegression

In [2]:
import timeit

In [4]:
!pip install  pyjarowinkler

Collecting pyjarowinkler
  Downloading pyjarowinkler-1.8-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: pyjarowinkler
Successfully installed pyjarowinkler-1.8


In [3]:
from pyjarowinkler.distance import get_jaro_distance

## Data Prep Class
##### Generates Unique key for merging
##### Performs standard text preprocessing

In [4]:
class DataPrep():
  def __init__(self,params):
    self.params=params
    self.dataset_1=pd.read_excel(self.params["data_location_1"])
    self.dataset_2=pd.read_excel(self.params["data_location_2"])
    
  def gen_unique_key(self):
    self.dataset_1.reset_index(inplace=True)
    self.dataset_2.reset_index(inplace=True)
    
    self.dataset_1.rename(columns={"index": "unique_key"},inplace=True)
    self.dataset_2.rename(columns={"index": "unique_key"},inplace=True)

    self.dataset_1["merge_col"]=1
    self.dataset_2["merge_col"]=1


  def txt_preprocessing(self):
    for i,fp in enumerate([self.dataset_1,self.dataset_2]): 
      fp['product_name_modified']=fp[params["text_fields"][i]].str.lower()
      fp.product_name_modified=fp.product_name_modified.str.strip()
      fp.product_name_modified = fp.product_name_modified.astype(str)
      fp.product_name_modified=fp['product_name_modified'].str.replace('[^\w\s]', '')

  def return_datsests(self):
    return self.dataset_1[["merge_col","unique_key",'product_name_modified']],self.dataset_2[["merge_col","unique_key",'product_name_modified']]


In [5]:
params={}
params["data_location_1"]='amz_com-ecommerce_sample.xlsx'
params["data_location_2"]='flipkart_com-ecommerce_sample.xlsx'
params["text_fields"]=["product_name","product_name"]

In [6]:
ds=DataPrep(params)

In [7]:
ds.txt_preprocessing()



In [8]:
ds.gen_unique_key()

In [9]:
dataset_1,dataset_2=ds.return_datsests()

## Model Prep Class
##### Performs Merging of both Tables basis first x substrings
##### Uses Jaro Distance to find closest match for each entry

In [54]:
class ModelPrep():
    def __init__(self,model_params):
        self.model_params = model_params
        self.dataset_1=model_params["dataset_1"]
        self.dataset_2=model_params["dataset_2"]
        
        self.Final_dataset_1=model_params["Final_dataset_1"]
        self.Final_dataset_2=model_params["Final_dataset_1"]
        
    def Substr_Generated_Merge(self):
        
        
        self.dataset_1["prod_substr"] = self.dataset_1["product_name_modified"].str[0:self.model_params["substr"]]
        self.dataset_2["prod_substr"] = self.dataset_2["product_name_modified"].str[0:self.model_params["substr"]]
        
        self.merge_df_v1=pd.merge(self.dataset_1,self.dataset_2,on="prod_substr")
    
    def Key_Gen(self):
        self.merge_df_v1['res'] = self.merge_df_v1[["product_name_modified_x","product_name_modified_y"]].\
                                        apply(lambda x: get_jaro_distance(x["product_name_modified_x"], 
                                                                          x["product_name_modified_y"]),axis=1)
        
        self.merge_df_v1.sort_values(by=['unique_key_x', 'res'],ascending=[True,False],inplace=True)
        self.merge_df_v1.drop_duplicates(subset=['unique_key_x'], keep='first',inplace=True)
        
    def Final_Data_Generation(self):
        df_merge_1 = pd.merge(self.Final_dataset_1,self.merge_df_v1,
                              left_on="unique_key",right_on="unique_key_x",how="left")
        df_merge = pd.merge(df_merge_1,self.Final_dataset_2,
                            left_on="unique_key_y",right_on="unique_key",how="left")
        
        return df_merge
        

In [55]:
model_params={}
model_params["dataset_1"]=dataset_1
model_params["dataset_2"]=dataset_2
model_params["substr"]=6

In [56]:
model_params["Final_dataset_1"]=ds.dataset_1

In [57]:
model_params["Final_dataset_2"]=ds.dataset_2

In [58]:
Model_Jaro=ModelPrep(model_params)

In [64]:
Model_Jaro.Substr_Generated_Merge()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [66]:
Model_Jaro.merge_df_v1.shape

(871167, 7)

In [60]:
Model_Jaro.Key_Gen()

In [61]:
Final_Dataset=Model_Jaro.Final_Data_Generation()



In [62]:
Final_Dataset.to_csv("Final_Dataset.csv",index=False)

### End of Code

(871167, 7)

In [43]:
ds.dataset_1.head()

Unnamed: 0,unique_key,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications,product_name_modified,merge_col
0,0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,,SRTEH2FF9KEDEFGF,982,438,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1
1,1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32143,29121,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",fabhomedecor fabric double sofa bed,1
2,2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,991,551,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""...",aw bellies,1
3,3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,694,325,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1
4,4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,208,258,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...",sicons all purpose arnica dog shampoo,1


In [44]:
df_merge_1 = pd.merge(ds.dataset_1,merge_df_v2,left_on="unique_key",right_on="unique_key_x",how="left")

In [45]:
df_merge_1.shape

(20000, 26)

In [51]:
df_merge = pd.merge(df_merge_1,ds.dataset_2,left_on="unique_key_y",right_on="unique_key",how="left")

  """Entry point for launching an IPython kernel.


In [47]:
df_merge.head()

Unnamed: 0,unique_key_x,uniq_id_x,crawl_timestamp_x,product_url_x,product_name_x,product_category_tree_x,pid_x,retail_price_x,discounted_price_x,image_x,...,discounted_price_y,image_y,is_FK_Advantage_product_y,description_y,product_rating_y,overall_rating_y,brand_y,product_specifications_y,product_name_modified_y,merge_col_y
0,0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,,SRTEH2FF9KEDEFGF,982,438,"[""http://img5a.flixcart.com/image/short/u/4/a/...",...,438.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1.0
1,1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32143,29121,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",...,29121.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",fabhomedecor fabric double sofa bed,1.0
2,2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,991,551,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",...,551.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""...",aw bellies,1.0
3,3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,694,325,"[""http://img5a.flixcart.com/image/short/6/2/h/...",...,438.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1.0
4,4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,208,258,"[""http://img5a.flixcart.com/image/pet-shampoo/...",...,258.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...",sicons all purpose arnica dog shampoo,1.0


In [52]:
df_merge.product_name_x.nunique()

12734

In [53]:
df_merge.product_name_y.nunique()

12597

In [18]:
class ModelPrep():
    def __init__(self,model_params):
        self.model_params = model_params
        self.dataset_1=model_params["dataset_1"]
        self.dataset_2=model_params["dataset_2"]
    def Key_Gen(self):
        tic=timeit.default_timer()
        merge_df=pd.merge(self.dataset_1.loc[self.dataset_1.unique_key<self.model_params["hist"]],
                          self.dataset_2,on="merge_col")
        merge_df['res'] = merge_df[["product_name_modified_x","product_name_modified_y"]].\
                            apply(lambda x: get_jaro_distance(x["product_name_modified_x"], 
                                                              x["product_name_modified_y"]),axis=1)
        
        toc=timeit.default_timer()
        print("print initial bin time : {}".format(str(toc - tic)))
        
        merge_df.sort_values(by=['unique_key_x', 'res'],ascending=[True,False],inplace=True)
        merge_df.drop_duplicates(subset=['unique_key_x'], keep='first',inplace=True)
        self.dfs=merge_df
        hist=self.model_params["hist"]
        for i in range(hist,len(dataset_1.index),hist):
            merge_df=pd.merge(self.dataset_1.loc[(self.dataset_1.unique_key<i+hist)&(self.dataset_1.unique_key>=i)],
                                                                                  self.dataset_2,on="merge_col")
                              
            merge_df['res'] = merge_df[["product_name_modified_x","product_name_modified_y"]].\
                              apply(lambda x: get_jaro_distance(x["product_name_modified_x"], 
                                                                x["product_name_modified_y"]),axis=1)
            
            merge_df.sort_values(by=['unique_key_x', 'res'],ascending=[True,False],inplace=True)
            merge_df.drop_duplicates(subset=['unique_key_x'], keep='first',inplace=True)
            self.dfs=self.dfs.append(merge_df)
            
            print("print bin {} time : {}".format(i,str(toc - tic)))
            print("length of self.dfs : ",len(self.dfs.index))
        

In [19]:
model_params={}
model_params["dataset_1"]=dataset_1
model_params["dataset_2"]=dataset_2
model_params["substr"]=2

In [20]:
Model_Jaro=ModelPrep(model_params)

In [21]:
Model_Jaro.Key_Gen()

print initial bin time : 232.2078394
print bin 100 time : 232.2078394
length of self.dfs :  200
print bin 200 time : 232.2078394
length of self.dfs :  300
print bin 300 time : 232.2078394
length of self.dfs :  400
print bin 400 time : 232.2078394
length of self.dfs :  500
print bin 500 time : 232.2078394
length of self.dfs :  600
print bin 600 time : 232.2078394
length of self.dfs :  700
print bin 700 time : 232.2078394
length of self.dfs :  800
print bin 800 time : 232.2078394
length of self.dfs :  900
print bin 900 time : 232.2078394
length of self.dfs :  1000
print bin 1000 time : 232.2078394
length of self.dfs :  1100
print bin 1100 time : 232.2078394
length of self.dfs :  1200
print bin 1200 time : 232.2078394
length of self.dfs :  1300
print bin 1300 time : 232.2078394
length of self.dfs :  1400
print bin 1400 time : 232.2078394
length of self.dfs :  1500
print bin 1500 time : 232.2078394
length of self.dfs :  1600
print bin 1600 time : 232.2078394
length of self.dfs :  1700
prin

KeyboardInterrupt: 

In [17]:
Model_Jaro.dataset_1[(sel.dataset_1.unique_key<100+100)&(Model_Jaro.dataset_1.unique_key>=100)]

Unnamed: 0,merge_col,unique_key,product_name_modified
100,1,100,rorlig rr028 expedition analog watch for men...
101,1,101,catwalk boots
102,1,102,magnum footwear lifestyle
103,1,103,t star ufttsw005bkbr analog watch for boys
104,1,104,rialto boots
...,...,...,...
195,1,195,ridas apl_led_black apple shape digital watch ...
196,1,196,shuz touch boots
197,1,197,la briza andria boots
198,1,198,skmei 1070blk sports analogdigital watch for...


In [13]:
ds.dataset_1.head()

Unnamed: 0,unique_key,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications,product_name_modified,merge_col
0,0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,,SRTEH2FF9KEDEFGF,982,438,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1
1,1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32143,29121,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",fabhomedecor fabric double sofa bed,1
2,2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,991,551,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""...",aw bellies,1
3,3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,694,325,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1
4,4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,208,258,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...",sicons all purpose arnica dog shampoo,1


In [14]:
ds.dataset_1.shape

(20000, 18)

In [15]:
ds.dataset_1.unique_key.nunique()


20000

In [16]:
ds.dataset_2.head()

Unnamed: 0,unique_key,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications,product_name_modified,merge_col
0,0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1
1,1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",fabhomedecor fabric double sofa bed,1
2,2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""...",aw bellies,1
3,3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts,1
4,4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...",sicons all purpose arnica dog shampoo,1


In [17]:
 ds.dataset_2.shape

(20000, 18)

In [18]:
ds.dataset_2.unique_key.nunique()

20000

In [20]:
merge_df=pd.merge(dataset_1.loc[dataset_1.unique_key<100],dataset_2,on="merge_col")

In [21]:
merge_df.shape

(2000000, 5)

In [22]:
merge_df.head()

Unnamed: 0,merge_col,unique_key_x,product_name_modified_x,unique_key_y,product_name_modified_y
0,1,0,alisha solid womens cycling shorts,0,alisha solid womens cycling shorts
1,1,0,alisha solid womens cycling shorts,1,fabhomedecor fabric double sofa bed
2,1,0,alisha solid womens cycling shorts,2,aw bellies
3,1,0,alisha solid womens cycling shorts,3,alisha solid womens cycling shorts
4,1,0,alisha solid womens cycling shorts,4,sicons all purpose arnica dog shampoo


In [25]:
import timeit
tic=timeit.default_timer()
merge_df['res'] = merge_df[["product_name_modified_x","product_name_modified_y"]].apply(lambda x: get_jaro_distance(x["product_name_modified_x"], x["product_name_modified_y"]),axis=1)
toc=timeit.default_timer()

In [26]:
print(toc - tic)

238.47558109999997


In [27]:
merge_df.head()

Unnamed: 0,merge_col,unique_key_x,product_name_modified_x,unique_key_y,product_name_modified_y,res
0,1,0,alisha solid womens cycling shorts,0,alisha solid womens cycling shorts,1.0
1,1,0,alisha solid womens cycling shorts,1,fabhomedecor fabric double sofa bed,0.58
2,1,0,alisha solid womens cycling shorts,2,aw bellies,0.53
3,1,0,alisha solid womens cycling shorts,3,alisha solid womens cycling shorts,1.0
4,1,0,alisha solid womens cycling shorts,4,sicons all purpose arnica dog shampoo,0.67


In [32]:
import timeit
tic=timeit.default_timer()
merge_df=pd.merge(dataset_1.loc[dataset_1.unique_key<100],dataset_2,on="merge_col")
merge_df['res'] = merge_df[["product_name_modified_x","product_name_modified_y"]].apply(lambda x: get_jaro_distance(x["product_name_modified_x"], x["product_name_modified_y"]),axis=1)
toc=timeit.default_timer()

In [33]:
print(toc - tic)

234.8138439999998


In [36]:
merge_df.sort_values(by=['unique_key_x', 'res'],ascending=[True,False],inplace=True)

In [37]:
merge_df.head()

Unnamed: 0,merge_col,unique_key_x,product_name_modified_x,unique_key_y,product_name_modified_y,res
0,1,0,alisha solid womens cycling shorts,0,alisha solid womens cycling shorts,1.0
3,1,0,alisha solid womens cycling shorts,3,alisha solid womens cycling shorts,1.0
6,1,0,alisha solid womens cycling shorts,6,alisha solid womens cycling shorts,1.0
9,1,0,alisha solid womens cycling shorts,9,alisha solid womens cycling shorts,1.0
13,1,0,alisha solid womens cycling shorts,13,alisha solid womens cycling shorts,1.0


In [38]:
merge_df.drop_duplicates(subset=['unique_key_x'], keep='first',inplace=True)

In [39]:
merge_df.head()

Unnamed: 0,merge_col,unique_key_x,product_name_modified_x,unique_key_y,product_name_modified_y,res
0,1,0,alisha solid womens cycling shorts,0,alisha solid womens cycling shorts,1.0
20001,1,1,fabhomedecor fabric double sofa bed,1,fabhomedecor fabric double sofa bed,1.0
40002,1,2,aw bellies,2,aw bellies,1.0
60000,1,3,alisha solid womens cycling shorts,0,alisha solid womens cycling shorts,1.0
80004,1,4,sicons all purpose arnica dog shampoo,4,sicons all purpose arnica dog shampoo,1.0


In [41]:
merge_df.shape

(100, 6)

In [None]:
print("Initial ",toc - tic)

In [None]:
dfs=pd.DataFrameFrame()

In [30]:
hist=1000
for i in range(hist,len(dataset_2.index),hist):
    print("i : ",i)

i :  1000
i :  2000
i :  3000
i :  4000
i :  5000
i :  6000
i :  7000
i :  8000
i :  9000
i :  10000
i :  11000
i :  12000
i :  13000
i :  14000
i :  15000
i :  16000
i :  17000
i :  18000
i :  19000


In [28]:
merge_df.shape

(2000000, 6)

In [12]:
ama=pd.read_csv('/content/amz_com-ecommerce_sample.csv',encoding= 'unicode_escape')

ParserError: ignored

In [None]:
ama.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,982,438,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32143,29121,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,991,551,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,694,325,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,208,258,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."


In [None]:
ama['product_name_modified']=ama['product_name'].str.lower()

In [None]:
ama.product_name_modified=ama.product_name_modified.str.strip()

In [None]:
#ama['product_name_modified'].dtypes
ama.product_name_modified = ama.product_name_modified.astype(str)

In [None]:
ama.product_name_modified=ama['product_name_modified'].str.replace('[^\w\s]', '')

  """Entry point for launching an IPython kernel.


In [None]:
ama.product_name_modified=ama['product_name_modified'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [None]:
fp=pd.read_csv('/content/flipkart_com-ecommerce_sample.csv',encoding= 'unicode_escape')

In [None]:
fp['product_name_modified']=fp['product_name'].str.lower()
fp.product_name_modified=fp.product_name_modified.str.strip()
fp.product_name_modified = fp.product_name_modified.astype(str)
fp.product_name_modified=fp['product_name_modified'].str.replace('[^\w\s]', '')

  after removing the cwd from sys.path.


In [None]:
fp_v=fp[['uniq_id','product_name','retail_price','discounted_price']]

In [None]:
fp.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications,product_name_modified
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",fabhomedecor fabric double sofa bed
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""...",aw bellies
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",alisha solid womens cycling shorts
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...",sicons all purpose arnica dog shampoo


In [None]:
ama_v=ama[['uniq_id','product_name','retail_price','discounted_price']]

In [None]:
merge=fp_v.merge(ama_v,on='uniq_id',how='inner')

In [None]:
merge.head()

Unnamed: 0,uniq_id,product_name_x,retail_price_x,discounted_price_x,product_name_y,retail_price_y,discounted_price_y
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,999.0,379.0,Alisha Solid Women's Cycling Shorts,982,438
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,32157.0,22646.0,FabHomeDecor Fabric Double Sofa Bed,32143,29121
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,999.0,499.0,AW Bellies,991,551
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,699.0,267.0,Alisha Solid Women's Cycling Shorts,694,325
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,220.0,210.0,Sicons All Purpose Arnica Dog Shampoo,208,258


In [None]:
merge = merge.rename(columns={'product_name_x': 'Product Name in Flipkart', 'product_name_y': 'Product Name in Amazon',
                              'retail_price_x':'Retail Price in Flipkart','retail_price_y':'Retail Price in Amazon',
                              'discounted_price_x':'Discounted Price in Amazon', 'discounted_price_y':'Discounted Price in Flipkart'})

In [None]:
merge.head()

Unnamed: 0,uniq_id,Product Name in Flipkart,Retail Price in Flipkart,Discounted Price in Amazon,Product Name in Amazon,Retail Price in Amazon,Discounted Price in Flipkart
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,999.0,379.0,Alisha Solid Women's Cycling Shorts,982,438
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,32157.0,22646.0,FabHomeDecor Fabric Double Sofa Bed,32143,29121
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,999.0,499.0,AW Bellies,991,551
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,699.0,267.0,Alisha Solid Women's Cycling Shorts,694,325
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,220.0,210.0,Sicons All Purpose Arnica Dog Shampoo,208,258


In [None]:
from pyjarowinkler.distance import get_jaro_distance

In [None]:
!pip install  pyjarowinkler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyjarowinkler
  Downloading pyjarowinkler-1.8-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: pyjarowinkler
Successfully installed pyjarowinkler-1.8


In [None]:
merge['res'] = [get_jaro_distance(x, y) for x, y in zip(merge['product_name_x'], merge['product_name_y'])]

In [None]:
merge.head()

Unnamed: 0,uniq_id,product_name_x,retail_price_x,product_name_y,retail_price_y,res
0,c2d766ca982eca8304150849735ffef9,Alisha Solid Women's Cycling Shorts,999.0,Alisha Solid Women's Cycling Shorts,982,1.0
1,7f7036a6d550aaa89d34c77bd39a5e48,FabHomeDecor Fabric Double Sofa Bed,32157.0,FabHomeDecor Fabric Double Sofa Bed,32143,1.0
2,f449ec65dcbc041b6ae5e6a32717d01b,AW Bellies,999.0,AW Bellies,991,1.0
3,0973b37acd0c664e3de26e97e5571454,Alisha Solid Women's Cycling Shorts,699.0,Alisha Solid Women's Cycling Shorts,694,1.0
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Sicons All Purpose Arnica Dog Shampoo,220.0,Sicons All Purpose Arnica Dog Shampoo,208,1.0
