## Importing libraries and loading the data

In [1]:
import numpy as np
import pandas as pd

In [2]:
fkt_df = pd.read_csv('flipkart_com-ecommerce_sample.csv', encoding="ISO-8859-1")
amz_df = pd.read_csv('amz_com-ecommerce_sample.csv', encoding="ISO-8859-1")

## Data cleaning and preprocessing with flipkart dataset

In [3]:
len(fkt_df)

20000

In [4]:
fkt_df.isna().sum()

uniq_id                       0
crawl_timestamp               0
product_url                   0
product_name                  0
product_category_tree         0
pid                           0
retail_price                 78
discounted_price             78
image                         3
is_FK_Advantage_product       0
description                   2
product_rating                0
overall_rating                0
brand                      5864
product_specifications       14
dtype: int64

In [5]:
# Removing missing values
fkt_df = fkt_df.dropna(subset=["retail_price","discounted_price","image","description","product_specifications","brand"])
fkt_df.reset_index(drop=True, inplace=True)

In [6]:
len(fkt_df)

14058

In [7]:
# Splitting category into seperate tags
import re
for i in range(len(fkt_df)):
    x = fkt_df.loc[i, "product_category_tree"].split(">>")
    tags = [re.sub("[^a-zA-Z]+", " ", i).strip() for i in x]
    fkt_df.loc[i, "category_1"] = tags[0]
    try:
        fkt_df.loc[i, "category_2"] = tags[1]
    except:
        fkt_df.loc[i, "category_2"] = " "
    try:
        fkt_df.loc[i, "category_3"] = tags[2:]
    except:
        fkt_df.loc[i, "category_3"] = " "

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [8]:
#Dropping irrelevant columns
fkt_df.drop(["product_category_tree","uniq_id","crawl_timestamp","product_url","pid","is_FK_Advantage_product","overall_rating","product_rating"],axis=1, inplace=True)

In [9]:
# Sorting the dataset by category
fkt_df.sort_values(by=["category_1","category_2","category_3"], inplace=True)

In [10]:
fkt_df.head()

Unnamed: 0,product_name,retail_price,discounted_price,image,description,brand,product_specifications,category_1,category_2,category_3
10261,"ABEEZ Boys, Men, Girls",399.0,289.0,"[""http://img5a.flixcart.com/image/wrist-band/z...","Key Features of ABEEZ Boys, Men, Girls Wrist b...",ABEEZ,"{""product_specification""=>[{""key""=>""Material"",...",ABEEZ Boys Men Girls Black Pack of,,
607,ANAND ARCHIES Girls Flats,499.0,499.0,"[""http://img5a.flixcart.com/image/sandal/y/y/f...",Specifications of ANAND ARCHIES Girls Flats Ge...,ANAND ARCHIES,"{""product_specification""=>[{""key""=>""Ideal For""...",ANAND ARCHIES Girls Flats,,
13073,ANAND ARCHIES Girls Flats,499.0,399.0,"[""http://img6a.flixcart.com/image/sandal/k/h/f...",Specifications of ANAND ARCHIES Girls Flats Ge...,ANAND ARCHIES,"{""product_specification""=>[{""key""=>""Ideal For""...",ANAND ARCHIES Girls Flats,,
608,ANAND ARCHIES Girls Wedges,899.0,899.0,"[""http://img5a.flixcart.com/image/sandal/8/g/j...",Specifications of ANAND ARCHIES Girls Wedges G...,ANAND ARCHIES,"{""product_specification""=>[{""key""=>""Ideal For""...",ANAND ARCHIES Girls Wedges,,
4203,ANASAZI Casual 3/4 Sleeve Solid Women's Top,899.0,899.0,"[""http://img5a.flixcart.com/image/top/x/z/j/21...",Specifications of ANASAZI Casual 3/4 Sleeve So...,ANASAZI,"{""product_specification""=>[{""key""=>""Sleeve"", ""...",ANASAZI Casual Sleeve Solid Women s Top,,


## Data cleaning and preprocessing with amazon dataset




In [11]:
len(amz_df)

20000

In [12]:
amz_df.isna().sum()

uniq_id                       0
crawl_timestamp               0
product_url                   0
product_name                  0
product_category_tree         0
pid                           0
retail_price                  0
discounted_price              0
image                         3
is_FK_Advantage_product       0
description                   2
product_rating                0
overall_rating                0
brand                      5864
product_specifications       14
dtype: int64

In [13]:
# Removing missing values
amz_df = amz_df.dropna(subset=["image","description","product_specifications","brand"])
amz_df.reset_index(drop=True, inplace=True)

In [14]:
len(amz_df)

14121

In [15]:
# Splitting category into seperate tags
for i in range(len(amz_df)):
    x = amz_df.loc[i, "product_category_tree"].split(">>")
    tags = [re.sub("[^a-zA-Z]+", " ", i).strip() for i in x]
    amz_df.loc[i, "category_1"] = tags[0]
    try:
        amz_df.loc[i, "category_2"] = tags[1]
    except:
        amz_df.loc[i, "category_2"] = " "
    try:
        amz_df.loc[i, "category_3"] = tags[2:]
    except:
        amz_df.loc[i, "category_3"] = " "

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [16]:
#Dropping irrelevant columns
amz_df.drop(["product_category_tree","uniq_id","crawl_timestamp","product_url","pid","is_FK_Advantage_product","overall_rating","product_rating"],axis=1,inplace=True)

In [17]:
# Sorting the dataset by category
amz_df.sort_values(by = ["category_1","category_2","category_3"], inplace=True)

In [18]:
amz_df.head()

Unnamed: 0,product_name,retail_price,discounted_price,image,description,brand,product_specifications,category_1,category_2,category_3
10319,"ABEEZ Boys, Men, Girls",382,348,"[""http://img5a.flixcart.com/image/wrist-band/z...","Key Features of ABEEZ Boys, Men, Girls Wrist b...",ABEEZ,"{""product_specification""=>[{""key""=>""Material"",...",ABEEZ Boys Men Girls Black Pack of,,
609,ANAND ARCHIES Girls Flats,491,641,"[""http://img5a.flixcart.com/image/sandal/y/y/f...",Specifications of ANAND ARCHIES Girls Flats Ge...,ANAND ARCHIES,"{""product_specification""=>[{""key""=>""Ideal For""...",ANAND ARCHIES Girls Flats,,
13136,ANAND ARCHIES GIRLS FLATS,497,475,"[""http://img6a.flixcart.com/image/sandal/k/h/f...",Specifications of ANAND ARCHIES Girls Flats Ge...,ANAND ARCHIES,"{""product_specification""=>[{""key""=>""Ideal For""...",ANAND ARCHIES Girls Flats,,
610,ANAND ARCHIES Girls Wedges,888,1094,"[""http://img5a.flixcart.com/image/sandal/8/g/j...",Specifications of ANAND ARCHIES Girls Wedges G...,ANAND ARCHIES,"{""product_specification""=>[{""key""=>""Ideal For""...",ANAND ARCHIES Girls Wedges,,
4210,ANASAZI Casual 3/4 Sleeve Solid Women's Top,882,1026,"[""http://img5a.flixcart.com/image/top/x/z/j/21...",Specifications of ANASAZI Casual 3/4 Sleeve So...,ANASAZI,"{""product_specification""=>[{""key""=>""Sleeve"", ""...",ANASAZI Casual Sleeve Solid Women s Top,,


In [27]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Product matching using Word Embeddings and Cosine Similarity

In [26]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

In [22]:
# word embeddings
embeddings = model.encode(amz_df["description"])

In [23]:
# Final dataframe
final_df = pd.DataFrame(columns=["Product name in Flipkart","Retail Price in Flipkart","Discounted price in Flipkart",
                          "Product name in Amazon","Retail Price in Amazon","Discounted price in Amazon"])

In [24]:
# Index of rows
x = 0

for i in range(len(fkt_df)):
    try:
        # store the brand name of the product from Flipkart
        tag = fkt_df.loc[i, "brand"]

        # Get all the products from Amazon that match with brand name
        amz = amz_df[amz_df["brand"] == tag]

        # Store the product category of the product from Flipkart
        tag = fkt_df.loc[i, "category_1"]

        # Get all the products from Amazon that match the category
        amz = amz_df[amz_df["category_1"] == tag]

        # Store the index of matched products
        index = [i for i in amz.index]

        # Find the embeddings of matched products
        embedding = embeddings[index]
        amz.reset_index(drop=True, inplace=True)

        # Store the description of the product from Flipkart
        description = fkt_df.loc[i, "description"]
        embedding = np.vstack((model.encode(description), embedding))

        # Calculate the cosine similarity of the product with Amazon products
        cs = cosine_similarity([embedding[0]], embedding[1:])
        cs = cs[0]
        c = np.argmax(cs)

        # Store the product details of Flipkart
        fkt_name = fkt_df.loc[i, "product_name"]
        fkt_retail = fkt_df.loc[i, "retail_price"]
        fkt_discounted = fkt_df.loc[i, "discounted_price"]

        # Store the product details of Amazon
        amz_name = amz.loc[c, "product_name"]
        amz_retail = amz.loc[c, "retail_price"]
        amz_discounted = amz.loc[c, "discounted_price"]

        final_df.loc[x] = [fkt_name, fkt_retail, fkt_discounted, amz_name, amz_retail, amz_discounted]

        x += 1
    except:
        pass

In [25]:
final_df.head()

Unnamed: 0,Product name in Flipkart,Retail Price in Flipkart,Discounted price in Flipkart,Product name in Amazon,Retail Price in Amazon,Discounted price in Amazon
0,Alisha Solid Women's Cycling Shorts,999.0,379.0,Alisha Solid Women's Cycling Shorts,982,438
1,FabHomeDecor Fabric Double Sofa Bed,32157.0,22646.0,FabHomeDecor Fabric Double Sofa Bed,32143,29121
2,AW Bellies,999.0,499.0,AW Bellies,991,551
3,Alisha Solid Women's Cycling Shorts,699.0,267.0,Alisha Solid Women's Cycling Shorts,694,325
4,Sicons All Purpose Arnica Dog Shampoo,220.0,210.0,Sicons All Purpose Arnica Dog Shampoo,208,258
...,...,...,...,...,...,...
14053,WallDesign Small Vinyl Sticker,1500.0,730.0,WALLDESIGN S VINYL STICKER,1495,934
14054,Wallmantra Large Vinyl Stickers Sticker,1429.0,1143.0,WALLMANTRA LARGE VINYL STICKERS STICKER,1422,1484
14055,Elite Collection Medium Acrylic Sticker,1299.0,999.0,ELITE COLLECTION M ACRYLIC STICKER,1279,1254
14056,Elite Collection Medium Acrylic Sticker,1499.0,1199.0,ELITE COLLECTION M ACRYLIC STICKER,1588,1471
