In [1]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset,DataLoader
import torch.cuda
import math
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from skimage import io
from   torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
plt.ion()

# Data Loading and Preprocessing

In [2]:
#Load the product catalogue from the CSV file into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='data.csv')
data.head(n=5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,productId,title,description,imageUrlStr,mrp,sellingPrice,specialPrice,productUrl,categories,productBrand,...,keySpecsStr,detailedSpecsStr,specificationList,sellerName,sellerAverageRating,sellerNoOfRatings,sellerNoOfReviews,sleeve,neck,idealFor
0,SPWDS7UPDCMEG8WH,Clovia Women's Shapewear,,http://img.fkcdn.com/image/shapewear/8/w/h/sw0...,1199.0,959.0,959.0,http://dl.flipkart.com/dl/clovia-women-s-shape...,Apparels>Women>Lingerie & Sleepwear>Shapewears,Clovia,...,Type: Shapewear;Women's Shapewear;Solid Patter...,Type: Shapewear;Women's Shapewear;Solid Patter...,,Neha Kant,3.6,2123,58,,,
1,SPWDTSM9QHVDHTN6,Clovia Women's Shapewear,,http://img.fkcdn.com/image/shapewear/z/x/w/sw0...,999.0,649.0,616.0,http://dl.flipkart.com/dl/clovia-women-s-shape...,Apparels>Women>Lingerie & Sleepwear>Shapewears,Clovia,...,Type: Thigh Slimmer;Women's Shapewear;Solid Pa...,Type: Thigh Slimmer;Women's Shapewear;Solid Pa...,,Neha Kant,3.6,2123,58,,,
2,SPWDTSMAGMHDNENJ,Clovia Women's Shapewear,,http://img.fkcdn.com/image/shapewear/z/x/w/sw0...,999.0,649.0,616.0,http://dl.flipkart.com/dl/clovia-women-s-shape...,Apparels>Women>Lingerie & Sleepwear>Shapewears,Clovia,...,Type: Thigh Slimmer;Women's Shapewear;Solid Pa...,Type: Thigh Slimmer;Women's Shapewear;Solid Pa...,,Neha Kant,3.6,2123,58,,,
3,SPWDWDT6NQDAHETZ,Triumph Shape 04 PLY Women's Shapewear,,http://img.fkcdn.com/image/shapewear/e/t/z/401...,1899.0,1599.0,1599.0,http://dl.flipkart.com/dl/triumph-shape-04-ply...,Apparels>Women>Lingerie & Sleepwear>Shapewears,Triumph,...,Women's Shapewear;Checkered Pattern;Fabric: 20...,Women's Shapewear;Checkered Pattern;Fabric: 20...,,Satvinder Singh,4.1,245590,2573,,,
4,SPWE4FHG6HUMGBQH,Golden Girl Red Corset Women's Shapewear,,http://img.fkcdn.com/image/shapewear/b/q/h/pan...,1399.0,475.0,475.0,http://dl.flipkart.com/dl/golden-girl-red-cors...,Apparels>Women>Lingerie & Sleepwear>Shapewears,Golden Girl,...,Type: Shapewear;Women's Shapewear;Solid Patter...,Type: Shapewear;Women's Shapewear;Solid Patter...,,AMIT PRAKASH,3.9,379,10,Sleeveless,,


In [3]:
#Filtering "Tops" data
# X holds the index labels of every row whose 'categories' string contains the
# "Tops & Tunics>Tops" path. Missing categories (NaN) are treated as non-matching.
# A vectorized substring test replaces the row-by-row Python loop; regex=False makes
# it a literal substring match, the same test str.find() performed.
is_top = data['categories'].str.contains("Tops & Tunics>Tops", regex=False, na=False)
X = data.index[is_top.to_numpy()].tolist()

print(len(X))

346927


In [4]:
#Creating training data: the first N_TRAIN_ROWS filtered "Tops" rows train the autoencoder
# (the previous comment said 30000 images, but the slice has always taken 50000 rows)
N_TRAIN_ROWS = 50000
#First position in X that belongs to the held-out test rows
DEV_START = 300001
data_ = data.loc[X[:N_TRAIN_ROWS],:]
#Creating Test data
dev_data = data.loc[X[DEV_START:],:]

In [5]:
#Defining the custom dataset
class DuplicateProductsDataset(Dataset):
    """Dataset yielding one preprocessed product image per row of a DataFrame.

    Rows are addressed by position within the frame passed to the constructor,
    so the dataset works for any slice of the catalogue and does not depend on
    the notebook-level index list.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'imageUrlStr' column of ';'-separated image URLs.
    """

    def __init__(self,data):
        self.data = data
        #Convert the raw array to a PIL image so it can be resized to a common 200x100 size,
        #then to a tensor so convolutions can be applied
        self.transformations = transforms.Compose([transforms.ToPILImage(),transforms.Resize((200,100)),transforms.ToTensor()])

    def __len__(self):
        """Return the number of rows (products) in the wrapped frame."""
        return self.data.shape[0]

    @staticmethod
    def _pick_image_url(raw_urls):
        """Return the second ';'-separated URL, or the first when there is only one.

        The original slicing also selected the second entry, but dropped the last
        character of a string without ';' and produced an empty URL for a string
        with a single ';'.
        """
        parts = raw_urls.split(';')
        return parts[1] if len(parts) > 1 else parts[0]

    def __getitem__(self,idx):
        """Download and preprocess the image of the row at position ``idx``.

        Returns the transformed image tensor, or None when the row has no URL
        string or the image cannot be fetched/transformed (callers filter out
        None, e.g. in the collate function).
        """
        raw_urls = self.data['imageUrlStr'].iloc[idx]
        if not isinstance(raw_urls, str):
            #Missing value (NaN) in the URL column: nothing to download
            print('No image URL available for this product')
            return None

        img_url = self._pick_image_url(raw_urls)
        try:
            img = io.imread(img_url)
            return self.transformations(img)
        except Exception:
            #Best effort: a failed download or decode skips this sample instead of aborting
            print('HttPError encountered while retrieving image')
            return None

In [6]:
#Number of images per training batch
batch_size = 128

#Wrap the training rows in the custom image dataset
custom_dataset = DuplicateProductsDataset(data=data_)

def my_collate(batch):
    """Collate a batch after discarding the samples that are None."""
    usable = [sample for sample in batch if sample is not None]
    return torch.utils.data.dataloader.default_collate(usable)

#Shuffled batch iterator over the training dataset; my_collate drops the None samples
dataset_loader = DataLoader(
    custom_dataset,
    batch_size=batch_size,
    collate_fn=my_collate,
    shuffle=True,
)

# AutoEncoder

In [7]:
#Defining the autoencoder module
class Autoencoder(nn.Module):
    """Convolutional autoencoder with a 256-dimensional bottleneck.

    The hard-coded feature size 16*48*23 matches 3x200x100 inputs: two 5x5
    convolutions give 16x192x92, and 4x4 max pooling gives 16x48x23.
    """

    def __init__(self):
        super().__init__()
        flat_features = 16 * 48 * 23
        # Layers are created in a fixed order so parameter order and random
        # initialisation stay the same.
        self.en_cnn1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.en_cnn2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.max_pool = nn.MaxPool2d(kernel_size=4, return_indices=True)
        self.dec_cnn1 = nn.ConvTranspose2d(in_channels=16, out_channels=6, kernel_size=5)
        self.max_unpool = nn.MaxUnpool2d(kernel_size=4)
        self.dec_cnn2 = nn.ConvTranspose2d(in_channels=6, out_channels=3, kernel_size=5)
        self.en_linear1 = nn.Linear(in_features=flat_features, out_features=1024)
        self.dec_linear1 = nn.Linear(in_features=1024, out_features=flat_features)
        self.en_linear2 = nn.Linear(in_features=1024, out_features=256)
        self.dec_linear2 = nn.Linear(in_features=256, out_features=1024)

    def encoder(self, x):
        """Encode images via Conv2d->Conv2d->MaxPool->Linear->Linear.

        Returns the 256-dimensional codes together with the max-pool indices
        that the decoder needs for unpooling.
        """
        features = F.relu(self.en_cnn2(F.relu(self.en_cnn1(x))))
        pooled, pool_indices = self.max_pool(features)
        flat = pooled.view(-1, 16 * 48 * 23)
        latent = self.en_linear2(self.en_linear1(flat))
        return latent, pool_indices

    def decoder(self, x, i2):
        """Rebuild images from codes via Linear->Linear->MaxUnpool->ConvTranspose2d->ConvTranspose2d.

        ``i2`` are the pooling indices returned by ``encoder``.
        """
        expanded = self.dec_linear1(self.dec_linear2(x))
        maps = expanded.view(-1, 16, 48, 23)
        unpooled = self.max_unpool(maps, i2)
        hidden = F.relu(self.dec_cnn1(unpooled))
        return F.relu(self.dec_cnn2(hidden))

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        latent, pool_indices = self.encoder(x)
        return self.decoder(latent, pool_indices)

## Training of autoencoder 

An already trained model is saved in the "saved_model" file. Uncomment the following cell to load that model.

In [8]:
#autoencoder = torch.load("saved_model")

In [9]:
#Model to train, placed on the GPU
autoencoder = Autoencoder().cuda()
#Mean squared error between the original and the decoded images
distance = nn.MSELoss()
#Number of passes over the entire training set
num_epochs = 3
#Adam optimizer; weight_decay adds L2 regularization
optimizer = optim.Adam(params=autoencoder.parameters(), weight_decay=1e-5)

In [None]:
#Train the autoencoder model
for epoch in range(num_epochs):
    #Loss of the most recent batch; stays None if the loader yields nothing
    last_loss = None
    #enumerate supplies the step counter (previously an uninitialised global `i` was used)
    for step, img in enumerate(dataset_loader):
        print(step)
        #Tensors are autograd-aware themselves, so no Variable wrapper is needed
        img = img.cuda()
        # ===================forward=====================
        output = autoencoder(img)
        loss = distance(output,img)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        last_loss = loss.item()
    # ===================log========================
    if last_loss is not None:
        print('epoch [{}/{}], loss:{:.4f}'.format(epoch+1, num_epochs, last_loss))

Uncomment the following code if you freshly trained the model and wish to save it

In [None]:
#Uncomment in order to save a freshly trained model
#torch.save(autoencoder,"saved_model")

# KMeans Clustering 

In [None]:
#Creating a custom dataset for training the KMeans Clustering
kmeans_dataset = DuplicateProductsDataset(data.loc[X[0:10000],:])

#Creating encodings for all the images
# myArray collects one (1, 256) numpy array per image that could be fetched and encoded;
# images that fail are skipped, so its length can be smaller than the dataset.
myArray = []
#Inference only: no_grad avoids building an autograd graph for every image
with torch.no_grad():
    for i in range(len(kmeans_dataset)):
        if i % 500 == 0:
            print(i)
        try:
            img = kmeans_dataset[i]
            if img is None:
                #The dataset returns None when the image could not be retrieved
                print("HttpError")
                continue
            img_code,_ = autoencoder.encoder(img.unsqueeze(0).cuda())
            myArray.append(img_code.cpu().numpy())
        except Exception as exc:
            #Best effort: report and skip images that cannot be loaded or encoded
            print("Skipping image {}: {}".format(i, exc))


In [9]:
#Uncomment in order to save the encodings array
# np.save("Encodings",myArray)

#Uncomment in order to load the saved encodings array
#myArray = np.load("Encodings.npy")

In [10]:
#Using sklearn.cluster.KMeans to perform KMeans clustering on image encodings

from sklearn.cluster import KMeans

#Stack the per-image (1, 256) encodings into an (n_images, 256) matrix.
#reshape is used instead of np.squeeze(..., 1) so re-running this cell on the
#already flattened array does not raise.
myArray = np.asarray(myArray)
myArray = myArray.reshape(myArray.shape[0], -1)

kmeans = KMeans(n_clusters=200, random_state=0).fit(myArray)

In [None]:
#Assigning clusters to all the tops
#.copy() gives an independent frame, so adding the column does not write into a slice of `data`
Tops_data = data.loc[X[:10000],:].copy()

#Adding a new column for Cluster; -1 marks products whose image could not be encoded
Tops_data['Cluster'] = -1

Total_Dataset = DuplicateProductsDataset(Tops_data)

#Inference only: no_grad avoids building an autograd graph for every image
with torch.no_grad():
    for i in range(Tops_data.shape[0]):
        if i % 500 == 0:
            print(i)
        try:
            img = Total_Dataset[i]
            if img is None:
                #The dataset returns None when the image could not be retrieved
                print("HttpError")
                continue
            img_code,_ = autoencoder.encoder(img.unsqueeze(0).cuda())   #Generating Encodings for images
            img_code = img_code.cpu().numpy()                           #Converting torch.tensor to numpy.ndarray, shape (1, 256)
            cluster = kmeans.predict(img_code)                          #Predicting the cluster for a given product
            #Address the row by its own index label instead of going through the global X
            Tops_data.at[Tops_data.index[i],'Cluster'] = int(cluster[0])
        except Exception as exc:
            #Best effort: report and leave the cluster at -1 for images that cannot be processed
            print("Skipping image {}: {}".format(i, exc))

Tops_data_sorted = Tops_data.sort_values('Cluster',ascending = True)

In [13]:
# Inspect the clustered frame. A cell displays only its last expression, so the
# head() result is discarded and just the shape tuple is shown.
Tops_data.head()
Tops_data.shape

#Uncomment the following code to save the Tops dataframe along with their clusters
#Tops_data.to_pickle('Tops_data.pkl')

(10000, 33)

In [11]:
#Uncomment the following code to load the saved Tops data along with their clusters
#Tops_data = pd.read_pickle('Tops_data.pkl')

# Comparisons among products to identify duplicates

In [12]:
# Maps a productId to the list of productIds found to be its duplicates
dict_duplicates = dict()

In [None]:
# Find products in the same cluster whose preprocessed images are pixel-identical.
#
# Changes from the previous version of this cell:
#  * It referenced `Total_dataset`, which no earlier cell defines (the cluster cell
#    creates `Total_Dataset`); the NameError was swallowed by a bare except, so no
#    duplicate could ever be found on a fresh kernel. The dataset is now built here.
#  * It computed encodings for both images and never used them: the comparison was
#    on the image tensors themselves. That comparison is kept (an exact-equality
#    test) and the unused encoder calls are dropped.
#  * Every compared pair re-downloaded both images. Each image is now fetched once
#    per cluster and the pairs are compared in memory.
Total_dataset = DuplicateProductsDataset(Tops_data)
cluster_labels = Tops_data['Cluster'].tolist()
product_ids = Tops_data['productId'].tolist()

# Group row positions by cluster so only same-cluster products are compared
positions_by_cluster = {}
for pos, label in enumerate(cluster_labels):
    positions_by_cluster.setdefault(label, []).append(pos)

# matches[pos] -> ascending row positions whose image equals the image at pos
matches = {}
for label, positions in positions_by_cluster.items():
    if len(positions) < 2:
        continue

    # Fetch every image of this cluster exactly once; failures are skipped
    images = {}
    for pos in positions:
        try:
            img = Total_dataset[pos]
        except Exception as exc:
            print("Skipping image {}: {}".format(pos, exc))
            continue
        if img is not None:
            images[pos] = img

    fetched = sorted(images)
    for offset, first in enumerate(fetched):
        for second in fetched[offset + 1:]:
            # torch.equal is True only for same-shaped tensors with identical values,
            # i.e. the cases where the norm of the difference is exactly 0
            if torch.equal(images[first], images[second]):
                matches.setdefault(first, []).append(second)
                matches.setdefault(second, []).append(first)

# Record both directions of every match, walking rows in ascending position so
# keys and values are added in the order the nested i/j loops would produce
for pos in sorted(matches):
    for other in matches[pos]:
        key = product_ids[pos]
        value = product_ids[other]
        dict_duplicates.setdefault(key, []).append(value)

print(len(dict_duplicates))
        

In [36]:
import json

# Serialize the duplicates mapping to file1.json (UTF-8 encoded JSON text)
with open("file1.json", "w", encoding="utf-8", newline="") as json_file:
    json.dump(dict_duplicates, json_file)

In [28]:
# Dump the whole productId -> duplicate productIds mapping as plain text
print(dict_duplicates)

{'TOPE9ABBZU3HZRHN': ['TOPE9ABBBTJYDSQE'], 'TOPE9ABBBTJYDSQE': ['TOPE9ABBZU3HZRHN'], 'TOPE6ZCYHTJEMZMW': ['TOPE7GUGYEG7JKHJ', 'TOPE7GUGPZSJCHHX', 'TOPE7GUGY65MGPGJ'], 'TOPE6XZPXBP5APH9': ['TOPE6XZPRUAFWPBH'], 'TOPE6XZPRUAFWPBH': ['TOPE6XZPXBP5APH9'], 'TOPE7CD4ETPFHCDX': ['TOPE7CD4FZXYEY2F'], 'TOPE7CD4FZXYEY2F': ['TOPE7CD4ETPFHCDX'], 'TOPE8M6R2XZCZG8Z': ['TOPE8M6RMN7SBFVG'], 'TOPE8M6RMN7SBFVG': ['TOPE8M6R2XZCZG8Z'], 'TOPE7G33BKEXZZHT': ['TOPE7U8MTATAGP5P', 'TOPE7U8MSDMAP6FE', 'TOPE7U8MW5DPK2MV', 'TOPE7U8MDZ2MCHXH'], 'TOPEF2HAJZ5PAXY6': ['TOPEF2HA7XH6TEEZ'], 'TOPEF2HA7XH6TEEZ': ['TOPEF2HAJZ5PAXY6'], 'TOPE7U8MFBZXJZQX': ['TOPE7U8MEDJHMJDT'], 'TOPE7U8MEDJHMJDT': ['TOPE7U8MFBZXJZQX'], 'TOPE6KWBB7GFT8ET': ['TOPE6KWB7QASBYVW'], 'TOPE6KWB7QASBYVW': ['TOPE6KWBB7GFT8ET'], 'TOPE6T9FGHWGTSB8': ['TOPE6T9FCG4FQGTC', 'TOPE6T9FCG4QYSVZ'], 'TOPE6T9FCG4FQGTC': ['TOPE6T9FGHWGTSB8', 'TOPE6T9FCG4QYSVZ'], 'TOPE7G334D9WRSGP': ['TOPE7U8MN9PXFVG9', 'TOPE7U8M8DWXG8NY'], 'TOPE7U8MZFBHRHHP': ['TOPE7U8MGBCRV7QG', 