# Import packages

In [None]:
# General packages
import urllib.request
import os
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Self defined methods
from src.data.data_preprocessing import *
from src.label.make_dataset import *
from src.label.product_graph import *


# Parse and clean original Amazon datasets

The original datasets are in JSON format. They have to be cleaned up and parsed into dataframes. The code used in this project to parse the original data is based on the Colab notebook provided by the author of the datasets: https://colab.research.google.com/drive/1Zv6MARGQcrBbLHyjPVVMZVnRWsRnVMpV

Before running the following code you have to download the original datasets, which can be found on this page: http://deepyeti.ucsd.edu/jianmo/amazon/index.html, and save them in the OriginalData folder. You can download any category of your choice.

In [None]:
def make_dir_if_not_exists(name):
    """Create the directory ``name`` (including missing parent folders) unless the path already exists.

    Parameters:
    name (str): Path of the directory that should exist afterwards

    """
    if os.path.exists(name):
        # Nothing to do, the path is already there.
        return
    os.makedirs(name)

In [None]:
# Aliases for the original metadata file names (given without the ".json.gz" extension).
# These are the categories used for this project; replace them with the categories of your choice.
datasets = dict(
    art='meta_Arts_Crafts_and_Sewing',
    pet='meta_Pet_Supplies',
    home='meta_Home_and_Kitchen',
    garden='meta_Patio_Lawn_and_Garden',
    sport='meta_Sports_and_Outdoors',
    toy='meta_Toys_and_Games',
    tool='meta_Tools_and_Home_Improvement',
)

In [None]:
# Read the downloaded original files, parse them into pandas dataframes, then clean up the data.
# The cleaned data is saved into the FilteredData folder for later use.
# This process can take some time to finish.
output = "FilteredData"
make_dir_if_not_exists(output)
for file_stem in datasets.values():
    # parse_dataset / parse_column come from the project's src package (star-imported above).
    df = parse_column(parse_dataset(f'OriginalData/{file_stem}.json.gz'))
    # Build the target path from `output` so that the folder created above and the folder
    # written to cannot drift apart when `output` is changed.
    df.to_pickle(f'{output}/{file_stem}_df_rm_duplicate_clean_similar_item.pickle')

In [None]:
# Example of filtered data: load the cleaned "Arts, Crafts and Sewing" pickle from the FilteredData folder
# and display its first rows. This requires that category to have been parsed in the cell above.
df=pd.read_pickle("FilteredData/meta_Arts_Crafts_and_Sewing_df_rm_duplicate_clean_similar_item.pickle")
df.head()

# Generate pairs of similar and not similar products and triplets for training

## Define extra functions

In [None]:
def get_image(ID, df, image_dir="Images"):
    """Download the image that belongs to the given product ID (ASIN) and save it as ``<image_dir>/<ID>.jpg``.

    Parameters:
    ID (str): ASIN of the product whose image should be downloaded
    df (pandas.DataFrame): cleaned dataframe of one product category; the columns ``asin`` and ``image`` are read
    image_dir (str): folder the image is written to (created if missing). It defaults to "Images", the folder
        this notebook creates and scans for already downloaded images.

    Raises:
    IndexError: if no row matches ``ID`` or the matching product has an empty image list
    urllib.error.URLError: if the image cannot be downloaded

    """
    # Only the first image URL of the first row matching the ASIN is used.
    url = df[df.asin == ID]['image'].values[0][0]
    os.makedirs(image_dir, exist_ok=True)
    # Read the whole payload before opening the target file, so that a failed download
    # does not leave an empty .jpg behind; the context manager closes the connection.
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    with open(os.path.join(image_dir, f'{ID}.jpg'), 'wb') as handler:
        handler.write(payload)

In [None]:
def get_matches(num, connected_components):
    """Return pairs of similar products. Two products are considered similar if they are directly
    connected by an edge inside one connected component.

    The images of both products are downloaded with ``get_image`` (using the module-level dataframe ``df``)
    unless their ASIN is already recorded in the module-level dict ``loaded_image_ID``. Pairs for which an
    image cannot be downloaded are skipped.

    Parameters:
    num (int): Number of matches that should be generated
    connected_components (nested list): Generated connected components from the product graph; after
        ``flatten`` every element must expose an ``edges`` attribute

    Returns:
    Dict<(str, str),int>: Dictionary whose keys are tuples of 2 ASINs that represent 2 similar products and whose
    values are the constant integer 1. It contains fewer than ``num`` entries if there are not enough
    edges with downloadable images.

    """
    matches = {}
    # Flatten the nested list of connected components and collect all edges in one list.
    # Only products with a direct connection are drawn.
    edges = [edge for component in flatten(connected_components) for edge in component.edges]
    if num <= 0 or not edges:
        return matches
    # Visit the edges in random order, each one at most once. Unlike drawing with replacement
    # until `num` pairs exist, this always terminates, even if `num` exceeds the usable edges.
    for position in np.random.permutation(len(edges)):
        m1, m2 = edges[position]
        images_ready = True
        for asin in (m1, m2):
            if asin in loaded_image_ID:
                continue
            try:
                get_image(asin, df)
            except Exception:
                # Best effort: skip this pair, but let KeyboardInterrupt / SystemExit through.
                print(asin, ' not loaded')
                images_ready = False
                break
            # Record the ASIN only after the download succeeded.
            loaded_image_ID[asin] = 1
        if not images_ready:
            continue
        matches[(m1, m2)] = 1
        # Stop as soon as the desired number of pairs is reached.
        if len(matches) >= num:
            break
    return matches

In [None]:
def get_mismatches(num, connected_components, max_attempts=None):
    """Return pairs of not similar products. Two products are considered not similar if they belong to
    two different connected components of the same subcategory.

    The images of both products are downloaded with ``get_image`` (using the module-level dataframe ``df``)
    unless their ASIN is already recorded in the module-level dict ``loaded_image_ID``. Pairs for which an
    image cannot be downloaded are skipped.

    Parameters:
    num (int): Number of mismatches that should be generated
    connected_components (nested list): Generated connected components from the product graph; every inner
        element must expose a ``nodes`` attribute
    max_attempts (int or None): Optional upper bound on the number of random draws. With None (default) the
        function keeps drawing until ``num`` distinct pairs are found, which never ends if fewer than
        ``num`` distinct pairs with downloadable images exist.

    Returns:
    Dict<(str, str),int>: Dictionary whose keys are tuples of 2 ASINs that represent 2 not similar products and
    whose values are the constant integer 1.

    """
    mismatches = {}
    # Keep only non-empty components, then only subcategories that still have at least two of them,
    # so that the random draws below never sample from an empty sequence.
    sub_cats = []
    for sub_cat in connected_components:
        node_lists = [nodes for nodes in (list(cc.nodes) for cc in sub_cat) if nodes]
        if len(node_lists) > 1:
            sub_cats.append(node_lists)
    if not sub_cats:
        return mismatches
    attempts = 0
    while len(mismatches) < num:
        if max_attempts is not None and attempts >= max_attempts:
            break
        attempts += 1
        sub_cat = sub_cats[np.random.choice(len(sub_cats))]
        # Choose 2 random products that belong to 2 different connected components in the same subcategory
        first, second = np.random.choice(len(sub_cat), size=2, replace=False)
        component_1, component_2 = sub_cat[first], sub_cat[second]
        node_1 = component_1[np.random.choice(len(component_1))]
        node_2 = component_2[np.random.choice(len(component_2))]
        # Download their images
        images_ready = True
        for asin in (node_1, node_2):
            if asin in loaded_image_ID:
                continue
            try:
                # get_image needs the dataframe to look up the image URL.
                get_image(asin, df)
            except Exception:
                # Best effort: skip this pair, but let KeyboardInterrupt / SystemExit through.
                print(asin, ' not loaded')
                images_ready = False
                break
            # Record the ASIN only after the download succeeded, so that a failed image
            # is not treated as available on a later draw.
            loaded_image_ID[asin] = 1
        if images_ready:
            mismatches[(node_1, node_2)] = 1
    return mismatches

In [None]:
def get_image_triplet(num, connected_components, max_attempts=None):
    """Return a list of product triplets (anchor, positive and negative).

    Anchor and positive are the two ends of a random edge of one connected component; the negative is the
    first node of a random edge of a different connected component in the same subcategory.

    The images of all three products are downloaded with ``get_image`` (using the module-level dataframe
    ``df``) unless their ASIN is already recorded in the module-level dict ``loaded_image_ID``. Triplets for
    which an image cannot be downloaded are skipped.

    Parameters:
    num (int): Number of triplets that should be generated
    connected_components (nested list): Generated connected components from the product graph; every inner
        element must expose an ``edges`` attribute
    max_attempts (int or None): Optional upper bound on the number of random draws. With None (default) the
        function keeps drawing until ``num`` distinct triplets are found, which never ends if fewer than
        ``num`` distinct triplets with downloadable images exist.

    Returns:
    list[(str, str, str)]: List of distinct triplets.

    """
    triplets = {}
    # Keep only components that have at least one edge, then only subcategories that still have
    # at least two of them, so that the random draws below never sample from an empty sequence.
    sub_cats = []
    for sub_cat in connected_components:
        edge_lists = [edges for edges in (list(cc.edges) for cc in sub_cat) if edges]
        if len(edge_lists) > 1:
            sub_cats.append(edge_lists)
    if not sub_cats:
        return []
    attempts = 0
    while len(triplets) < num:
        if max_attempts is not None and attempts >= max_attempts:
            break
        attempts += 1
        sub_cat = sub_cats[np.random.choice(len(sub_cats))]
        # Pick two different connected components of the same subcategory and one edge from each.
        first, second = np.random.choice(len(sub_cat), size=2, replace=False)
        component_1, component_2 = sub_cat[first], sub_cat[second]
        edge_1 = component_1[np.random.choice(len(component_1))]
        edge_2 = component_2[np.random.choice(len(component_2))]
        node_1, node_2, node_3 = edge_1[0], edge_1[1], edge_2[0]
        # Download their images
        images_ready = True
        for asin in (node_1, node_2, node_3):
            if asin in loaded_image_ID:
                continue
            try:
                # get_image needs the dataframe to look up the image URL.
                get_image(asin, df)
            except Exception:
                # Best effort: skip this triplet, but let KeyboardInterrupt / SystemExit through.
                print(asin, ' not loaded')
                images_ready = False
                break
            # Record the ASIN only after the download succeeded, so that a failed image
            # is not treated as available on a later draw.
            loaded_image_ID[asin] = 1
        if images_ready:
            triplets[(node_1, node_2, node_3)] = 1
    return list(triplets.keys())

## Generate training datasets

In [None]:
# Folder that is scanned below for already downloaded product images; create it if it is missing.
output_img = "Images"
make_dir_if_not_exists(output_img)

In [None]:
# loaded_image_ID is a module-level dict whose keys are the ASINs of all images that are already downloaded.
# It is pre-filled from the image folder so that existing images are not downloaded again.
# The ".jpg" extension is stripped because the sampling functions look products up by their bare ASIN;
# files with another extension are ignored.
loaded_image_ID = {
    os.path.splitext(filename)[0]: 1
    for filename in os.listdir(output_img)
    if filename.endswith('.jpg')
}
print(len(loaded_image_ID))

In [None]:
# Output locations for the generated training data: one for the data of all categories combined
# and one for the per-category data. Both folders are created if they are missing.
output_all_cat = "TrainingData/AllCategories/"
output_single_cat = "TrainingData/SingleCategory/"
make_dir_if_not_exists(output_all_cat)
make_dir_if_not_exists(output_single_cat)

In [None]:
# Accumulators for the data of all categories; they are written out after the loop.
serialized_dataset = []
img_triplets = []
for ds, file_stem in datasets.items():
    print(file_stem)
    # Read parsed dataset. The name `df` must be kept: get_matches reads it as a module-level variable.
    df = pd.read_pickle(f'FilteredData/{file_stem}_df_rm_duplicate_clean_similar_item.pickle')
    # Remove products with missing relevant information
    df_cleaned = clean_dataframe(df, ["title", "feature", "description", "category", "similar_item", "image"])
    # Make connected components based on the original dataframe, then remove the nodes that are not in the cleaned one.
    connected_components = df_component(df, df_cleaned)
    # Generate pairs of similar products. You can specify the number you wish to have.
    matches = get_matches(12, connected_components)
    print(len(matches), ' pairs of similar products.')
    mismatches = get_mismatches(30, connected_components)
    print(len(mismatches), ' pairs of not similar products.')
    # Convert the product attributes into one string
    data = serialize(df_cleaned, matches, mismatches)
    serialized_dataset.extend(data)
    # Generate product triplets
    triplets = get_image_triplet(20, connected_components)
    img_triplets.extend(triplets)
    # Write training data for this single category only: `data` / `triplets`, not the cumulative lists,
    # which would leak the previously processed categories into this category's files.
    # NOTE(review): the original code used an undefined name `ds_` here (NameError); the category alias
    # `ds` is assumed to be the intended path suffix - confirm against write_training_data.
    write_training_data(f'{output_single_cat}{ds}', data)
    write_triplets_data(f'{output_single_cat}{ds}', triplets)
    

In [None]:
# Mix data of all categories then write in one single training dataset
write_training_data(output_all_cat, serialized_dataset)
write_triplets_data(output_all_cat, img_triplets)