# Preparation
This notebook prepares the dataset for description generation


1. Copy one or multiple excel files to the "dataset/raw" folder in the drive.
2. Make sure that the first row of each file contains the labels of the columns.
3. Make sure all the tags (values) are already translated to English
4. Make sure the sheet titles are consistent with the value defined in the read_csv function. (default: 'BatchImport')

This notebook is compatible with raw data from Griffati and Brandsdistribution catalogs. For other datasets and formats, minor changes may be needed.


In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# Every dataset path used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
!rm -r /content/drive/MyDrive/dataset/test
!rm -r /content/drive/MyDrive/dataset/ref
!rm -r /content/drive/MyDrive/dataset/gen
!mkdir /content/drive/MyDrive/dataset/test
!mkdir /content/drive/MyDrive/dataset/ref
!mkdir /content/drive/MyDrive/dataset/gen

In [3]:
import pandas as pd
from pandas import read_excel
from pathlib import Path
import numpy as np
import lxml.html
import string
import os
import re

In [4]:
def read_batch(read_dir, limit, random_state=None):
    """Read every file in ``read_dir`` and combine a sample of each into one frame.

    Each regular file directly inside ``read_dir`` is loaded (and cleaned) with
    ``read_csv``; sub-directories are skipped. At most ``limit`` rows are
    sampled from each file before the per-file frames are concatenated.

    Args:
        read_dir: Directory containing the raw catalog files.
        limit: Maximum number of rows to keep from each file.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` (the default) keeps the
            sampling non-deterministic.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``read_dir`` contains no files.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort so the files are
    # always processed (and concatenated) in the same order.
    for file_name in sorted(os.listdir(read_dir)):
        if not os.path.isfile(os.path.join(read_dir, file_name)):
            continue
        frames.append(read_csv(read_dir, file_name))
        print("read file #" + str(len(frames)))

    if not frames:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"no files found in {read_dir}")

    sampled = [
        frame.sample(min(limit, len(frame.index)), random_state=random_state)
        for frame in frames
    ]
    return pd.concat(sampled).reset_index(drop=True)


def read_csv(path, file_name, sheet_name='BatchImport'):
    """Load one sheet of an Excel workbook and return it cleaned.

    Despite its name, this function reads Excel files (via pandas'
    ``read_excel``), not CSV.

    Args:
        path: Directory that contains the workbook.
        file_name: Name of the workbook file inside ``path``.
        sheet_name: Title of the sheet to read. Defaults to ``'BatchImport'``.

    Returns:
        The DataFrame produced by ``clean`` for that sheet.
    """
    workbook_path = Path(path, file_name)
    # keep_default_na=False stops pandas from converting its default NA markers
    # to NaN; clean() identifies missing descriptions by comparing with ''.
    df = read_excel(workbook_path, sheet_name=sheet_name, keep_default_na=False)
    return clean(df)

In [5]:
def clean(df):
    """Reduce a raw catalog frame to the columns used for description generation.

    Steps:
      1. copy ``description-en`` into ``description_en`` and add a ``material``
         column,
      2. keep only the whitelisted columns (original column order preserved),
      3. drop rows whose description is empty or whitespace-only,
      4. strip HTML markup from the descriptions with ``erase_tags``,
      5. fill ``material`` through ``add_features``.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Raw DataFrame; must contain a ``description-en`` column.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If ``df`` has no ``description-en`` column.
    """
    # name and category were removed.
    to_keep = ["brand","code","madein","subcategory","season",
            "color","bicolors","gender","neckline","neck_shirt","sleeves","pattern","fastening","sole","pockets","description_en","dimensions","material"
            ,"neck","sleeve"]

    # Work on a copy so the caller's frame is not mutated.
    cleaned = df.copy()
    cleaned["material"] = ""
    cleaned["description_en"] = cleaned["description-en"]
    cleaned = cleaned.loc[:, [col for col in cleaned.columns if col in to_keep]]

    # Also drop whitespace-only descriptions: they carry no text and the HTML
    # parser used by erase_tags is expected to reject such input.
    has_text = cleaned["description_en"].astype(str).str.strip() != ""
    cleaned = cleaned.loc[has_text].copy()

    cleaned["description_en"] = cleaned["description_en"].apply(erase_tags)
    return add_features(cleaned)

def erase_tags(st):
    """Return the text content of the HTML fragment ``st`` with tidy spacing.

    After the markup is removed, a space is inserted between a word character
    and a directly following capital letter (words that were only separated by
    tags end up glued together), and every run of whitespace is collapsed to a
    single space.
    """
    text = lxml.html.fromstring(st).text_content()
    substitutions = (
        (r"(\w)([A-Z])", r"\1 \2"),  # split words glued together
        (r"\s+", " "),               # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def separate_words(st):
    """Insert a space between a word character and a following ASCII capital.

    Matches do not overlap, so inside a run of capitals only every other
    boundary is split (e.g. "USA" becomes "U SA").
    """
    glued_capital = re.compile(r"(\w)([A-Z])")
    return glued_capital.sub(r"\1 \2", st)

If more features are required to be added, use https://github.com/niyoushanajmaei/product_description_process/blob/main/extract.py for relevant functions.

In [6]:
def add_features(df, drop_without_material=False):
    """Fill the ``material`` column from each row's English description.

    Every description is lower-cased, stripped of punctuation and scanned with
    ``add_material``; the resulting list is stored in ``material`` as its
    string form (e.g. ``"['cotton', 'silk']"`` or ``"[]"``).

    Args:
        df: Frame with a ``description_en`` column of strings. Its ``material``
            column is (over)written in place.
        drop_without_material: When True, rows in which no material was found
            are removed from the returned frame. Defaults to False, which keeps
            every row.

    Returns:
        The frame with ``material`` filled (and, if requested, without the
        material-less rows).
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    materials = []
    to_delete = []
    for index, description in df["description_en"].items():
        # A version of the description, all lower case, all punctuation removed.
        # (No word separation is applied here: it only acts on capital letters,
        # and none are left after lower-casing.)
        desc_procs = description.lower().translate(punctuation_table)

        material = add_material(desc_procs)
        materials.append(str(material))
        # `material` is a list; test for emptiness directly rather than
        # comparing it with the string "[]" (which is never equal to a list).
        if not material:
            to_delete.append(index)

    df["material"] = materials

    deleted = 0
    if drop_without_material:
        df = df.drop(index=to_delete)
        deleted = len(to_delete)
    print(f"deleted {deleted} rows. remaining: {len(df.index)}")
    return df

def add_material(desc):
    """Return the known materials mentioned in ``desc``.

    Matching is a plain, case-sensitive substring test, so ``desc`` should be
    lower-cased beforehand and a material also matches inside longer words
    (e.g. "lace" is found in "laces").

    Args:
        desc: Description text to scan.

    Returns:
        A list of the matching material names, in the order of the lookup table.
    """
    all_materials = ("canvas","cashmere","chenille","chiffon","cotton","crêpe","crepe","damask","georgette","gingham","jersey",
                    "lace","leather","linen","wool","modal","muslin","organza","polyester","satin","silk","spandex","suede","taffeta",
                    "toile","tweed","twill","velvet","viscose","synthetic materials")
    return [m for m in all_materials if m in desc]

In [7]:
def clean_txt(st):
    """Remove all double quotes, square brackets and single quotes from ``st``
    and trim leading/trailing whitespace."""
    unwanted_chars = str.maketrans('', '', '"[]\'')
    return st.translate(unwanted_chars).strip()

In [8]:
def write(df, write_dir):
    """Write one text file per product into ``<write_dir>/ref`` and ``<write_dir>/test``.

    For every row, blank values (empty / whitespace-only strings, 'nan',
    'null' and '[]') are left out. The remaining fields are written twice with
    ``write_dict``: once with the description, as a reference ("n"), and once
    without it, as the test input ("t"). Files are named ``product<i>.txt``,
    with ``i`` counting rows from 0.

    Args:
        df: Cleaned product frame.
        write_dir: Root output directory, with or without a trailing slash.
    """
    test_dir = os.path.join(write_dir, "test")
    ref_dir = os.path.join(write_dir, "ref")
    # Make sure the target folders exist so the function also works on a fresh
    # output directory.
    os.makedirs(test_dir, exist_ok=True)
    os.makedirs(ref_dir, exist_ok=True)

    blank_markers = {"nan", "null", "[]"}
    for c, row in enumerate(df.to_dict('index').values()):
        value = {k: v for k, v in row.items()
                 if str(v).strip() != '' and str(v) not in blank_markers}
        file_name = "product" + str(c) + ".txt"
        # write_dict may consume entries of the dict it receives, so each call
        # gets its own copy.
        write_dict(dict(value), os.path.join(ref_dir, file_name), "n")
        write_dict(dict(value), os.path.join(test_dir, file_name), "t")
    print("writing successful")

# Writes one product file. Quotes and square brackets are stripped from the
# header by clean_txt, which also trims the trailing space after "description:".
# when type is "n" for normal (reference):
#   code: <code> \n features: {tag1: value1, tag2: value2, ...} \n description:<description_en> \n ### \n
# when type is "t" for test:
#   features: {tag1: value1, tag2: value2, ...} \n description:
def write_dict(dict, path, type):
    """Write the fields of one product to ``path``.

    Args:
        dict: Mapping of field name to value. ``description_en`` and ``code``
            are taken out of the feature list; the mapping itself is not
            modified.
        path: Destination file; overwritten if it exists.
        type: ``"n"`` writes code, features and description followed by a
            ``###`` separator; any other value writes only the features and an
            open ``description:`` prompt with no trailing newline.
    """
    features = {k: v for k, v in dict.items() if k not in ("description_en", "code")}
    desc = dict.get("description_en")
    code = dict.get("code")

    txt = ""
    if type == "n":
        txt += f"code: {code}\n"
    txt += f"features: {str(features)} \ndescription: "
    txt = clean_txt(txt)

    # Explicit encoding: values may contain non-ASCII text (e.g. "crêpe").
    with open(path, 'w', encoding="utf-8") as f:
        if type == "n":
            # A row can reach this point without a description (it is filtered
            # out upstream when blank); write an empty one instead of failing
            # on None + str.
            txt += ("" if desc is None else str(desc)) + "\n###\n"
            print(txt, file=f)
        else:
            print(txt, file=f, end='')


In [9]:
# Make an empty checkpoint file used in the generation notebook.
def ckpt_file(dir):
    """Create an empty ``checkpoint.txt`` inside ``dir``.

    An existing checkpoint file is truncated.

    Args:
        dir: Target directory, with or without a trailing slash.
    """
    # os.path.join rather than string concatenation: "dir" + "checkpoint.txt"
    # would land next to the directory when the trailing slash is missing.
    with open(os.path.join(dir, "checkpoint.txt"), "w"):
        pass

Change the limit of products chosen from each excel file if needed

In [10]:
!rm -r /content/drive/MyDrive/dataset/ref/
!rm -r /content/drive/MyDrive/dataset/test/
!mkdir /content/drive/MyDrive/dataset/ref/
!mkdir /content/drive/MyDrive/dataset/test/

In [11]:
# Maximum number of rows sampled from each raw file (see read_batch).
limit = 1000
# Folder holding the raw Excel files to read.
read_dir = "/content/drive/MyDrive/dataset/raw/"
# Root dataset folder: write() fills its "test/" and "ref/" subfolders and
# ckpt_file() creates checkpoint.txt in it.
write_dir = "/content/drive/MyDrive/dataset/"
# Read, clean and sample the raw files, then write the per-product text files.
write(read_batch(read_dir,limit),write_dir)
# Create the empty checkpoint file used by the generation notebook.
ckpt_file(write_dir)

deleted 0 rows. remaining: 4
read file #1
writing successful
