#### Package imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

#### Load data

In [63]:
# Load the cleaned training set from its pickle file and preview the first rows.
# NOTE(review): unpickling executes arbitrary code -- only load pickles produced by this project.
train_pickle_path = './data/x_train_cleaned.pkl'
xtrain = pd.read_pickle(train_pickle_path)
xtrain.head()

Unnamed: 0,designation,description,productid,imageid,prdtypecode,image_name,text
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10,image_1263597046_product_3804725264.jpg,olivia personalisiertes notizbuch seiten punkt...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280,image_1008141237_product_436067568.jpg,journal arts art marche salon art asiatique pa...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50,image_938777978_product_201115110.jpg,grand stylet ergonomique bleu gamepad nintendo...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280,image_457047496_product_50418756.jpg,peluche donald europe disneyland marionnette d...
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705,image_1077757786_product_278535884.jpg,guerre tuquesluc grandeur veut organiser jeu g...


L’exploration visuelle des images et l’utilisation des WordCloud par classe nous ont permis d’identifier les catégories de produits suivantes :

In [71]:
# Product type codes and the human-readable category assigned to each one.
# Each code is written next to its label so the two can never drift out of
# alignment (which two separate parallel lists would allow silently).
CODE_LABEL_PAIRS = [
    (50, "video games accessories"),
    (2705, "books"),
    (2522, "stationery"),
    (2582, "furniture kitchen and garden"),
    (1560, "interior furniture and bedding"),
    (1281, "board games"),
    (1920, "interior accessories"),
    (1280, "toys for children"),
    (1140, "figurines and Toy Pop"),
    (1300, "remote controlled models"),
    (2060, "decoration interior"),
    (2583, "piscine spa"),
    (60, "games and consoles"),
    (1320, "early childhood"),
    (2280, "magazines"),
    (1302, "toys, outdoor playing, clothes"),
    (2220, "supplies for domestic animals"),
    (40, "imported video games"),
    (2905, "online distribution of video games"),
    (2585, "gardening and DIY"),
    (1940, "Food"),
    (1160, "playing cards"),
    (1301, "accessories children"),
    (10, "adult books"),
    (1180, "figurines, masks and role playing games"),
    (2403, "children books and magazines"),
    (2462, "games"),
]

# Column-oriented form of the same data, used to build the reference table.
dict_prdtypecode = {
    "prdtypecode": [code for code, _ in CODE_LABEL_PAIRS],
    "Label": [label for _, label in CODE_LABEL_PAIRS],
}

# Reference table: one row per product type code.
df_class = pd.DataFrame(data=dict_prdtypecode)

# Lookup dictionary: prdtypecode -> label.
dict_code_label = dict(CODE_LABEL_PAIRS)

# A duplicated code would silently overwrite an earlier label in the dict.
assert len(dict_code_label) == len(CODE_LABEL_PAIRS), "duplicate prdtypecode in CODE_LABEL_PAIRS"

dict_code_label

{50: 'video games accessories',
 2705: 'books',
 2522: 'stationery',
 2582: 'furniture kitchen and garden',
 1560: 'interior furniture and bedding',
 1281: 'board games',
 1920: 'interior accessories',
 1280: 'toys for children',
 1140: 'figurines and Toy Pop',
 1300: 'remote controlled models',
 2060: 'decoration interior',
 2583: 'piscine spa',
 60: 'games and consoles',
 1320: 'early childhood',
 2280: 'magazines',
 1302: 'toys, outdoor playing, clothes',
 2220: 'supplies for domestic animals',
 40: 'imported video games',
 2905: 'online distribution of video games',
 2585: 'gardening and DIY',
 1940: 'Food',
 1160: 'playing cards',
 1301: 'accessories children',
 10: 'adult books',
 1180: 'figurines, masks and role playing games',
 2403: 'children books and magazines',
 2462: 'games'}

In [72]:
from tabulate import tabulate

def custom_print(dframe):
    """Print `dframe` to stdout as a psql-style text table.

    Column names are used as headers and the index is not shown.
    """
    rendered = tabulate(dframe, headers='keys', tablefmt='psql', showindex=False)
    print(rendered)

In [73]:
# Order the code/label reference table by ascending product type code and print it.
df_class_labels = df_class.sort_values(by='prdtypecode')
custom_print(df_class_labels)

+---------------+-----------------------------------------+
|   prdtypecode | Label                                   |
|---------------+-----------------------------------------|
|            10 | adult books                             |
|            40 | imported video games                    |
|            50 | video games accessories                 |
|            60 | games and consoles                      |
|          1140 | figurines and Toy Pop                   |
|          1160 | playing cards                           |
|          1180 | figurines, masks and role playing games |
|          1280 | toys for children                       |
|          1281 | board games                             |
|          1300 | remote controlled models                |
|          1301 | accessories children                    |
|          1302 | toys, outdoor playing, clothes          |
|          1320 | early childhood                         |
|          1560 | interior furniture and

#### Add Label column to data

In [78]:
# Attach the human-readable category label to every training row.
#
# Labels are looked up from the original product type codes. `prdtypecode` is
# re-encoded to 0..26 later in this notebook, so when the backup column
# `prdtypecode_org` already exists it is used instead; this keeps the cell
# correct when it is re-run after the re-encoding.
label_source = "prdtypecode_org" if "prdtypecode_org" in xtrain.columns else "prdtypecode"
labels = xtrain[label_source].map(dict_code_label)

# `map` yields NaN for codes missing from the dictionary: fail loudly rather
# than leave unlabeled rows in the training data.
unlabeled_codes = sorted(xtrain.loc[labels.isna(), label_source].unique())
if unlabeled_codes:
    raise ValueError(
        "No label defined for value(s) {} in column '{}'".format(unlabeled_codes, label_source)
    )

xtrain['Label'] = labels
xtrain.head()

Unnamed: 0,designation,description,productid,imageid,prdtypecode,image_name,text,Label
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10,image_1263597046_product_3804725264.jpg,olivia personalisiertes notizbuch seiten punkt...,adult books
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280,image_1008141237_product_436067568.jpg,journal arts art marche salon art asiatique pa...,magazines
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50,image_938777978_product_201115110.jpg,grand stylet ergonomique bleu gamepad nintendo...,video games accessories
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280,image_457047496_product_50418756.jpg,peluche donald europe disneyland marionnette d...,toys for children
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705,image_1077757786_product_278535884.jpg,guerre tuquesluc grandeur veut organiser jeu g...,books


In [84]:
# Count the training rows for each (product type code, label) pair and print the table.
rows_per_class = xtrain.groupby(['prdtypecode', 'Label'])['Label'].count()
df = rows_per_class.reset_index(name="Total")
custom_print(df)

+---------------+-----------------------------------------+---------+
|   prdtypecode | Label                                   |   Total |
|---------------+-----------------------------------------+---------|
|             0 | adult books                             |    3116 |
|             1 | imported video games                    |    2508 |
|             2 | video games accessories                 |    1681 |
|             3 | games and consoles                      |     832 |
|             4 | figurines and Toy Pop                   |    2671 |
|             5 | playing cards                           |    3953 |
|             6 | figurines, masks and role playing games |     764 |
|             7 | toys for children                       |    4870 |
|             8 | board games                             |    2070 |
|             9 | remote controlled models                |    5045 |
|            10 | accessories children                    |     807 |
|            11 | to

#### Remap the 27 product codes to 0–26 (mandatory for training the image deep-learning models)

In [80]:
# Keep a copy of the original product type codes before `prdtypecode` is
# re-encoded. The guard makes this cell safe to re-run: without it, a second
# execution after the re-encoding would overwrite the backup with the encoded
# 0..26 values and the original codes would be lost.
if "prdtypecode_org" not in xtrain.columns:
    xtrain["prdtypecode_org"] = xtrain["prdtypecode"]

In [81]:
# Re-encode the 27 original product type codes as contiguous class indices
# 0..26, assigned in ascending order of the original code.
#
# The mapping is applied in one pass with `Series.map`, reading from the
# backup column `prdtypecode_org` when it exists. An in-place `replace` on
# `prdtypecode` is not idempotent because some encoded indices (e.g. 10) are
# also original codes: re-running it would rewrite the already-encoded index
# 10 to 0 and silently merge two classes.
ORIGINAL_PRDTYPECODES = [
    10, 40, 50, 60, 1140, 1160, 1180, 1280, 1281, 1300,
    1301, 1302, 1320, 1560, 1920, 1940, 2060, 2220, 2280,
    2403, 2462, 2522, 2582, 2583, 2585, 2705, 2905,
]
PRDTYPECODE_TO_INDEX = {code: index for index, code in enumerate(ORIGINAL_PRDTYPECODES)}

code_source = "prdtypecode_org" if "prdtypecode_org" in xtrain.columns else "prdtypecode"
encoded_codes = xtrain[code_source].map(PRDTYPECODE_TO_INDEX)

# `map` yields NaN for any value that is not one of the 27 known codes
# (including already-encoded values): stop instead of corrupting the target.
unknown_codes = sorted(xtrain.loc[encoded_codes.isna(), code_source].unique())
if unknown_codes:
    raise ValueError(
        "Cannot encode value(s) {} found in column '{}'".format(unknown_codes, code_source)
    )

# All values matched, so the result can be stored as plain integers.
xtrain["prdtypecode"] = encoded_codes.astype("int64")

In [85]:
# Cross-check the re-encoding: row counts per (original code, encoded code, label) triple.
rows_per_encoded_class = xtrain.groupby(['prdtypecode_org', 'prdtypecode', 'Label'])['Label'].count()
df = rows_per_encoded_class.reset_index(name="Total")
custom_print(df)

+-------------------+---------------+-----------------------------------------+---------+
|   prdtypecode_org |   prdtypecode | Label                                   |   Total |
|-------------------+---------------+-----------------------------------------+---------|
|                10 |             0 | adult books                             |    3116 |
|                40 |             1 | imported video games                    |    2508 |
|                50 |             2 | video games accessories                 |    1681 |
|                60 |             3 | games and consoles                      |     832 |
|              1140 |             4 | figurines and Toy Pop                   |    2671 |
|              1160 |             5 | playing cards                           |    3953 |
|              1180 |             6 | figurines, masks and role playing games |     764 |
|              1280 |             7 | toys for children                       |    4870 |
|         

#### Save final DataFrame for later use

In [86]:
# Persist the labeled, re-encoded training frame in two formats:
# a pickle (keeps dtypes) and a CSV (written with pandas' default options,
# so the index is included as the first column).
final_pickle_path = './data/df_train_final.pkl'
final_csv_path = './data/df_train_final.csv'
xtrain.to_pickle(final_pickle_path)
xtrain.to_csv(final_csv_path)