In [1]:
import pandas as pd
import numpy as np
import unidecode
import itertools

from collections import Counter
from operator import itemgetter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from joblib import dump, load
from sklearn.preprocessing import MultiLabelBinarizer
from os import listdir

# Cleaning the Data
** I have found a problem with three lines in the file that I have solved manually

In [3]:
dtype = {'price': float}

data = pd.read_csv('catalog-data-corrected.zip', sep=';', dtype=dtype).set_index('sku')
data.head()

Unnamed: 0_level_0,price,name,brand,type,categories,product_image_url
sku,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I1700-67500-2750,64.83,Bota Timberland Trail Dust Marrom,Timberland,sportsshoes,"Botas,Botas Outdoor,Calçados,Calçados Feminino...",http://static.dafiti.com.br/424/1-zoom.jpg
Y3750-48500-2750,50.42,Tênis Timberland Gorge C2 Verde,Timberland,sportsshoes,"Aventura,Black Friday 2014,Calçados,Calçados F...",http://static.dafiti.com.br/524/1-zoom.jpg
I7600-48500-2750,27.9,Tênis Timberland Gorge C2 Cinza,Timberland,sportsshoes,"48 Horas Tenis,Aventura,Calçados,Calçados Femi...",http://static.dafiti.com.br/624/1-zoom.jpg
B7020-31600-3060,21.78,Tênis ME8 Mid Preto e Vermelho,And 1,sportsshoes,"Calçados,Calçados Masculinos,Esportes,Tênis",http://static.dafiti.com.br/544/1-zoom.jpg
Y3120-21600-3060,72.34,Tênis Pearl Mid Branco e Azul,And 1,sportsshoes,"Calçados,Calçados Masculinos,Esportes,Tênis",http://static.dafiti.com.br/644/1-zoom.jpg


### Functions for cleaning the category names
I have decided to take out the accents from the category names, so I have a higher match rate, as well as take out the _R$_ symbol, use the integer part of the price and lowercase.

In [3]:
def clean_out(s):
    """removes the strings in take_out from the given string"""
    take_out = {
        ',90',
        '.90',
        'r$'
    }
    
    for w in take_out:
        s = s.replace(w, '')
    return s


def clean_cat_name(cat_name):
    """removes accents, lowercase it and trims the spaces from the given category name"""
    return clean_out(unidecode.unidecode(cat_name).strip().lower())


def get_cat_list(cat_string):
    """returns a list with the clean category names for a given categories string"""
    return [clean_cat_name(cat_name) for cat_name in cat_string.split(',')]

def flatten_2d_list(l):
    return list(itertools.chain.from_iterable(l))


In [4]:
categories = (pd.read_table('categories.txt', encoding='utf8')
                .reset_index()
                .rename(columns={'index': 'cat_id'})
                .reindex(columns=['name', 'cat_id'])
                .assign(name=lambda df: df.name.apply(clean_cat_name)))

categories.head()

Unnamed: 0,name,cat_id
0,mega liquida tenis e sapatenis app,0
1,2 bermudas por 129,1
2,2 bermudas por 149,2
3,2 bermudas por 169,3
4,2 bermudas por 169,4


### Category name match rate

Now, I want to find out which categories are in the dataset but not in the categories.txt file, so we might consider adding them based on the number of times they appear.

In [5]:
# get a two dimension list with all the category names found
all_categories_found = data.categories.apply(get_cat_list).values.tolist()

# flatten it to a one dimension list with all the occurrences of the names
flat_all_categories_found = flatten_2d_list(all_categories_found)

# create a set so we have the unique occurrences
set_all_categories_found = set(flat_all_categories_found)

# see what categories where not found
categories_not_found = set_all_categories_found - set(categories.name.values)

# count how many occurrences we have for each category
occurrences_by_cat = Counter(flat_all_categories_found)

# filter out the categories not found in categories.txt
not_found_occurrences = {k: v for k, v in occurrences_by_cat.items() if k in categories_not_found}

top_cat_names = [cat[0] for cat in sorted(occurrences_by_cat.items(), key=itemgetter(1), reverse=True)]

In [6]:
# n of categories in categories.txt
n_categories = len(categories)
n_categories

1673

In [7]:
#n of categories found
n_cat_found = len(set_all_categories_found)
n_cat_found

1719

In [8]:
# n of cataegories not in the categories.txt file
n_cat_not_found = len(categories_not_found)
n_cat_not_found

98

In [9]:
n_cat_not_found / n_cat_found

0.05700988947062245

In [10]:
n_occurrences = sum(v for v in occurrences_by_cat.values())
n_occurrences

2708322

In [11]:
n_occurrences_not_found = sum(v for v in not_found_occurrences.values())
n_occurrences_not_found

30475

In [12]:
n_occurrences_not_found / n_occurrences

0.011252354779084614

In [13]:
n_categories - (n_cat_found - n_cat_not_found)

52

We can see that the number of categories in the `categories.txt` file are `1.673`, while we have found `1.719`different ones in the dataset, `98` out of these `1.719` are not in the `categories.txt` file, which represents `5.7%` of the categories, but only `1.1%` of the occurrences. This means that `52` categories from the `categories.txt` file are not being used.

In [14]:
# top 20 categories not found
sorted(occurrences_by_cat.items(), key=itemgetter(1), reverse=True)[:20]

[('roupas', 203108),
 ('esportes', 189423),
 ('homens', 142097),
 ('roupas masculinas', 125311),
 ('casual', 123671),
 ('esportes masculinos', 111039),
 ('camisetas', 91889),
 ('mulheres', 75269),
 ('roupas femininas', 70873),
 ('surf', 63084),
 ('calcados', 56311),
 ('manga curta', 55716),
 ('camiseta manga curta', 46601),
 ('camisetas manga curta', 44176),
 ('tenis', 41884),
 ('skate', 37452),
 ('calcados masculinos', 36929),
 ('acessorios', 36585),
 ('esportes femininos', 35163),
 ('acessorios masculinos', 28810)]

In [15]:
top_cat_names[:20]

['roupas',
 'esportes',
 'homens',
 'roupas masculinas',
 'casual',
 'esportes masculinos',
 'camisetas',
 'mulheres',
 'roupas femininas',
 'surf',
 'calcados',
 'manga curta',
 'camiseta manga curta',
 'camisetas manga curta',
 'tenis',
 'skate',
 'calcados masculinos',
 'acessorios',
 'esportes femininos',
 'acessorios masculinos']

In [16]:
sorted(not_found_occurrences.items(), key=itemgetter(1), reverse=True)[:10]

[('90', 17129),
 ('bone', 2383),
 ('chapeu e viseira', 2383),
 ('90 - masculino', 2377),
 ('90 - 2', 1831),
 ('90 feminino', 939),
 ('2 blusas por 99', 544),
 ('90 feminino 2', 514),
 ('90 camisetas', 408),
 ('00 feminino casual', 352)]

We might consider to add these categories to the `categories.txt` file, so we don't leave them out. For now, these categories will be removed from the dataset.

# Model to predict the category
## One model for each category

My plan is to create one binary classifier for each category (in this case, I will build just for the top 20 categories, but the rest follows the same pipeline).

## Building the dataset
### The labels
I will create a function that will give me the `True` or `False` value for a given `category_name`. The output of this function will be used as the labels for the dataset

In [17]:
def get_labels(cat_name):
    return data.categories.apply(lambda x: cat_name in get_cat_list(x))

### Features to predict
#### product name
For the product name, I'm using a TfIdf vectorizer to transform the string names into vectors.

#### Brand and Type
Since these variables are categorical, I will parse them into dummy variables.

#### Price
Since this is a continuous variable, I'm going to use it as it is

### Final Dataset
I'm going to combine the variables described above to merge everything into a dataset to train the classifier

In [18]:
# Parsing the product name

prod_name_vectorizer = TfidfVectorizer()

corpus = data['name'].values

prod_names = prod_name_vectorizer.fit_transform(corpus)

brand_binarizer = MultiLabelBinarizer(sparse_output=True)
prod_type_binarizer = MultiLabelBinarizer(sparse_output=True)

brand_binarized = brand_binarizer.fit_transform(data['brand'].values.reshape(-1, 1))
prod_type_binarized = prod_type_binarizer.fit_transform(data['type'].values.reshape(-1, 1))

In [19]:
dump(prod_name_vectorizer, 'feature_extraction/prod_name_vectorizer.joblib')

['feature_extraction/prod_name_vectorizer.joblib']

In [20]:
dump(brand_binarizer, 'feature_extraction/brand_binarizer.joblib')

['feature_extraction/brand_binarizer.joblib']

In [21]:
dump(prod_type_binarizer, 'feature_extraction/prod_type_binarizer.joblib')

['feature_extraction/prod_type_binarizer.joblib']

In [24]:
# Merging all the variables

features = [
    prod_names,
    brand_binarized,
    prod_type_binarized,
    data[['price']].values
]

X = hstack(features)

## Training the model
I have built a small pipeline to train and save a model based on a category name.
The pipeline was built with the possibility of adding more complex classifiers in mind.
For now, for sake of training time, I will use `Logistic Regression`, but more complex algorithms like `xgboost` are easy to add.

In [25]:
def print_confusion_matrix(cm):
    print("           |    predicted    |")
    print("     real  |  False |   True |")
    print("           |========|========|")
    print(f"     False | {cm[0][0]:6} | {cm[0][1]:6} |")
    print(f"     True  | {cm[1][0]:6} | {cm[1][1]:6} |")

    
def print_evaluation(clf, x_train, y_train, x_test, y_test):
    train_score = clf.score(x_train, y_train)
    test_score = clf.score(x_test, y_test)
    train_cm = confusion_matrix(y_train, clf.predict(x_train), labels=[False, True])
    test_cm = confusion_matrix(y_test, clf.predict(x_test), labels=[False, True])

    
    print(f"train accuracy: {train_score * 100: .2f}%")
    print("train confusion matrix")
    print_confusion_matrix(train_cm)
    print()
    print(f"test score: {test_score * 100: .2f}%")
    print("test confusion_matrix")
    print_confusion_matrix(test_cm)


def train_model(cat_name, classifier=DecisionTreeClassifier, classifier_params={}):
    print("========")
    print(f"creating model for {cat_name}")
    print("building labels...")
    y = get_labels(cat_name)
    
    print("building dataset...")
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
    
    print("training classifier...")
    print()
    
    clf = classifier(**classifier_params)
    clf.fit(x_train, y_train)
    
    print_evaluation(clf, x_train, y_train, x_test, y_test)
    
    print()
    print("saving model")
    print("======")
    print()
    dump(clf, f'models/{cat_name}.joblib')
          
    return clf

In [26]:
lr_params = {'solver': 'liblinear'}

clf = train_model('tenis', classifier=LogisticRegression, classifier_params=lr_params)

creating model for tenis
building labels...
building dataset...
training classifier...

train accuracy:  99.39%
train confusion matrix
           |    predicted    |
     real  |  False |   True |
     False | 181033 |    875 |
     True  |    423 |  28821 |

test score:  99.27%
test confusion_matrix
           |    predicted    |
     real  |  False |   True |
     False |  77413 |    442 |
     True  |    215 |  12425 |

saving model



## Training a model for the top n categories

In [27]:
n_cat = 20

for cat_name in top_cat_names[:n_cat]:
    train_model(cat_name, classifier=LogisticRegression, classifier_params=lr_params)

creating model for roupas
building labels...
building dataset...
training classifier...





train accuracy:  98.28%
train confusion matrix
           |    predicted    |
     real  |  False |   True |
     False |  67712 |   1160 |
     True  |   2464 | 139816 |

test score:  98.28%
test confusion_matrix
           |    predicted    |
     real  |  False |   True |
     False |  29171 |    496 |
     True  |   1059 |  59769 |

saving model

creating model for esportes
building labels...
building dataset...
training classifier...

train accuracy:  91.64%
train confusion matrix
           |    predicted    |
     real  |  False |   True |
     False |  66125 |  12384 |
     True  |   5266 | 127377 |

test score:  90.79%
test confusion_matrix
           |    predicted    |
     real  |  False |   True |
     False |  27954 |   5761 |
     True  |   2570 |  54210 |

saving model

creating model for homens
building labels...
building dataset...
training classifier...

train accuracy:  87.36%
train confusion matrix
           |    predicted    |
     real  |  False |   True |
     

building dataset...
training classifier...

train accuracy:  97.59%
train confusion matrix
           |    predicted    |
     real  |  False |   True |
     False | 181599 |   3793 |
     True  |   1288 |  24472 |

test score:  97.08%
test confusion_matrix
           |    predicted    |
     real  |  False |   True |
     False |  77406 |   1920 |
     True  |    721 |  10448 |

saving model

creating model for acessorios
building labels...
building dataset...
training classifier...

train accuracy:  99.26%
train confusion matrix
           |    predicted    |
     real  |  False |   True |
     False | 184925 |    545 |
     True  |   1012 |  24670 |

test score:  99.20%
test confusion_matrix
           |    predicted    |
     real  |  False |   True |
     False |  79356 |    236 |
     True  |    492 |  10411 |

saving model

creating model for esportes femininos
building labels...
building dataset...
training classifier...

train accuracy:  92.83%
train confusion matrix
         

# Predicting Categories

I have built a simple small pipeline to predict the categories (from the ones trained) a certain product belogns to. It is not really difficult to extend this for a list of products instead of only one.

In [38]:
def load_models():
    return {name.split('.')[0]: load(f'models/{name}') for name in listdir('models')}

def predict_product_categories(prod_features):
    brand_binarizer = load('feature_extraction/brand_binarizer.joblib')
    prod_type_binarizer = load('feature_extraction/prod_type_binarizer.joblib')
    prod_name_vectorizer = load('feature_extraction/prod_name_vectorizer.joblib')
    models = load_models()
    
    prod_name = prod_name_vectorizer.transform([prod_features['name']])

    prod_brand = brand_binarizer.transform([[prod_features['brand']]])

    prod_type = prod_type_binarizer.transform([[prod_features['type']]])

    prod_price = [prod_features['price']]

    features = [
        prod_name,
        prod_brand,
        prod_type,
        prod_price
    ]

    pred_x = hstack(features)

    return {cat_name for cat_name, model in models.items() if model.predict(pred_x)[0]}

In [39]:
# features of the product to be categorized

prod_features ={
    'name': 'Mochila Santino Preta',
    'price': 1,
    'brand': 'Santino',
    'type': 'sportsacessories'
}

In [40]:
predict_product_categories(prod_features)

{'acessorios', 'acessorios masculinos'}