In [1]:
import ast
import re

import numpy as np
import pandas as pd
import scipy.spatial.distance as sd
import sklearn.metrics.pairwise as pw
from fasttext import *
from sklearn.model_selection import train_test_split

import evaluation_metrics

%matplotlib inline

In [2]:
# Load the product table; fields are separated by the \x01 control character.
df = pd.read_csv("data/foodcategories_3labels.csv", sep="\x01")
# en_tags is stored as text and has to be parsed back into Python objects
# (presumably lists of tag strings). ast.literal_eval only accepts literals,
# whereas eval would execute any code embedded in the CSV.
df["en_tags"] = df["en_tags"].apply(ast.literal_eval)
# Keep only the rows that carry at least one tag.
df = df[df["en_tags"].str.len() > 0]

In [3]:
# Count how often each tag occurs over all products, sorted by ascending count.
all_tags = np.concatenate(df.en_tags.values)
label, freq = np.unique(all_tags, return_counts=True)
sr_labels = pd.Series(freq, index=label).sort_values()


In [4]:
# Distribution of the number of tags per product.
tags_per_product = df.en_tags.str.len()
tags_per_product.value_counts()

3    256401
2     47258
1     25644
Name: en_tags, dtype: int64

# Prepare

In [5]:
# Build one free-text feature per product by joining the descriptive columns.
feature_cols = ['product_name', 'generic_name', 'brands']

# Sanity check: show any rows that have no tags.
untagged = df[df.en_tags.str.len() == 0]
print(untagged)

# Missing values become empty strings so the join never sees NaN.
text_parts = df[feature_cols].fillna("")
X = text_parts.apply(" ".join, axis=1)
y = df.en_tags

Empty DataFrame
Columns: [product_name, generic_name, brands, categories, categories_tags, origins, manufacturing_places, labels, emb_codes, countries, main_category, en_tags]
Index: []


In [6]:
# Normalise the text (lower-case, drop everything that is neither a word
# character nor whitespace), then split into train and test data.
non_word = re.compile(r'[^\w\s]')
X = X.str.lower().apply(lambda text: non_word.sub('', text))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Supervised training lines: every tag prefixed with "__label__", then the text.
dftrain = y_train.apply(
    lambda tags: " ".join(["__label__" + tag for tag in tags])
) + " " + X_train


In [7]:
# Training text for the embedding model: the raw tags (no "__label__" prefix)
# are written as ordinary tokens in front of the product text.
dftrain_emb = y_train.apply(lambda x: " ".join(x)) + " " + X_train
# header=False is passed explicitly: current pandas writes a header row for a
# Series by default, which would end up as a bogus first line of training text.
dftrain_emb.to_csv(
    "train_emb.csv", index=False, header=False, sep=";")

In [8]:
# Test lines: every tag prefixed with "__label__", followed by the product text.
dftest = y_test.apply(
    lambda tags: " ".join(["__label__" + tag for tag in tags])
) + " " + X_test


In [9]:
# Write the fastText train and test files into the working directory.
# header=False is passed explicitly: current pandas writes a header row for a
# Series by default, and that line would be read as an extra (unlabelled) example.
dftrain.to_csv(
    "train.csv", index=False, header=False, sep=";")
dftest.to_csv(
    "test.csv", index=False, header=False, sep=";")

# Train

In [10]:
# train_supervised("/Users/evelyn.trautmann/projects/openfood/train.csv",
#                 autotuneValidationFile = "test.csv")

In [11]:
!/Users/evelyn.trautmann/repos/fasttext3/fastText/fasttext supervised \
    -input "/Users/evelyn.trautmann/projects/openfood/train.csv" -output model \
    -autotune-validation "test.csv" -autotune-duration 1000

Progress: 100.0% Trials:   21 Best score:  0.473464 ETA:   0h 0m 0s
Training again with best arguments
Read 2M words
Number of words:  79212
Number of labels: 137
Progress: 100.0% words/sec/thread:  155338 lr:  0.000000 avg.loss:  1.339714 ETA:   0h 0m 0s


In [12]:
# Print the arguments stored in the trained model file (model.bin).
!/Users/evelyn.trautmann/repos/fasttext3/fastText/fasttext dump model.bin args

dim 25
ws 5
epoch 100
minCount 1
neg 5
wordNgrams 4
loss softmax
model sup
bucket 1215864
minn 0
maxn 0
lrUpdateRate 100
t 0.0001


In [13]:
# Evaluate model.bin on test.csv; the trailing 3 is the k of the reported P@k / R@k.
!/Users/evelyn.trautmann/repos/fasttext3/fastText/fasttext test model.bin test.csv 3

N	108670
P@3	0.761
R@3	0.846


In [14]:
# Load the classifier trained by the CLI into Python.
# load_model is provided by the `from fasttext import *` star import.
model = load_model("model.bin")




In [15]:
# Train unsupervised embeddings on the tag + text file. The path is relative to
# the working directory, which is where this notebook writes train_emb.csv, so
# the model is always trained on the file produced by this run.
model_emb = train_unsupervised("train_emb.csv")

In [None]:
# model = train_supervised(input="/Users/evelyn.trautmann/projects/openfood/train.csv", 
#                          autotuneValidationFile="test.csv")

# Test

In [16]:
# Evaluate on test.csv with k=3. The recorded result tuple matches the CLI
# report: (number of examples, precision@3, recall@3).
model.test("test.csv", k=3)

(108670, 0.7606576485383884, 0.8459420828742287)

In [17]:
# Wrap the test texts in a DataFrame with a single "feature" column.
df_test = X_test.to_frame(name="feature")

In [18]:
# Attach the true tags, then take the number of labels to predict (K) as the
# 75% quantile of the number of tags per product.
df_test = df_test.join(y_test)

tag_counts = df_test.en_tags.str.len()
K = int(tag_counts.quantile(0.75))
K

3

# Multilabel Evaluation 

In [19]:
# Top-K prediction for every test text. Each result is indexed below as
# [0] -> predicted labels and [1] -> their confidences.
df_test["prediction"] = df_test.feature.apply(lambda text: model.predict(text, k=K))

predicted_labels = df_test.prediction.str[0]
predicted_scores = df_test.prediction.str[1]

# Spread predictions and true tags into one column per rank.
for k in range(K):
    df_test["label_predicted%i" % k] = predicted_labels.str[k].str.replace("__label__", "")
    df_test["confidence%i" % k] = predicted_scores.str[k]
    df_test["truth%i" % k] = df_test.en_tags.str[k].str.replace("__label__", "")


In [20]:
# Non-null count per column. truth1/truth2 come out lower in the recorded run,
# presumably because rows with fewer tags have no entry at that position.
df_test.count()

feature             108670
en_tags             108670
prediction          108670
label_predicted0    108670
confidence0         108670
truth0              108670
label_predicted1    108670
confidence1         108670
truth1              100092
label_predicted2    108670
confidence2         108670
truth2               84382
dtype: int64

In [21]:
# Sanity check: no test row may have an empty tag list.
n_tags = df_test.en_tags.str.len()
assert not (n_tags == 0).any()

In [22]:
# All known classes, taken from the label-frequency series.
classes = sr_labels.index.values
# Alternative kept for reference: use only the classes present in the test set.
#classes = list(set(np.concatenate(df_test.en_tags[df_test.en_tags.str.len()>0].values)))

# Names of the per-rank truth and prediction columns in df_test.
truth = ["truth%i" % k for k in range(int(K))]
predictions = ["label_predicted%i" % k for k in range(int(K))]

In [23]:
# Build the confusion matrix from the truth / prediction columns of df_test.
# get_confusion lives in the evaluation_metrics module (not shown here).
# Slow: about 16 minutes wall time in the recorded run.
%time confusion = evaluation_metrics.get_confusion(df_test, classes, K, truth, predictions)

CPU times: user 9min 42s, sys: 15.6 s, total: 9min 58s
Wall time: 15min 43s


In [24]:
# Compare the diagonal of the confusion matrix with its overall total and
# with the number of test rows.
diag_total = np.diag(confusion).sum()
grand_total = confusion.sum().sum()

print(diag_total)
print(grand_total)
print(df_test.shape[0])

# Share of the total mass that sits on the diagonal.
print(diag_total/grand_total)

247982.0
293144.0
108670
0.8459391971181399


In [25]:
# Per-class report derived from the confusion matrix (evaluation_metrics helper, not shown here).
df_report = evaluation_metrics.get_report(confusion)

In [26]:
# Aggregate the report into the micro / macro / weighted averages shown below.
evaluation_metrics.get_summary(df_report, confusion)

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.845939,0.845939,0.845939,293144.0
macro avg,0.711866,0.791126,0.749406,293144.0
weighted avg,0.85662,0.845939,0.851246,293144.0


# Similarity

In [27]:
# One embedding per class: each class name is passed to the unsupervised
# model's get_sentence_vector and the result is stored under that class name.
class_vecs = pd.Series({cl:model_emb.get_sentence_vector(cl) for cl in classes})

In [28]:
# IPython introspection (`??` shows the source of the method); interactive aid only.
# NOTE(review): leftover from development - consider removing it from the final notebook.
model_emb.get_sentence_vector??

In [29]:
# Pairwise Euclidean distances between the class embeddings, as a
# class-by-class table.
def _distances_from(vec):
    """Return the distance from `vec` to every class embedding."""
    return class_vecs.apply(lambda other: sd.euclidean(vec, other))

Dst = class_vecs.apply(_distances_from)

In [31]:
# Blank out every pair of classes that is farther apart than 0.5.
mask = Dst > 0.5
Dst[mask] = np.nan
# Per class, the label of the most distant class that is still within the
# threshold. idxmax returns index labels and skips NaN. Applying np.argmax to
# a Series is deprecated for this purpose and returns integer positions in
# current pandas instead of labels.
Dst.idxmax()

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)


fruit-juices-and-nectars                            juices-and-nectars
plant-based-spreads                                plant-based-spreads
crisps                                                          crisps
fruit-based-beverages                            plant-based-beverages
chicken-breasts                                               chickens
yogurts                                                  fruit-yogurts
spreadable-fats                                        spreadable-fats
legumes                                                        legumes
bee-products                                              bee-products
vegetable-fats                                          vegetable-fats
white-hams                                                  white-hams
plant-based-pickles                                plant-based-pickles
dried-plant-based-foods                       canned-plant-based-foods
chocolate-candies                                    chocolate-candies
plant-

In [32]:
# Sanity check: the masked cells were set to NaN, so every column should
# count 0 non-null values among them.
Dst[mask].count().value_counts()

0    138
dtype: int64

In [33]:
# Similarity from distance: exp(-d^2), with NaN distances mapped to similarity 0.
squared = Dst ** 2
S = np.exp(-squared).fillna(0)

In [34]:
# Sum of the confusion matrix along its first axis, one value per class label.
confusion.sum()

fruit-juices-and-nectars                 1.000000
plant-based-spreads                      0.000000
crisps                                   0.000000
fruit-based-beverages                    0.000000
chicken-breasts                          0.000000
yogurts                                  2.000000
spreadable-fats                          3.000000
legumes                                  1.000000
bee-products                            19.000000
vegetable-fats                          14.666667
white-hams                              18.333333
plant-based-pickles                     21.666667
dried-plant-based-foods                 18.000000
chocolate-candies                       30.000000
plant-based-beverages                   38.000000
fruits-based-foods                      93.000000
canned-plant-based-foods               126.666667
salmons                                158.666667
cereals-and-potatoes                   158.000000
seeds                                  176.333333


In [35]:
# Same report as before, but with the class-similarity matrix S passed in as well
# (how S is used is defined in evaluation_metrics, not shown here).
df_report = evaluation_metrics.get_report(confusion, S)
#.dropna()

In [36]:
# Summary averages for the similarity-aware report.
evaluation_metrics.get_summary(df_report, confusion, S)

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.849167,0.849167,0.849167,293144.0
macro avg,0.718927,0.797948,0.756379,293144.0
weighted avg,0.859261,0.849167,0.854184,293144.0


In [45]:
# NOTE(review): this writes the in-memory frame (tags already parsed, untagged rows
# dropped) back to the same path the notebook loads from at the top, replacing
# that file. The execution count (45) is also out of sequence - confirm the
# overwrite is intended before running all cells.
df.to_csv("data/foodcategories_3labels.csv", index=False, sep="\x01")

In [48]:
# List the working directory (interactive check; the output is machine-specific).
!ls -lh


total 725328
-rw-r--r--  1 evelyn.trautmann  666409104    85K Sep 23 22:15 OpenFoodFactExample-Copy1.ipynb
-rw-r--r--  1 evelyn.trautmann  666409104    27K Sep 23 22:28 OpenFoodFactExample.ipynb
-rw-r--r--  1 evelyn.trautmann  666409104   103K Sep 18 22:38 PrepareData.ipynb
-rw-r--r--  1 evelyn.trautmann  666409104   169K Sep 15 13:32 PyDataExample.ipynb
-rw-r--r--  1 evelyn.trautmann  666409104   1.5K Sep 23 21:32 README.md
-rw-r--r--  1 evelyn.trautmann  666409104   9.4K Sep 19 21:56 Untitled.ipynb
drwxr-xr-x  3 evelyn.trautmann  666409104    96B Sep 23 21:27 __pycache__
-rw-r--r--@ 1 evelyn.trautmann  666409104    14M Sep  7 14:39 alc_beverages.csv
-rw-r--r--@ 1 evelyn.trautmann  666409104   2.7M Sep  7 14:40 beef.csv
-rw-r--r--@ 1 evelyn.trautmann  666409104    18M Sep  7 14:29 biscuits.csv
-rw-r--r--@ 1 evelyn.trautmann  666409104    13M Sep  7 14:29 cake.csv
drwxr-xr-x  4 evelyn.trautmann  666409104   128B Sep 23 22:27 data
drwxr-xr-x  5 evelyn.