In [1]:
from collections import Counter

import categories
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.externals import joblib
from more_itertools import flatten

from ml import import_data, create_classifier

In [2]:
df = import_data("data/en.openfoodfacts.org.products.csv")
fr_df = df[df['countries_tags'] == 'en:france']
filt_df = fr_df[pd.notnull(fr_df['categories_tags'])]

  if self.run_code(code, result):


In [3]:
categories_counter = Counter(flatten(c for c in filt_df['categories_tags']))
print("Number of categories: {}".format(len(categories_counter)))

Number of categories: 12390


In [4]:
selected_categories = [
    "en:beverages",
    "en:sugary-snacks",
    "en:meals",
    "en:dairies",
    "en:meats",
    "en:desserts",
    "en:frozen-foods",
    "en:breakfasts",
    "en:cheeses",
    "en:biscuits",
    "en:groceries",
    "en:fats",
    "en:chocolates",
    "en:sauces",
]

selected_categories_set = set(selected_categories)

In [5]:
criterion = filt_df['categories_tags'].map(lambda x: bool(set(x).intersection(selected_categories_set)))
cat_df = filt_df[criterion]
cat_df['categories_tags_int'] = cat_df.categories_tags.apply(lambda categories: [selected_categories.index(x) for x in categories if x in selected_categories])
cat_df['categories_tags_array'] = cat_df.categories_tags.apply(lambda categories: [1 if x in categories else 0 for x in selected_categories])
print(f"{len(fr_df)} elements in original dataframe, {len(cat_df)} after category filter")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


405973 elements in original dataframe, 92160 after category filter


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [6]:
TRAIN = True

df_train, df_test = train_test_split(cat_df, random_state=42)
y_train = np.array(list(df_train.categories_tags_array.values))
y_test = np.array(list(df_test.categories_tags_array.values))

if TRAIN:
    classifier = create_classifier(custom_tokenizer=True)
    classifier.fit(df_train, y_train)
else:
    classifier = joblib.load('clf-model-wide.joblib')

y_pred = classifier.predict(df_test)
print(classification_report(y_test, y_pred, target_names=selected_categories))



                  precision    recall  f1-score   support

    en:beverages       0.96      0.89      0.92      5315
en:sugary-snacks       0.94      0.89      0.91      4552
        en:meals       0.92      0.84      0.88      2990
      en:dairies       0.96      0.84      0.90      3171
        en:meats       0.96      0.87      0.91      1947
     en:desserts       0.89      0.63      0.74      1707
 en:frozen-foods       0.86      0.54      0.66      1569
   en:breakfasts       0.95      0.80      0.87      1820
      en:cheeses       0.97      0.86      0.91      1343
     en:biscuits       0.91      0.75      0.82       997
    en:groceries       0.95      0.77      0.85      1505
         en:fats       0.98      0.78      0.87       963
   en:chocolates       0.91      0.71      0.80      1110
       en:sauces       0.94      0.76      0.84       948

       micro avg       0.94      0.81      0.87     29937
       macro avg       0.93      0.78      0.85     29937
    weighted

  'precision', 'predicted', average, warn_for)




                  precision    recall  f1-score   support

    en:beverages       0.96      0.89      0.92      5315
en:sugary-snacks       0.95      0.88      0.91      4552
        en:meals       0.92      0.83      0.87      2990
      en:dairies       0.96      0.84      0.90      3171
        en:meats       0.96      0.87      0.91      1947
     en:desserts       0.89      0.63      0.74      1707
 en:frozen-foods       0.86      0.53      0.66      1569
   en:breakfasts       0.96      0.80      0.87      1820
      en:cheeses       0.97      0.86      0.91      1343
     en:biscuits       0.91      0.74      0.82       997
    en:groceries       0.95      0.77      0.85      1505
         en:fats       0.98      0.78      0.87       963
   en:chocolates       0.91      0.71      0.79      1110
       en:sauces       0.95      0.76      0.84       948

       micro avg       0.94      0.81      0.87     29937
       macro avg       0.94      0.78      0.85     29937
    weighted

  'precision', 'predicted', average, warn_for)


In [8]:
y = np.array(list(cat_df.categories_tags_array.values))
X = cat_df.ingredients_text.values

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
classifier = Pipeline([
    ('vectorizer', CountVectorizer(min_df=5, preprocessor=preprocess)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))])

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred, target_names=selected_categories))



                  precision    recall  f1-score   support

    en:beverages       0.94      0.69      0.80      5420
en:sugary-snacks       0.93      0.75      0.83      4463
        en:meals       0.88      0.73      0.80      2967
      en:dairies       0.95      0.70      0.80      3191
        en:meats       0.94      0.77      0.85      1958
     en:desserts       0.84      0.47      0.60      1717
 en:frozen-foods       0.82      0.40      0.54      1514
   en:breakfasts       0.93      0.57      0.70      1825
      en:cheeses       0.95      0.72      0.82      1355
     en:biscuits       0.84      0.61      0.70      1037
    en:groceries       0.87      0.56      0.68      1491
         en:fats       0.97      0.58      0.72       993
   en:chocolates       0.87      0.56      0.68      1073
       en:sauces       0.86      0.53      0.66       937

       micro avg       0.91      0.65      0.76     29941
       macro avg       0.90      0.62      0.73     29941
    weighted

  'precision', 'predicted', average, warn_for)


In [10]:
i = 0
print(X_test[i])
print(preprocess(X_test[i]))

Eau de source 84,6%, sucre, jus de pomme à base de concentré 6,5%, jus de cassis à base de concentré 3,5%, acidifiant (acide citrique), arômes       
eau de source sucre jus de pomme a base de concentre jus de cassis a base de concentre acidifiant acide citrique aromes 


In [11]:
def predict(text):
    pred = classifier.predict([text])[0]
    return [selected_categories[i] for i in range(len(pred)) if pred[i]]

In [12]:
i = 10
print(X_test[i])
print(predict(X_test[i]))
print([selected_categories[k] for k in range(len(y_test[i])) if y_test[i][k]])

Jambon de porc VPF, sel, dextrose, conservateurs : erythorbate et nitrite de sodium, arômes.
['en:meats']
['en:meats']


In [13]:
print(predict("eau, sucres"))

['en:beverages']


In [14]:
df_train, df_test = train_test_split(cat_df)
y_train = np.array(list(df_train.categories_tags_array.values))
y_test = np.array(list(df_test.categories_tags_array.values))

column_trans = ColumnTransformer([
    ('ingredients_vectorizer', CountVectorizer(min_df=5, preprocessor=preprocess), 'ingredients_text'),
    ('product_name_vectorizer', CountVectorizer(min_df=5, preprocessor=preprocess), 'product_name'),
])

column_trans.fit(df_train)

classifier = Pipeline([
    ('column_transformer', column_trans),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))])

classifier.fit(df_train, y_train)
y_pred = classifier.predict(df_test)
print(classification_report(y_test, y_pred, target_names=selected_categories))



                  precision    recall  f1-score   support

    en:beverages       0.96      0.90      0.93      5313
en:sugary-snacks       0.95      0.87      0.91      4583
        en:meals       0.92      0.84      0.88      3047
      en:dairies       0.96      0.84      0.90      3106
        en:meats       0.96      0.87      0.91      1906
     en:desserts       0.92      0.61      0.73      1776
 en:frozen-foods       0.87      0.52      0.65      1534
   en:breakfasts       0.94      0.81      0.87      1837
      en:cheeses       0.97      0.87      0.92      1361
     en:biscuits       0.91      0.72      0.80      1033
    en:groceries       0.95      0.76      0.84      1581
         en:fats       0.98      0.75      0.85       938
   en:chocolates       0.88      0.73      0.80      1019
       en:sauces       0.96      0.78      0.86      1015

       micro avg       0.94      0.81      0.87     30049
       macro avg       0.94      0.78      0.85     30049
    weighted

  'precision', 'predicted', average, warn_for)


In [15]:
clf = classifier.steps[2][1]
beverage_coeff = clf.coef_[0, :]

In [16]:
column_trans

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('ingredients_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1),
        preprocessor=<fu...=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None), 'product_name')])

In [17]:
def analyze_top_features(features, scores, top=10, negative=False):
    sorted_scores_features = sorted(zip(features, scores), key=lambda x: x[1], reverse=not negative)
    return sorted_scores_features[:top]

In [18]:
len(column_trans.get_feature_names())

10711

In [19]:
for i in range(len(selected_categories)):
    category = selected_categories[i]
    print(category)
    coef = clf.coef_[i, :]
    top_features = analyze_top_features(column_trans.get_feature_names(), coef, 20)
    for feature, score in top_features:
        print("{}: {}".format(feature, score))
    print("")

en:beverages
product_name_vectorizer__jus: 10.775450263052933
ingredients_vectorizer__eau: 7.428921536441256
product_name_vectorizer__boire: 7.04305341051363
ingredients_vectorizer__jus: 6.979689280532793
product_name_vectorizer__the: 6.829278696948448
product_name_vectorizer__nectar: 6.678503539160729
product_name_vectorizer__boisson: 6.391223996845006
ingredients_vectorizer__gazeifiee: 5.754354999331071
product_name_vectorizer__infusion: 5.6564581111475265
product_name_vectorizer__eau: 5.260356371111634
ingredients_vectorizer__the: 5.118364271741177
product_name_vectorizer__sirop: 5.087890451664498
ingredients_vectorizer__cafe: 4.78472644641649
ingredients_vectorizer__base: 4.497150313101522
product_name_vectorizer__limonade: 4.311640329192033
ingredients_vectorizer__houblon: 4.303870276415494
product_name_vectorizer__tea: 4.267282271220333
ingredients_vectorizer__sulfites: 4.1789964325046665
product_name_vectorizer__cidre: 3.942779854597489
product_name_vectorizer__smoothie: 3.77111

product_name_vectorizer__huile: 12.001321298231485
product_name_vectorizer__beurre: 9.789060872672353
ingredients_vectorizer__huile: 6.5757227379229075
product_name_vectorizer__oil: 5.052350312531394
ingredients_vectorizer__beurre: 4.687024320207681
product_name_vectorizer__doux: 4.3435908292495
ingredients_vectorizer__oil: 4.248746668194252
product_name_vectorizer__margarine: 4.13767548186729
product_name_vectorizer__graisse: 4.052872365797842
product_name_vectorizer__vierge: 3.883334764489878
product_name_vectorizer__baratte: 3.677751347891233
product_name_vectorizer__huiles: 3.5689958488343674
ingredients_vectorizer__huiles: 3.4122012661825183
product_name_vectorizer__demi: 3.2863713395668386
product_name_vectorizer__omega: 3.2716326074880255
product_name_vectorizer__olio: 3.243184333491343
product_name_vectorizer__tartine: 3.209841036888579
product_name_vectorizer__butter: 3.1876476028264533
ingredients_vectorizer__vegetales: 3.057573197368988
product_name_vectorizer__oliva: 3.0108

In [39]:
no_cat_df = fr_df[pd.isnull(fr_df['categories_tags']) & pd.notnull(fr_df['ingredients_text'])]

In [40]:
y_pred = classifier.predict_proba(no_cat_df)

In [41]:
y_pred_cat = []
y_pred_prob = []

for i in range(y_pred.shape[0]):
    row = y_pred[i]
    indexes = (row > 0.5).nonzero()[0]
    
    if len(indexes):
        row_cat = selected_categories[row.argmax()]
        y_pred_cat.append(row_cat)
        y_pred_prob.append(row.max())
    else:
        y_pred_cat.append(np.NaN)
        y_pred_prob.append(np.NaN)

In [42]:
no_cat_df['predicted_categories_tag'] = y_pred_cat
no_cat_df['predicted_categories_prob'] = y_pred_prob

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [43]:
no_cat_df

Unnamed: 0,code,url,last_modified_t,product_name,generic_name,brands_tags,categories_tags,countries_tags,ingredients_text,main_category_en,predicted_categories_tag,predicted_categories_prob
0,0000000000017,http://world-en.openfoodfacts.org/product/0000...,1529059204,Vitória crackers,,,,en:france,,,,
1,0000000000031,http://world-en.openfoodfacts.org/product/0000...,1539464817,Cacao,,,,en:france,,,en:sugary-snacks,0.878868
2,0000000000123,http://world-en.openfoodfacts.org/product/0000...,1535737986,Sauce Sweety chili 0%,,,,en:france,,,en:groceries,0.835508
3,0000000000291,http://world-en.openfoodfacts.org/product/0000...,1534239732,Mendiants,,,,en:france,,,,
4,0000000000949,http://world-en.openfoodfacts.org/product/0000...,1523440823,Salade de carottes râpées,,,,en:france,,,en:meals,0.991342
5,0000000000970,http://world-en.openfoodfacts.org/product/0000...,1520506371,Fromage blanc aux myrtilles,,,,en:france,,,en:cheeses,0.679874
6,0000000001001,http://world-en.openfoodfacts.org/product/0000...,1537766416,,,,,en:france,,,,
7,0000000001007,http://world-en.openfoodfacts.org/product/0000...,1538127565,Vainilla,,,,en:france,,,,
8,0000000001137,http://world-en.openfoodfacts.org/product/0000...,1539781578,Baguette parisien,,,,en:france,,,,
9,0000000001151,http://world-en.openfoodfacts.org/product/0000...,1537883538,&quot;Baguette Lyonnais&quot;,,,,en:france,,,,


In [44]:
export_df = no_cat_df.drop(['url', 'generic_name', 'brands_tags', 'categories_tags', 'countries_tags', 'product_name', 'ingredients_text', 'main_category_en'], axis=1)
export_df = export_df[pd.notnull(export_df.predicted_categories_tag)]
export_df.to_json('predicted_categories.json', orient='records', lines=True)