In [1]:
from collections import Counter

import pandas as pd
import numpy as np
from more_itertools import flatten

def import_data(path):
    return pd.read_csv(
        str(path),
        sep='\t',
        usecols=['code', 'url', 'product_name', 'generic_name', 'brands_tags',
                 'categories_tags', 'ingredients_text', 'main_category_en',
                 'countries_tags', 'last_modified_t'],
        dtype={'code': 'str', 'product_name': 'str'},
        converters={'categories_tags': lambda x: x.split(',') if x else np.NaN}
    )


In [2]:
df = import_data("data/en.openfoodfacts.org.products.csv")
df = df.fillna(value={'ingredients_text': "", 'product_name': ""})
fr_df = df[df['countries_tags'] == 'en:france']
filt_df = fr_df[pd.notnull(fr_df['categories_tags'])]
filt_df = filt_df[pd.notnull(filt_df['ingredients_text'])]

categories_counter = Counter(flatten(c for c in filt_df['categories_tags']))

  if self.run_code(code, result):


In [3]:
#for name, count in categories_counter.most_common(500):
#    print(f"{name} - {count}")

In [4]:
CATEGORY_COUNT_THRESHOLD = 25
excluded_categories = {"en:plant-based-foods-and-beverages", "en:plant-based-foods"}

selected_categories = [cat for (cat, count) in categories_counter.items() if count >= CATEGORY_COUNT_THRESHOLD and cat not in excluded_categories]
selected_categories_set = set(selected_categories)

In [5]:
criterion = filt_df['categories_tags'].map(lambda x: bool(set(x).intersection(selected_categories_set)))
cat_df = filt_df[criterion]
cat_df['categories_tags_int'] = cat_df.categories_tags.apply(lambda categories: [selected_categories.index(x) for x in categories if x in selected_categories])
cat_df['categories_tags_array'] = cat_df.categories_tags.apply(lambda categories: [1 if x in categories else 0 for x in selected_categories])
print(f"{len(fr_df)} elements in original dataframe, {len(cat_df)} after category filter")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


405973 elements in original dataframe, 120358 after category filter


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [6]:
cat_df.head(5)

Unnamed: 0,code,url,last_modified_t,product_name,generic_name,brands_tags,categories_tags,countries_tags,ingredients_text,main_category_en,categories_tags_int,categories_tags_array
11,1281,http://world-en.openfoodfacts.org/product/0000...,1527070794,Tarte noix de coco,,"crous-resto,crous","[en:pies, en:sweet-pies, en:coconut-pies]",en:france,,Pies,"[0, 1]","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17,1885,http://world-en.openfoodfacts.org/product/0000...,1518126491,Compote de poire,,crous,"[en:plant-based-foods-and-beverages, en:plant-...",en:france,,Plant-based foods and beverages,"[2, 3, 4, 5]","[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
45,5470,http://world-en.openfoodfacts.org/product/0000...,1537211991,BAguette bressan,,crousresto,"[en:plant-based-foods-and-beverages, en:plant-...",en:france,"Bâguette Bressan Pain baguette 46,2%,' fqrine ...",Plant-based foods and beverages,"[6, 7, 8]","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
47,5661,http://world-en.openfoodfacts.org/product/0000...,1522437072,Pavé de saumon fumé à la ficelle,,,"[en:seafood, en:fishes, en:salmons, en:smoked-...",en:france,,Seafood,"[9, 10, 11, 12, 13, 14]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
105,20114,http://world-en.openfoodfacts.org/product/0000...,1524490564,Naturablue original,,natura4ever,[en:dietary-supplements],en:france,"Glycérine d'origine naturelle (stabilisant), s...",Dietary supplements,[15],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
import numpy as np
from preprocessing import preprocess

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.externals import joblib

def predict(text, classifier):
    pred = classifier.predict([text])[0]
    return [selected_categories[i] for i in range(len(pred)) if pred[i]]

In [7]:
column_trans = ColumnTransformer([
    ('ingredients_vectorizer', CountVectorizer(min_df=5, preprocessor=preprocess), 'ingredients_text'),
    ('product_name_vectorizer', CountVectorizer(min_df=5, preprocessor=preprocess), 'product_name'),
])

classifier = Pipeline([
    ('column_transformer', column_trans),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))])

In [8]:
# df_train, df_test = train_test_split(cat_df)
# y_train = np.array(list(df_train.categories_tags_array.values))
# y_test = np.array(list(df_test.categories_tags_array.values))

# column_trans.fit(df_train)
# classifier.fit(df_train, y_train)

# y_pred = classifier.predict(df_test)
# print(classification_report(y_test, y_pred, target_names=selected_categories))

# joblib.dump(classifier, 'clf-model.joblib')

In [8]:
classifier = joblib.load('clf-model.joblib')

In [9]:
y_pred = classifier.predict_proba(filt_df)

In [10]:
import categories
category_taxonomy = categories.generate_category_taxonomy('categories.json')

In [24]:
y_pred_cat = []
y_pred_prob = []

for i in range(y_pred.shape[0]):
    row = y_pred[i]
    indexes = (row > 0.5).nonzero()[0]
    
    current_cat = filt_df.iloc[i].categories_tags
    if len(indexes):
        cat = [selected_categories[i] for i in indexes]
        cat = [c for c in cat if c not in current_cat]
        
        if cat:
            row_cat = categories.Category.find_deepest_item(cat, category_taxonomy)
            y_pred_cat.append(row_cat)
            y_pred_prob.append(row.max())
        else:
            y_pred_cat.append(np.NaN)
            y_pred_prob.append(np.NaN)       
    else:
        y_pred_cat.append(np.NaN)
        y_pred_prob.append(np.NaN)

In [25]:
filt_df['predicted_categories_tag'] = y_pred_cat
filt_df['predicted_categories_prob'] = y_pred_prob

In [26]:
filt_df

Unnamed: 0,code,url,last_modified_t,product_name,generic_name,brands_tags,categories_tags,countries_tags,ingredients_text,main_category_en,predicted_categories_tag,predicted_categories_prob
11,0000000001281,http://world-en.openfoodfacts.org/product/0000...,1527070794,Tarte noix de coco,,"crous-resto,crous","[en:pies, en:sweet-pies, en:coconut-pies]",en:france,,Pies,,
17,0000000001885,http://world-en.openfoodfacts.org/product/0000...,1518126491,Compote de poire,,crous,"[en:plant-based-foods-and-beverages, en:plant-...",en:france,,Plant-based foods and beverages,,
45,0000000005470,http://world-en.openfoodfacts.org/product/0000...,1537211991,BAguette bressan,,crousresto,"[en:plant-based-foods-and-beverages, en:plant-...",en:france,"Bâguette Bressan Pain baguette 46,2%,' fqrine ...",Plant-based foods and beverages,,
47,0000000005661,http://world-en.openfoodfacts.org/product/0000...,1522437072,Pavé de saumon fumé à la ficelle,,,"[en:seafood, en:fishes, en:salmons, en:smoked-...",en:france,,Seafood,,
105,0000000020114,http://world-en.openfoodfacts.org/product/0000...,1524490564,Naturablue original,,natura4ever,[en:dietary-supplements],en:france,"Glycérine d'origine naturelle (stabilisant), s...",Dietary supplements,,
108,0000000024600,http://world-en.openfoodfacts.org/product/0000...,1434535914,Filet de bœuf,,,[fr:filet-de-boeuf],en:france,,fr:Filet-de-boeuf,en:meats,0.746351
248,0000000274722,http://world-en.openfoodfacts.org/product/0000...,1535310176,Blanquette de Volaille et son Riz,Blanquette de Volaille et son Riz,comme-j-aime,"[en:meals, en:meat-based-products, en:meals-wi...",en:france,"Riz précuit 40,4 % (eau, riz, huile de colza, ...",Meals,,
249,0000000274739,http://world-en.openfoodfacts.org/product/0000...,1513097739,Raviolini au Fromage de chèvre et Pesto,,comme-j-aime,"[en:meals, en:microwave-meals]",en:france,,Meals,,
282,0000005200016,http://world-en.openfoodfacts.org/product/0000...,1442570752,lentilles vertes,,bertrand-lejeune,"[en:plant-based-foods-and-beverages, en:plant-...",en:france,lentilles vertes,Plant-based foods and beverages,,
287,0000007730009,http://world-en.openfoodfacts.org/product/0000...,1482359150,Biscuits sablés fourrage au cacao,Biscuits sablés déclassés fourrage au cacao,st-michel,"[en:sugary-snacks, en:biscuits-and-cakes, en:b...",en:france,"Sucre, farine de _Blé_, graisse et huiles végé...",Sugary snacks,,


In [14]:
export_df = filt_df.drop(['url', 'generic_name', 'brands_tags', 'categories_tags', 'countries_tags', 'product_name', 'ingredients_text', 'main_category_en'], axis=1)
export_df = export_df[pd.notnull(export_df.predicted_categories_tag)]
export_df.to_json('predicted_categories_all.json', orient='records', lines=True)