In [1]:
from typing import List

In [2]:
import pandas as pd
import matplotlib
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc

In [3]:
nlp_fr = spacy.load("fr_core_news_sm")

The file is far too large to open in a regular text editor, but we still need to know how it is formatted in order to import it, so we print its first few lines:

# Peek at the first raw lines to discover the file layout without loading the
# whole file. The encoding is passed explicitly (the same one read_csv uses
# below) so the preview does not depend on the platform default encoding.
PREVIEW_LINES = 6
with open("openfoodfacts.csv", encoding="utf-8") as f:
    for cpt, line in enumerate(f, start=1):
        print(line)
        if cpt >= PREVIEW_LINES:
            break

The output shows that the fields are separated by the tab character (`'\t'`).

# Load the whole file: UTF-8 text, tab-separated fields.
df = pd.read_csv("openfoodfacts.csv", encoding = 'utf-8', delimiter="\t")

# Display the frame.
df

# List the column names.
print(df.columns)

# Number of missing values in each column.
na_counts = df.isna().sum()

dict(na_counts)

# Rows whose 'code' equals 225.
df.loc[df['code'] == 225]

Apply a condition on every entry of a column, then filter the dataframe using the boolean array:

%%time
# Variant 1: test every value in Python (str() also turns missing values into
# text), wrap the booleans in a Series and use it as a row mask.
df.loc[pd.Series(list(map(lambda f: "Prince Chocolat" in str(f), df['product_name'])))]

%%time
# Variant 2: the same filter with the vectorised string accessor; na=False
# makes missing names count as non-matching.
df.loc[df['product_name'].str.contains("Prince Chocolat", na=False)]

# Ingredient text of the row labelled 14454.
df["ingredients_text"].loc[14454]#.split(" ")

from collections import defaultdict
# Count how often each space-separated token occurs in the ingredient texts
# (rows without an ingredient text are skipped).
ingredients = defaultdict(int)
for ingr in df["ingredients_text"].dropna():
    for token in ingr.split(" "):
        ingredients[token] += 1
        
ingredients

# Same count for the packaging descriptions.
packaging = defaultdict(int)
for ingr in df["packaging_text"].dropna():
    for token in ingr.split(" "):
        packaging[token] += 1
        
packaging

# Most frequent ingredient tokens first.
sorted(ingredients.items(), key=lambda k_v: k_v[1], reverse=True)

# Most frequent packaging tokens first.
sorted(packaging.items(), key=lambda k_v: k_v[1], reverse=True)

df['countries_tags'].head(10)

# Count the products per country tag; one product can carry several
# comma-separated tags.
countries_tags = defaultdict(int)
for product in df["countries_tags"].dropna():
    for country in product.split(","):
        countries_tags[country] += 1
        
countries_tags

# Keep the rows whose country tags mention "france" (missing tags do not match).
fr_db_temp = df.loc[df['countries_tags'].str.contains("france", na=False)]
fr_db_temp

relevant_columns = ['code', 'product_name', 'ingredients_text', 'packaging_text']

# Drop rows lacking an ingredient text or a product name, then keep only the
# relevant columns.
fr_db= fr_db_temp.dropna(subset= ['ingredients_text', 'product_name']).filter(items= relevant_columns)
fr_db

# Plain Python list with one ingredient text per remaining product.
ingredient_lists = list(fr_db['ingredients_text'])
ingredient_lists

def custom_tokenizer(nlp):
    """Build a rule-based spaCy ``Tokenizer`` on top of ``nlp``'s vocabulary.

    The tokenizer splits off an opening bracket or quote at the start of a
    chunk, a closing bracket, quote or ``%`` at the end of a chunk, and splits
    inside a chunk on ``-`` and ``~``. Chunks starting with ``http://`` or
    ``https://`` are matched as URLs, and ``:)`` is registered as a special case.

    Args:
        nlp: object exposing a ``vocab`` attribute (a loaded spaCy pipeline).

    Returns:
        The configured ``Tokenizer`` instance.
    """
    patterns = {
        "prefix": re.compile(r'''^[\[\("']'''),
        "suffix": re.compile(r'''[\]\)"'\%]$'''),
        "infix": re.compile(r'''[-~]'''),
        "url": re.compile(r'''^https?://'''),
    }
    return Tokenizer(
        nlp.vocab,
        rules={":)": [{"ORTH": ":)"}]},
        prefix_search=patterns["prefix"].search,
        suffix_search=patterns["suffix"].search,
        infix_finditer=patterns["infix"].finditer,
        url_match=patterns["url"].match,
    )





# Load the French model again and replace its tokenizer with the rule-based
# one built by custom_tokenizer.
nlp_fr = spacy.load("fr_core_news_sm")
nlp_fr.tokenizer = custom_tokenizer(nlp_fr)


my_str= u"PAIN BAGEL nature 44,2 % : farine de BLE, eau, huile végétale raffinée de colza, sucre, GLUTEN DE BLE, farine de MALT DE BLE, levure, sel, farine de SOJA, émulsifiants : E471, E322, agents de traitement de la farine : E300, E920. Garniture 55,8% : Aubergines grillées 41,4%, tomate 27.6%, FROMAGE FRAIS 13,8% (BABEURRE, CREME pasteurisée 13,3%, protéines de LAIT, acidifiant : acide citrique, sel, épaississant : farine de graines de caroube  conservateur : sorbate de potassium, arôme naturel), GRANA PADANO 13,8% (LAIT de vache cru, sel, présure, conservateur : lysozyme [protéine d'OEUF]), SAUCE PESTO 3,4% (huile de tournesol 42 %, basilic 29%, FROMAGES italiens 12,7% [grana padano, pecorino romano], farine de NOIX DE CAJOU, sel, BABEURRE en poudre/PIGNONS 9.8% fibres de BLE. acidifiant : acide lactique  antioxydant: acide ascorbique, ail). (% exprimés sur le garniture)"
# Tokenize the sample text and show the resulting tokens.
doc = nlp_fr(my_str)
print([t.text for t in doc])
print(doc.char_span(18, 24))
# Character spans of everything in the raw text that looks like a percentage
# ("44,2 %", "55,8%", "27.6%", ...), including an optional trailing comma.
indexes = [m.span() for m in re.finditer('[0-9]?[0-9] ?[,.]? ?[0-9]?[0-9]? ?%,?',my_str,flags=re.IGNORECASE)]
print(indexes)
for start, end in indexes:
    # Debug output: the span with and without one extra character.
    print(doc.char_span(start, end+1))
    print(doc.char_span(start, end))
    # Merge only when char_span returned a usable span
    # (presumably None when the offsets do not fall on token boundaries - confirm in the spaCy docs).
    if doc.char_span(start, end):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc.char_span(start, end))
    
# Tokens after merging the percentages.
print([t.text for t in doc])

def tokenize_ingredients(ingredient_lists: List[str]) -> list:
    """Run every ingredient text through the small French spaCy model.

    Args:
        ingredient_lists: raw ingredient texts, one string per product.

    Returns:
        A list with one processed document per input text, in input order.
    """
    # The tagger, parser, NER and lemmatizer components are switched off when
    # the model is loaded.
    nlp = spacy.load("fr_core_news_sm", disable=['tagger', 'parser', 'ner', 'lemmatizer'])
    # Process the texts passed in; the previous code ignored the argument and
    # piped the unrelated module-level `ingredients` token counter instead.
    return list(nlp.pipe(ingredient_lists))

tokenize_ingredients(ingredient_lists)

## Language detection

In [4]:
from langdetect import detect

In [5]:
def try_detect(text):
    """Return the language detected for ``text``, or "error" if detection fails.

    Args:
        text: the text handed to ``detect``.

    Returns:
        The value returned by ``detect``, or the string "error" when ``detect``
        raises; in that case the offending text is printed.
    """
    try:
        language = detect(text)
    except Exception:
        # Best effort: one undetectable text must not abort a whole-column run.
        # `Exception` (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long run can still be stopped.
        language = "error"
        print("This throws an error:", text)
    return language

# Detect the language of every ingredient text ("error" when detection fails).
fr_db['detected_lang'] = fr_db['ingredients_text'].apply(try_detect)

# Keep the rows whose detected language code contains "fr".
fr_db2 = fr_db.loc[fr_db['detected_lang'].str.contains("fr", na=False)]

In [6]:
fr_db2 = pd.read_csv('filtered_openfoodfacts.csv', sep = '\t')

In [7]:
fr_db2

Unnamed: 0.1,Unnamed: 0,code,product_name,ingredients_text,packaging_text,detected_lang
0,6,0000000000100,moutarde au moût de raisin,eau graines de téguments de moutarde vinaigre ...,,fr
1,25,0000000001199,Solène céréales poulet,"antioxydant : érythorbate de sodium, colorant ...",,fr
2,41,0000000002264,Baguette Poitevin,"baguette Poite vin Pain baguette 50,6%: farine...",,fr
3,52,0000000003827,Suedois saumon,"Paln suédois 42,6%: farine de BLÉ, eau, farine...",,fr
4,58,0000000004510,Salade shaker taboulé,"Taboulé 76,2%, légumes 12%, huile de colza, se...",,fr
...,...,...,...,...,...,...
201107,1989475,99903336,Jus multifruits,Jus de fruits à base de jus concentrés et puré...,,fr
201108,1989480,9991111111154,Compote à Boire Pomme Poire,"Pomme 71 %, poire 26 %, sucre 4%, antioxydant ...",,fr
201109,1989483,99911522,Chipolatas,"Viande de porc 92%(origine France) eau,sel,aro...",,fr
201110,1989531,9999200847366,Choco pearls,"sucre,beurre de cacao,poudre de jait.entier,fa...",,fr


# index=False: the row index is not written, so loading this file and saving
# it again no longer adds one more "Unnamed: 0" column on every round trip.
fr_db2.to_csv("filtered_openfoodfacts.csv", sep= "\t", index=False)

# Clustering

In [8]:
RELEVANT_COLUMNS = ['code', 'product_name', 'ingredients_text', 'packaging_text']

def country_filter(df) -> pd.DataFrame:
    """Keep only the products tagged as sold in France.

    A row is kept when its 'countries_tags' value contains the text "france";
    rows with a missing value are dropped.
    """
    sold_in_france = df['countries_tags'].str.contains("france", na=False)
    return df.loc[sold_in_france]

def na_filter(df) -> pd.DataFrame:
    """Drop incomplete products and keep only the relevant columns.

    Rows with a missing 'ingredients_text' or 'product_name' are removed; of
    the remaining columns only those listed in RELEVANT_COLUMNS are returned.
    """
    complete_rows = df.dropna(subset=['ingredients_text', 'product_name'])
    return complete_rows.filter(items=RELEVANT_COLUMNS)

# ~45 minutes on 250k entries
def detect_language(text: str) -> str:
    """Return the language of the text ('fr' for French), or "error".

    Args:
        text: the text handed to ``detect``.

    Returns:
        The value returned by ``detect``, or the string "error" when ``detect``
        raises.
    """
    try:
        language = detect(text)
    except Exception:
        # Best effort: a text that cannot be classified is tagged "error"
        # instead of aborting the run. `Exception` (not a bare except) keeps
        # KeyboardInterrupt/SystemExit working during the long-running apply.
        language = "error"
    return language

def language_filter(df) -> pd.DataFrame:
    """Keep only the products whose ingredient list is detected as French.

    Args:
        df: frame with an 'ingredients_text' column.

    Returns:
        A new frame with an extra 'detected_lang' column, restricted to the
        rows whose detected language code contains "fr". The input frame is
        left untouched.
    """
    # assign() returns a copy, so the caller's frame (often a filtered slice of
    # a larger one) does not get a column added behind its back.
    tagged = df.assign(detected_lang=df['ingredients_text'].apply(detect_language))
    is_french = tagged['detected_lang'].str.contains("fr", na=False)
    return tagged.loc[is_french]

def filter_df(df) -> pd.DataFrame:
    """Run the three filters in order: country, missing values, language."""
    return language_filter(na_filter(country_filter(df)))

def read_data():
    """Read the raw product export, filter it and save the result.

    Reads 'en.openfoodfacts.org.products.csv' (UTF-8, tab-separated), applies
    ``filter_df`` and writes the outcome to 'filtered_openfoodfacts.csv' with
    tab separators. Returns whatever ``DataFrame.to_csv`` returns.
    """
    raw_products = pd.read_csv('en.openfoodfacts.org.products.csv', encoding = 'utf-8', delimiter="\t")
    filtered_products = filter_df(raw_products)
    return filtered_products.to_csv("filtered_openfoodfacts.csv", sep= "\t")

def compute_similarity(str1, str2) -> float:
    """Compute similarity ratio between two strings.

    Delegates to ``ratio(str1, str2)``.
    NOTE(review): ``ratio`` is neither defined nor imported in the visible part
    of the notebook - confirm where it is meant to come from.
    """
    return ratio(str1, str2)

def remove_accent(accented_string) -> str:
    """Return ``unidecode.unidecode(accented_string)``.

    NOTE(review): ``unidecode`` is only imported in a later cell, which also
    defines this function again with the same body.
    """
    return unidecode.unidecode(accented_string)

def remove_special_char(noisy_string) -> str:
    """Delete every '*' and '•' character from the given string."""
    unwanted = str.maketrans("", "", "*•")
    cleaned = noisy_string.translate(unwanted)
    return cleaned

def replace_char(noisy_string) -> str:
    """Turn every underscore of the given string into a space."""
    return " ".join(noisy_string.split("_"))

def split_numbers(transaction_description) -> list:
    """Separate numbers from the words they are glued to.

    Args:
        transaction_description: iterable of strings (words).

    Returns:
        A flat list in which every run of digits - optionally followed by
        '/' and more digits, e.g. "12/05" - is an item of its own and the
        surrounding text is kept as separate items. Empty pieces are dropped.
    """
    # The previous pattern r'(\d+/?\d+)' needed at least two digits, so
    # single-digit numbers ("5kg") were never split off.
    number_pattern = re.compile(r'(\d+(?:/\d+)?)')
    pieces = []
    for word in transaction_description:
        # re.split keeps the captured number and yields '' next to a match at
        # the start or end of the word; those empty pieces are skipped.
        pieces.extend(piece for piece in number_pattern.split(word) if piece)
    return pieces

def remove_pourcentage(ingredient) -> str:
    """Remove every percentage ("12%", "12,5 %", "3 %", ...) from a string.

    Args:
        ingredient: the text to clean.

    Returns:
        The text with each matched percentage deleted. The return annotation
        used to say ``list`` although ``re.sub`` returns a string.
    """
    # NOTE(review): the '.' in "( |.)" is not escaped, so ANY single character
    # (not only a decimal point or comma) is accepted between the two digit
    # groups, e.g. the '-' in "5-10%". Kept as is because existing results
    # depend on it - confirm whether that is intended.
    percentage_pattern = r'(\d+( |.)?(\d+)?( )?%)'
    return re.sub(percentage_pattern, "", ingredient)

def preprocess_ingredient_list(ingredient_list) -> str:
    """Normalise one raw ingredient text.

    Steps, in order: lower-case the text, pass it through ``replace_char``,
    then through ``remove_special_char``. Slash replacement and accent removal
    are currently not applied.
    """
    lowered = ingredient_list.lower()
    return remove_special_char(replace_char(lowered))

def parse_ingredient(data, index) -> pd.DataFrame:
    """Split one tokenized ingredient list into ingredient / sub-ingredient rows.

    The tokens are read left to right while a nesting depth is tracked:
    '(', '[' and ':' go one level deeper; ')', ']' and a ',' that follows a
    ':' go one level back up. Text at depth 0 is collected as the ingredient,
    at depth 1 as ``subingredient_1`` and at depth 2 as ``subingredient_2``
    (deeper or negative depths are ignored). A ',', ':' or '.' token - and a
    '%' token, which is first added to the current text - closes the current
    item: one row is written with the text of all three levels, then the
    current level and the levels below it are cleared.

    Changes compared with the previous version:
    * every row is written as soon as its item is closed. Before, rows were
      written one delimiter late and the last one was never written at all;
    * text left over at the end of the input (no closing delimiter) now
      produces a row too instead of being dropped;
    * rows are collected in a list and turned into a frame once, instead of
      calling ``DataFrame.append`` for every row.

    Args:
        data: iterable of tokens; each token is converted with ``str()``.
        index: label given to every row of the result.

    Returns:
        A frame with the columns "ingredient", "subingredient_1",
        "subingredient_2" and "root_ingredient" (the last one is never filled
        here), one row per closed item.
    """
    columns = ["ingredient", "subingredient_1", "subingredient_2", "root_ingredient"]
    # Text collected so far for: ingredient, subingredient_1, subingredient_2.
    levels = ['', '', '']
    depth = 0
    last_delimiter = ''
    records = []
    # True while text has been collected since the last row was written.
    pending = False

    def _collect(text):
        """Add a token to the text of the current depth (depths 0-2 only)."""
        if 0 <= depth <= 2:
            levels[depth] += ' ' + text

    def _close_item():
        """Write a row if any level holds text, then clear the current level and below."""
        if any(levels):
            records.append({'ingredient': levels[0].strip(),
                            'subingredient_1': levels[1].strip(),
                            'subingredient_2': levels[2].strip()})
        if 0 <= depth <= 2:
            for level in range(depth, len(levels)):
                levels[level] = ''

    for token in data:
        text = str(token)

        # Update the nesting depth first; the new depth applies to this token.
        if text in ('(', '[', ':'):
            last_delimiter = text
            depth += 1
        elif text in (')', ']') or (text == ',' and last_delimiter == ':'):
            last_delimiter = text
            depth -= 1

        if text == '%':
            # The percent sign belongs to the item it ends.
            _collect(text)
            _close_item()
            pending = False
        elif text in (',', ':', '.'):
            _close_item()
            pending = False
        elif text not in ('(', ')', '[', ']'):
            _collect(text)
            pending = True

    # End of input also closes the item that was being read.
    if pending:
        _close_item()

    return pd.DataFrame(records, columns=columns, index=[index] * len(records))
    
def parse_ingredients(ingredients_dataframe) -> pd.DataFrame:
    """Parse every ingredient text of a frame into ingredient rows.

    Each value of the ``ingredients`` column is preprocessed, tokenized with
    ``nlp_fr`` and handed to ``parse_ingredient`` together with its row
    position (0, 1, 2, ...), which becomes the index label of the resulting
    rows.

    Side effect (kept from the previous version): a "tokenized" column holding
    the tokenized texts is added to ``ingredients_dataframe``.

    Args:
        ingredients_dataframe: frame with an ``ingredients`` column.

    Returns:
        The per-text results stacked into one frame; an empty frame when the
        input has no rows.
    """
    tokenized = [nlp_fr(preprocess_ingredient_list(text)) for text in ingredients_dataframe.ingredients]
    ingredients_dataframe["tokenized"] = tokenized
    # Iterate over the list rather than looking rows up by label, so a frame
    # whose index is not 0..n-1 works as well. The partial results are
    # concatenated once: DataFrame.append was removed in pandas 2.0 and copied
    # the whole frame on every call.
    parsed = [parse_ingredient(tokens, position) for position, tokens in enumerate(tokenized)]
    if not parsed:
        return pd.DataFrame()
    return pd.concat(parsed)

def parse_ingredients_test() -> pd.DataFrame:
    """Run ``parse_ingredients`` on four sample ingredient texts.

    Nothing is asserted; the parsed frame is returned for visual inspection.
    """
    data = pd.DataFrame(
    ["""45 % massepain (44% amandes, sucre, sirop de glucose et de fructose, eau), farine de blé, 10 % de raisins secs, 10% beurre, dextrose, sucre, 2% amandes, levure, œuf entier*, sel de cuisine, graisse de palme, amidon de blé. *d'élevage au sol. Peut contenir traces des arachides, de soja et des autres fruits à coque. 
     Conserver à l'abri de la chaleur et au sec.""",
     """BRIOCHE TRANCHÉE AU BEURRE ET À LA CRÉME FRAÏCHE Farine de blé française 53%, 
     sucre, beurre pâtissier 8%, œufs entiers frais 7.5%, eau, crème fraîche française 3.2%, 
     sirop de sucre inverti, sel, ez/ uten de blé, levure, arômes naturels (contient ainol), 
     émulsifiant (E471), protéines de lait, levwe désactivée, colorant (béta carotène).""", 
     """antioxydant : érythorbate de sodium, colorant : caramel - origine UE), 
     tomate 33,3%, MAYONNAISE 11,1% (huile de colza 78,9%, eau, jaunes d'OEUF 6%, 
     vinaigre, MOUTARDE [eau, graines de MOUTARDE, sel, vinaigre, curcuma], sel, 
     dextrose, stabilisateur : gomme de cellulose, conservateur : sorbate de potassium, colorant : ?-carotène, arôme)""",
     """MAItSON GAUCHER chocolat de couverture noir 64% chocolat de couverture lait 38,2% amandes,
     noisettes, pistaches, noix, raisins, écorces d'oranges"""], 
    columns = ["ingredients"])
    return parse_ingredients(data)

def get_ingredients(ingredients_dataframe):
    """Return the most specific ingredient name of every parsed row.

    The frame is first run through ``parse_ingredients``. For each resulting
    row the deepest non-empty level is chosen (``subingredient_2``, then
    ``subingredient_1``, then ``ingredient``) and its percentages are removed
    with ``remove_pourcentage``. If nothing is left after that, the row
    contributes nothing - the shallower levels are not tried.
    """
    parsed = parse_ingredients(ingredients_dataframe)
    levels_deepest_first = ("subingredient_2", "subingredient_1", "ingredient")
    names = []
    for position in range(len(parsed)):
        row = parsed.iloc[position]
        for level in levels_deepest_first:
            if row[level] != "":
                cleaned = remove_pourcentage(row[level])
                if cleaned != "":
                    names.append(cleaned)
                # Only the first non-empty level counts for this row.
                break
    return names

def correct_ingredients(misspelled):
    """Spell-check a text word by word with the French ``SpellChecker``.

    Args:
        misspelled: the text to correct; it is split on whitespace.

    Returns:
        The corrected words, each preceded by a single space (so a non-empty
        result starts with a space). A word for which the checker has no
        suggestion is kept unchanged. Every word and its correction are also
        printed.
    """
    spell = SpellChecker(language='fr')
    correction = ""
    for word in misspelled.split():
        # Look the word up once (the lookup used to be repeated for printing).
        suggestion = spell.correction(word)
        if suggestion is None:
            # No candidate found: keep the original word instead of failing on
            # `str + None`.
            suggestion = word
        correction += ' ' + suggestion
        print("original word: "+ word)
        print("corrected word: "+ suggestion)
    return correction

def correct_ingredients_test():
    """Run ``correct_ingredients`` on a preprocessed sample text with typos.

    Nothing is asserted; the corrected text is returned for inspection.
    """
    misspelled = """Sucre, blanc d’oeufs frais, poudre d’amande, 16.5%, beurre co/centré, 
    farine de_blé, oeufs frais. sirop de glucose-fructose, 
    stabilsant : glycérol, sel, poudres à lever : carbonatesdesodium-diphosphates (blé), arôm."""
    return correct_ingredients(preprocess_ingredient_list(misspelled))

In [9]:
# Print the preprocessed ingredient text of the first 50 products, each
# followed by a blank line.
for i in range(50):
    print(preprocess_ingredient_list(fr_db2.iloc[i]['ingredients_text']), '\n')

eau graines de téguments de moutarde vinaigre de vin rouge sel vin rouge sucre   moût de raisin (6.2%) oignons colorants extraits de carotte et extrait de paprika huile de tournesol son de moutarde sel (cette  moutarde  uniquement disponible chez courte paille) 

antioxydant : érythorbate de sodium, colorant : caramel - origine ue), tomate 33,3%, mayonnaise 11,1% (huile de colza 78,9%, eau, jaunes d'oeuf 6%, vinaigre, moutarde [eau, graines de moutarde, sel, vinaigre, curcuma], sel, dextrose, stabilisateur : gomme de cellulose, conservateur : sorbate de potassium, colorant : ?-carotène, arôme) 

baguette poite vin pain baguette 50,6%: farine de blé, eau, sel, levure, gluten, farine de ble maité, levure désactivée, acide ascorbique, garniture fromage mi-chèvre 46% (lait pasteurisé [95 0% lait de vache, 5 0% lait de chèvre], sel, ferments lactiques et daffinage, coagulant), tomate saladg 20,4%, huile d'olive i basilic 1% 

paln suédois 42,6%: farine de blé, eau, farine de seigle, sucre, 

In [10]:
def remove_special_characters(text: str):
    """Delete a fixed set of symbols from ``text``.

    The removed characters are: asterisk, bullet, ampersand, backslash, equals
    sign, plus sign, underscore, tilde, hash, superscript two, angle brackets,
    exclamation mark and question mark.
    """
    deletion_table = str.maketrans("", "", r"*•&\=+_~#²<>!?")
    return text.translate(deletion_table)

In [11]:
def test_remove_special_characters():
    """Print the result of ``remove_special_characters`` on a sample string (nothing is asserted)."""
    print(remove_special_characters("ndgn*dbs•z&\\\/=kvcxng+_sbdbfs~#²sbbs<sb> sbcdjbokdsc"))
    
test_remove_special_characters()

ndgndbsz/kvcxngsbdbfssbbssb sbcdjbokdsc


In [12]:
import unidecode
def remove_accent(accented_string) -> str:
    """Return ``unidecode.unidecode(accented_string)``.

    NOTE(review): same name and body as the ``remove_accent`` defined in an
    earlier cell; this definition replaces it.
    """
    return unidecode.unidecode(accented_string)

In [13]:
# Characters that may sit inside a word, mapped to what replaces them:
# punctuation becomes a space, the 'œ' ligature is spelled out.
INFIX_DELIMITERS = {
    ':': ' ',
    '-': ' ',
    ',': ' ',
    ';': ' ',
    '.': ' ',
    "'": ' ',
    "/": ' ',
    "%": ' ',
    'œ': 'oe',
}


def replace_infix(text):
    """Apply every replacement listed in INFIX_DELIMITERS to ``text``, in dictionary order."""
    for delimiter, replacement in INFIX_DELIMITERS.items():
        text = text.replace(delimiter, replacement)
    return text

In [14]:
def test_replace_infix():
    """Print the result of ``replace_infix`` on a sample sentence (nothing is asserted)."""
    print(replace_infix("je mange des œufs-mimosas..."))
test_replace_infix()

je mange des oeufs mimosas   


In [15]:
def remove_percentage(text) -> str:
    """Remove every percentage ("12%", "12,5 %", "3 %", ...) from a string.

    Args:
        text: the text to clean.

    Returns:
        The text with each matched percentage deleted. The return annotation
        used to say ``list`` although ``re.sub`` returns a string.
    """
    # NOTE(review): the '.' in "( |.)" is not escaped, so ANY single character
    # (not only a decimal point or comma) is accepted between the two digit
    # groups, e.g. the '-' in "5-10%". Kept as is because existing results
    # depend on it - confirm whether that is intended.
    percentage_pattern = r'(\d+( |.)?(\d+)?( )?%)'
    return re.sub(percentage_pattern, "", text)

In [16]:
def test_remove_percentage():
    """Print the result of ``remove_percentage`` on percentages written in various ways (nothing is asserted)."""
    print(remove_percentage("acide15.1%, nitrate15,1%, ammoniac 15.1%, levure 15,1%, banane 15.6 %, orange 20,8 %, fruit 25 %, cassis 15.7%ù kiwi 7815%, trucmuche 15 . 1 % autre 15 , 1 % jaipludinspi 15 . 1% gngn 15 , 1% "))
    
test_remove_percentage()

acide, nitrate, ammoniac , levure , banane , orange , fruit , cassis ù kiwi , trucmuche 15 .  autre 15 ,  jaipludinspi 15 .  gngn 15 ,  


In [18]:
def remove_numbers(tokens):
    """Return the tokens that are not purely numeric (per ``str.isnumeric``), in order."""
    return [token for token in tokens if not token.isnumeric()]

In [19]:
def test_remove_numbers():
    """Print the result of ``remove_numbers`` on a whitespace-split sample (nothing is asserted)."""
    print(remove_numbers("dfhdi 15 dijv 75 vjsoi 14.7, dijviv e205, dvov15, 14, 14,7    19, kd".split()))
test_remove_numbers()

['dfhdi', 'dijv', 'vjsoi', '14.7,', 'dijviv', 'e205,', 'dvov15,', '14,', '14,7', '19,', 'kd']


In [20]:
def flatten_tokens(tokens):
    """Flatten ``tokens`` by one level.

    Items that are plain lists are replaced by their elements; every other
    item is kept as is. Lists nested deeper than one level stay nested.
    """
    flat_tokens = []
    for token in tokens:
        # Exact type check: only plain `list` objects are expanded.
        pieces = token if type(token) == list else [token]
        flat_tokens.extend(pieces)
    return flat_tokens

In [21]:
PREFIX_DELIMITERS = ['(', '[']
SUFFIX_DELIMITERS = [')', ']', '.', ',', ';', ':']

def split_punct_prefix(tokens):
    """Split a leading '(' or '[' off every token longer than one character.

    Args:
        tokens: sequence of string tokens.

    Returns:
        A new flat list in which such a token is replaced by two items: the
        delimiter and the rest of the token. The input is not modified.
    """
    # Work on a copy: the previous version overwrote entries of the caller's
    # list with nested lists.
    pieces = list(tokens)
    for position, token in enumerate(pieces):
        if len(token) > 1 and token[0] in PREFIX_DELIMITERS:
            pieces[position] = [token[0], token[1:]]
    return flatten_tokens(pieces)

def split_punct_suffix(tokens):
    """Split a trailing delimiter off every token longer than one character.

    The delimiters are the entries of SUFFIX_DELIMITERS.

    Args:
        tokens: sequence of string tokens.

    Returns:
        A new flat list in which such a token is replaced by two items: the
        token without its last character, and that character. The input is
        not modified.
    """
    # Work on a copy: the previous version overwrote entries of the caller's
    # list with nested lists.
    pieces = list(tokens)
    for position, token in enumerate(pieces):
        if len(token) > 1 and token[-1] in SUFFIX_DELIMITERS:
            pieces[position] = [token[:-1], token[-1]]
    return flatten_tokens(pieces)

def split_puncts(tokens):
    """Split leading delimiters off the tokens first, then trailing ones."""
    without_prefixes = split_punct_prefix(tokens)
    return split_punct_suffix(without_prefixes)

split_puncts("hello, world! (hihi)".split())

['hello', ',', 'world!', '(', 'hihi', ')']

In [22]:
def tokenizer(ingr: str):
    """Turn one raw ingredient text into a list of cleaned tokens.

    Steps, in order: lower-case, remove special characters, remove accents,
    remove percentages, replace in-word delimiters, split on whitespace, drop
    purely numeric and empty tokens, and finally split leading/trailing
    punctuation off the remaining tokens.
    """
    text = remove_special_characters(ingr.lower())
    text = remove_accent(text)
    text = remove_percentage(text)
    text = replace_infix(text)
    words = [word for word in remove_numbers(text.split()) if word != '']
    return split_puncts(words)

In [23]:
from nltk.stem.snowball import SnowballStemmer
def tokenizer2(ingr: str):
    """Turn one raw ingredient text into a list of French word stems.

    Steps, in order: lower-case and delete a fixed set of symbols and
    brackets, remove accents, remove percentages, replace in-word delimiters,
    delete every digit, split on whitespace, drop words of one or two
    characters and stem what is left with the French Snowball stemmer.
    """
    dropped_chars = str.maketrans("", "", r"*•&\/=+_~#²<>!?{}()[].")
    text = ingr.lower().translate(dropped_chars)
    text = replace_infix(remove_percentage(remove_accent(text)))
    without_digits = "".join(char for char in text if not char.isdigit())
    words = [word for word in without_digits.split() if len(word) > 2]
    stemmer = SnowballStemmer(language = "french")
    return [stemmer.stem(word) for word in words]

In [24]:
%%time
test_list = []
for i in range(len(fr_db2)):
    test_list.append(tokenizer2(fr_db2.iloc[i]['ingredients_text']))
    
test_list

Wall time: 1min 31s


[['eau',
  'grain',
  'tegu',
  'moutard',
  'vinaigr',
  'vin',
  'roug',
  'sel',
  'vin',
  'roug',
  'sucr',
  'mout',
  'raisin',
  'oignon',
  'color',
  'extrait',
  'carott',
  'extrait',
  'paprik',
  'huil',
  'tournesol',
  'son',
  'moutard',
  'sel',
  'cet',
  'moutard',
  'uniqu',
  'disponibl',
  'chez',
  'court',
  'paill'],
 ['antioxyd',
  'erythorbat',
  'sodium',
  'color',
  'caramel',
  'origin',
  'tomat',
  'mayonnais',
  'huil',
  'colz',
  'eau',
  'jaun',
  'oeuf',
  'vinaigr',
  'moutard',
  'eau',
  'grain',
  'moutard',
  'sel',
  'vinaigr',
  'curcum',
  'sel',
  'dextros',
  'stabilis',
  'gomm',
  'cellulos',
  'conserv',
  'sorbat',
  'potassium',
  'color',
  'caroten',
  'arom'],
 ['baguet',
  'poit',
  'vin',
  'pain',
  'baguet',
  'farin',
  'ble',
  'eau',
  'sel',
  'levur',
  'gluten',
  'farin',
  'ble',
  'mait',
  'levur',
  'desactive',
  'acid',
  'ascorb',
  'garnitur',
  'fromag',
  'chevr',
  'lait',
  'pasteuris',
  'lait',
  'vach',


In [40]:
import os  # `os` is not imported anywhere else in the visible notebook
print(os.path.exists("./data/filtered_openfoodfacts.csv"))

True


In [25]:
from collections import defaultdict
# Count how often each stem occurs across all tokenized ingredient lists.
# This rebinds `ingredients`, which an earlier cell used for raw token counts.
ingredients = defaultdict(int)
for ingr_list in test_list:
    for ingr in ingr_list:
        ingredients[ingr] += 1
        
ingredients

defaultdict(int,
            {'eau': 108853,
             'grain': 28229,
             'tegu': 192,
             'moutard': 19269,
             'vinaigr': 22672,
             'vin': 9711,
             'roug': 12962,
             'sel': 159427,
             'sucr': 139948,
             'mout': 1106,
             'raisin': 8005,
             'oignon': 30623,
             'color': 29900,
             'extrait': 38049,
             'carott': 18114,
             'paprik': 8913,
             'huil': 90869,
             'tournesol': 44432,
             'son': 2285,
             'cet': 556,
             'uniqu': 409,
             'disponibl': 50,
             'chez': 226,
             'court': 100,
             'paill': 19,
             'antioxyd': 25067,
             'erythorbat': 2510,
             'sodium': 44565,
             'caramel': 9403,
             'origin': 27291,
             'tomat': 29621,
             'mayonnais': 788,
             'colz': 29878,
             'jaun': 10479,
   

In [26]:
# For every product keep only the stems that occur more than 50 times in the
# whole corpus.
clean_data = [[stem for stem in stems if ingredients[stem] > 50] for stems in test_list]
clean_data

[['eau',
  'grain',
  'tegu',
  'moutard',
  'vinaigr',
  'vin',
  'roug',
  'sel',
  'vin',
  'roug',
  'sucr',
  'mout',
  'raisin',
  'oignon',
  'color',
  'extrait',
  'carott',
  'extrait',
  'paprik',
  'huil',
  'tournesol',
  'son',
  'moutard',
  'sel',
  'cet',
  'moutard',
  'uniqu',
  'chez',
  'court'],
 ['antioxyd',
  'erythorbat',
  'sodium',
  'color',
  'caramel',
  'origin',
  'tomat',
  'mayonnais',
  'huil',
  'colz',
  'eau',
  'jaun',
  'oeuf',
  'vinaigr',
  'moutard',
  'eau',
  'grain',
  'moutard',
  'sel',
  'vinaigr',
  'curcum',
  'sel',
  'dextros',
  'stabilis',
  'gomm',
  'cellulos',
  'conserv',
  'sorbat',
  'potassium',
  'color',
  'caroten',
  'arom'],
 ['baguet',
  'vin',
  'pain',
  'baguet',
  'farin',
  'ble',
  'eau',
  'sel',
  'levur',
  'gluten',
  'farin',
  'ble',
  'levur',
  'desactive',
  'acid',
  'ascorb',
  'garnitur',
  'fromag',
  'chevr',
  'lait',
  'pasteuris',
  'lait',
  'vach',
  'lait',
  'chevr',
  'sel',
  'ferment',
  '

In [27]:
def custom_tokenizer(text):
    """Tokenize ``text`` with ``tokenizer`` and wrap the tokens in a ``Doc`` built on ``nlp_fr.vocab``.

    NOTE(review): this reuses the name of the earlier ``custom_tokenizer(nlp)``
    factory and replaces it once this cell has run.
    """
    return Doc(nlp_fr.vocab, tokenizer(text))

In [28]:
nlp_fr.tokenizer = custom_tokenizer

%%time
# Time the spaCy pipeline (with the replaced tokenizer) on the ingredient
# texts of the first 10000 products.
doc_test = []
for i in range(10000):
    doc_test.append(nlp_fr(fr_db2.iloc[i]['ingredients_text']))

In [29]:
import gensim.models

In [30]:
class MyCorpus:
    """Iterable over the sentences (lists of str) stored in ``clean_data``.

    Every call to ``iter()`` starts again from the first sentence, so the
    corpus can be traversed several times.
    """

    def __iter__(self):
        yield from clean_data

In [31]:
%%time

# Train a Word2Vec model with default parameters on the cleaned stem lists.
# NOTE(review): no seed is passed, so results may differ between runs - confirm
# whether reproducibility matters here.
sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

Wall time: 7.18 s


In [32]:
%%time

from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project the model's word vectors to two dimensions with t-SNE.

    Args:
        model: object exposing ``wv.vectors`` and ``wv.index_to_key``.

    Returns:
        ``(x_vals, y_vals, labels)``: two lists with the first and second
        coordinate of every projected vector, and a numpy array built from
        ``wv.index_to_key``.
    """
    num_dimensions = 2  # size of the projected space

    embeddings = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # Fixed random_state for the projection.
    projector = TSNE(n_components=num_dimensions, random_state=0)
    projected = projector.fit_transform(embeddings)

    x_vals = [point[0] for point in projected]
    y_vals = [point[1] for point in projected]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the labels as a text scatter plot with plotly.

    Args:
        x_vals, y_vals: coordinates of the points.
        labels: text drawn at each point.
        plot_in_notebook: if true, render inline with ``iplot``; otherwise
            ``plot`` is called with the file name 'word-embedding-plot.html'.
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot the points with matplotlib and annotate a random sample.

    Args:
        x_vals, y_vals: coordinates of the points.
        labels: one label per point.
        num_labels: how many randomly chosen points get their label drawn
            (default 25). When there are fewer points, all are labelled.

    Note: the global ``random`` generator is re-seeded with 0, so the same
    points are picked on every call.
    """
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    indices = list(range(len(labels)))
    # random.sample raises ValueError when asked for more items than exist, so
    # the sample size is capped at the number of points.
    selected_indices = random.sample(indices, min(num_labels, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))




Wall time: 7.4 s


In [33]:
%%time

# Pick the plotting backend: calling get_ipython() without an exception
# selects plotly, any exception (such as the name being undefined) selects
# matplotlib.
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)

Wall time: 2.63 s


In [34]:
len(labels)

2588