In [23]:
import os
import pandas as pd
import numpy as np
import spacy


from matplotlib import pyplot as plt
from tqdm import tqdm
from helpers.prep.foodbert_norm import RecipeNormalizer
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import nltk
import string

# One-time setup: uncomment to download the NLTK stop-word corpus that
# `stopwords.words('english')` needs later in this notebook.
# nltk.download('stopwords')

# Directory containing the wine-review CSV extracts that are loaded below.
BASE_PATH = "../data/wine_extracts"



In [24]:
def _read_wine_csv(path):
    """Read one extract, preferring UTF-8 and falling back to latin-1.

    Decoding UTF-8 files as latin-1 silently produces mojibake (for example a
    non-breaking space turns into "Â "), so latin-1 is used only when the file
    is not valid UTF-8.
    """
    try:
        return pd.read_csv(path, low_memory=False)
    except UnicodeDecodeError:
        return pd.read_csv(path, low_memory=False, encoding='latin-1')


# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# order decides which duplicate `Name` row survives drop_duplicates() below.
extract_files = sorted(os.listdir(BASE_PATH))
if not extract_files:
    raise FileNotFoundError(f'No wine extract files found in {BASE_PATH!r}')

# Read every file first and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
extract_frames = [
    _read_wine_csv(os.path.join(BASE_PATH, file_name)) for file_name in extract_files
]

# Keep the first row seen for each wine name. `.copy()` detaches the result so
# the column assignments below operate on an independent frame.
wine_dataframe = (
    pd.concat(extract_frames, axis=0)
    .drop_duplicates(subset=['Name'])
    .copy()
)

geographies = ['Subregion', 'Region', 'Province', 'Country']

# Trim stray whitespace from the geography labels. Every value goes through
# str(), so missing values end up as the literal string 'nan'.
for geo in geographies:
    wine_dataframe[geo] = wine_dataframe[geo].apply(lambda x: str(x).strip())

print(wine_dataframe.shape)


(143686, 22)


In [25]:
# Preview the first three rows of the combined frame.
wine_dataframe.head(3)

Unnamed: 0.1,Unnamed: 0,Alcohol,Appellation,Bottle Size,Category,Country,Date Published,Description,Designation,Importer,...,Province,Rating,Region,Reviewer,Reviewer Twitter Handle,Subregion,User Avg Rating,Variety,Vintage,Winery
0,791.0,9%,"Burgenland, Austria",375 ml,Dessert,Austria,12/1/2018,"Marmalade, toffee, vanilla and tonka bean swir...",Nouvelle Vague Trockenbeerenauslese Nummer 7,Terlato Wines International,...,Burgenland,98.0,,Anne KrebiehlÂ MW,@AnneInVino,,Not rated yet [Add Your Review],Chardonnay,2015.0,Kracher
1,792.0,10.50%,"Burgenland, Austria",375 ml,Dessert,Austria,12/1/2018,A touch of creaminess on the nose leads to a m...,Trockenbeerenauslese Grand Selection,GD Imports,...,Burgenland,97.0,,Anne KrebiehlÂ MW,@AnneInVino,,Not rated yet [Add Your Review],Chardonnay,2015.0,H.& C. Nittnaus
2,796.0,9%,"Burgenland, Austria",375 ml,Dessert,Austria,12/1/2018,A smokiness lies thickly above the apple fruit...,Trockenbeerenauslese,Blue Danube Wine Co,...,Burgenland,96.0,,Anne KrebiehlÂ MW,@AnneInVino,,Not rated yet [Add Your Review],Chardonnay,2015.0,Rosenhof


In [26]:
# List the column labels available in the combined frame.
wine_dataframe.columns

Index(['Unnamed: 0', 'Alcohol', 'Appellation', 'Bottle Size', 'Category',
       'Country', 'Date Published', 'Description', 'Designation', 'Importer',
       'Name', 'Price', 'Province', 'Rating', 'Region', 'Reviewer',
       'Reviewer Twitter Handle', 'Subregion', 'User Avg Rating', 'Variety',
       'Vintage', 'Winery'],
      dtype='object')

In [27]:
# Show every field of a single record (the row at position 2).
wine_dataframe.iloc[2]

Unnamed: 0                                                             796.0
Alcohol                                                                   9%
Appellation                                              Burgenland, Austria
Bottle Size                                                           375 ml
Category                                                             Dessert
Country                                                              Austria
Date Published                                                     12/1/2018
Description                A smokiness lies thickly above the apple fruit...
Designation                                             Trockenbeerenauslese
Importer                                                 Blue Danube Wine Co
Name                       Rosenhof 2015 Trockenbeerenauslese Chardonnay ...
Price                                                                   $29 
Province                                                          Burgenland

In [29]:
# Number of leading reviews used to build the term corpus.
CORPUS_REVIEW_LIMIT = 5000

stop_words = set(stopwords.words('english'))
term_normalizer = RecipeNormalizer()

# Lower-case the descriptions and skip missing ones: str(NaN) would otherwise
# inject the bogus token "nan" into the corpus.
corpus_descriptions = [
    str(description).lower()
    for description in wine_dataframe.Description.to_numpy()[:CORPUS_REVIEW_LIMIT]
    if pd.notna(description)
]
all_wine_corpus = ' '.join(corpus_descriptions)

# Split into sentences one review at a time so that a review lacking terminal
# punctuation cannot be merged with the start of the next review.
wine_sentences_tokenized = [
    sentence
    for description in corpus_descriptions
    for sentence in sent_tokenize(description)
]

# One list of tokens per sentence, keeping only alphabetic, non-stop-word tokens.
# Sentences whose tokens are all filtered out stay in the list as empty lists.
all_corpus_by_word = [
    [
        word
        for word in word_tokenize(sentence)
        if word.isalpha() and word not in stop_words
    ]
    for sentence in wine_sentences_tokenized
]

# Flatten to one long word list so the normalizer is invoked a single time.
words_in_corpus = [word for sentence in all_corpus_by_word for word in sentence]
terms = term_normalizer.normalize_ingredients(words_in_corpus)

# Persist the distinct words in first-seen order (dict.fromkeys de-duplicates
# while preserving order).
# NOTE(review): this writes the raw words, not the normalized `terms` - confirm
# that this is the intended content of wine_terms.csv.
pd.Series(list(dict.fromkeys(words_in_corpus))).to_csv('../data/produce/wine_terms.csv')

100%|██████████| 115380/115380 [02:22<00:00, 811.42it/s] 


In [16]:
# Cut the flat `terms` list back into per-sentence chunks, using the token
# count of each sentence in `all_corpus_by_word` as the chunk size.
# Assumes `terms` holds exactly one entry per input word, in order - TODO confirm.
normalized_corpus_by_words = []
chunk_start = 0
for sentence_tokens in all_corpus_by_word:
    chunk_end = chunk_start + len(sentence_tokens)
    normalized_corpus_by_words.append(terms[chunk_start:chunk_end])
    chunk_start = chunk_end


In [17]:
# One chunk is appended per sentence, so this first check holds by construction.
assert len(normalized_corpus_by_words) == len(all_corpus_by_word)
# The check that can actually fail: the normalizer must have returned exactly
# one term per input word, otherwise the per-sentence slices are misaligned.
assert len(terms) == sum(len(sentence) for sentence in all_corpus_by_word), (
    'normalized terms are not aligned one-to-one with the tokenized words'
)

In [21]:
# Fit the phrase (bigram) detector on the normalized sentences; min_count=10 is
# passed through to gensim's Phrases.
wine_bigram_model = Phrases(normalized_corpus_by_words, min_count=10)

# Apply the fitted model to each tokenized sentence.
# NOTE(review): the model is fitted on `normalized_corpus_by_words` but applied
# to the un-normalized `all_corpus_by_word` - confirm this is intentional.
wine_bigrams = []
for sentence_tokens in all_corpus_by_word:
    wine_bigrams.append(wine_bigram_model[sentence_tokens])

In [22]:
# Preview only the first few sentences; displaying the whole nested list
# floods the notebook output.
wine_bigrams[:10]

[['marmalade', 'toffee', 'vanilla', 'tonka', 'bean', 'swirl', 'nose'],
 ['rich',
  'smooth',
  'rounded',
  'mellow',
  'yet',
  'incredibly',
  'bright',
  'aromatic',
  'core',
  'fine',
  'citrus'],
 ['think', 'dried', 'candied', 'orange_peel', 'candied', 'blood', 'orange'],
 ['magical',
  'balance',
  'citrus',
  'freshness',
  'strikes',
  'cushioning',
  'gentle',
  'vanilla'],
 ['textural', 'bliss', 'aromatic', 'fireworks'],
 ['drink', 'least'],
 ['touch',
  'creaminess',
  'nose',
  'leads',
  'mellow',
  'palate',
  'full',
  'honeyed',
  'baked',
  'apple',
  'fruit'],
 ['hint',
  'vanilla',
  'bright',
  'almost',
  'piercing',
  'gleam',
  'wonderful',
  'lemon',
  'sharpness',
  'putting',
  'creamy',
  'decadence',
  'palate',
  'even',
  'focus'],
 ['ultimate', 'concentration', 'finishes', 'zesty', 'vigor'],
 ['built', 'last'],
 ['drink', 'least'],
 ['smokiness',
  'lies',
  'thickly',
  'apple',
  'fruit',
  'notes',
  'heady',
  'tba',
  'glossing',
  'everything',
  '

In [None]:
def extract_term_frequeuncies_from_bigrams(wine_bigrams):
    """Count how often each term occurs and rank the terms by frequency.

    Args:
        wine_bigrams: Iterable of sentences, where each sentence is an iterable
            of hashable terms (strings in this notebook).

    Returns:
        list[tuple]: ``(term, count)`` pairs sorted by count, highest first.
        Terms with equal counts keep the order in which they first appeared.
        An empty input yields an empty list.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # Counter does the tallying in a single pass; most_common() with no
    # argument returns every (term, count) pair sorted by descending count.
    term_counts = Counter(term for sentence in wine_bigrams for term in sentence)
    return term_counts.most_common()


# Rank every term (unigram or detected bigram) by how often it occurs.
term_frequencies = extract_term_frequeuncies_from_bigrams(wine_bigrams)

In [None]:
# Show only the most frequent terms; the full ranking is too long to display.
term_frequencies[:25]

In [None]:
# Report how many terms fall into each frequency bucket.
# Each entry is (exclusive lower bound, exclusive upper bound, label from, label to).
# NOTE(review): the first bucket excludes a count of exactly 15, whereas the
# other two include their labelled lower bound (100 and 300) - confirm intent.
frequency_buckets = (
    (15, 100, 15, 100),
    (99, 300, 100, 300),
    (299, 600, 300, 600),
)
for lower_excl, upper_excl, label_from, label_to in frequency_buckets:
    terms_in_bucket = [
        elem for elem in term_frequencies if lower_excl < elem[1] < upper_excl
    ]
    print(f'In total found from {label_from} to {label_to}: {len(terms_in_bucket)} ingredients')

In [None]:
# List the (term, count) pairs whose count is strictly between 15 and 100.
mid_frequency_terms = [elem for elem in term_frequencies if 15 < elem[1] < 100]
print(f'In total found from 15 to 100: {mid_frequency_terms} ingredients')

In [None]:

def normalize_wine_reviews(reviews):
    """Normalize each review with ``RecipeNormalizer.normalize_instruction``.

    Args:
        reviews: Sized iterable of reviews (``len()`` is used for the progress
            bar). Each item may be a string (one review, lower-cased before
            normalization), a list/tuple of strings (each stripped of
            surrounding whitespace), or a missing value (``None`` / NaN / NA).

    Returns:
        list: One entry per input review, in input order. Missing reviews map
        to ``None``; every other entry is whatever ``normalize_instruction``
        returns for that review.

    Raises:
        TypeError: If a review is neither text, a list/tuple, nor missing.
    """
    instruction_normalizer = RecipeNormalizer()
    normalized_instructions = []
    for instructions in tqdm(reviews, total=len(reviews)):
        if isinstance(instructions, str):
            # isinstance (rather than an exact type comparison) also routes str
            # subclasses here instead of letting them reach the fallback branch.
            instruction_text = [instructions.lower()]
        elif isinstance(instructions, (list, tuple)):
            # Already split into steps: use the items directly. The previous
            # fallback passed non-string values to eval(), which either raised
            # or executed data as code.
            instruction_text = [step.strip() for step in instructions]
        else:
            # `x is np.nan` is an identity test and misses None and NaN objects
            # other than the np.nan singleton; pd.isna covers all of them. For
            # array-like input pd.isna returns an array, hence the ndim guard.
            missing = pd.isna(instructions)
            if np.ndim(missing) == 0 and bool(missing):
                normalized_instructions.append(None)
                continue
            raise TypeError(
                f'Unsupported review type: {type(instructions).__name__}'
            )

        normalized_instructions.append(
            instruction_normalizer.normalize_instruction(
                instruction_text,
            )
        )
    return normalized_instructions


In [None]:
# Normalize only the first 500 descriptions.
reviews = wine_dataframe['Description'].to_numpy()[:500]
clean_reviews = normalize_wine_reviews(reviews)

In [None]:
# Spot-check every 100th normalized review.
clean_reviews[::100]