In [6]:
import os
import pandas as pd
import numpy as np
import spacy


from matplotlib import pyplot as plt
from tqdm import tqdm
from helpers.prep.normalization import RecipeNormalizer
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import nltk
import string


# nltk.download('stopwords')

# Directory with the wine extract files, relative to the notebook's working
# directory. In this notebook it is only referenced by the commented-out
# merge_all_wine_files helper.
BASE_PATH = "../data/wine_extracts"



In [None]:
# def merge_all_wine_files():
#     i = 0
#     wine_dataframe = pd.DataFrame()
#     for file in os.listdir(BASE_PATH):
#         file_location = BASE_PATH + "/" + str(file)
#         if i == 0:
#             wine_dataframe = pd.read_csv(file_location)
#             i += 1
#         else:
#             df_to_append = pd.read_csv(
#                 file_location, low_memory=False, encoding="latin-1"
#             )
#             wine_dataframe = pd.concat([wine_dataframe, df_to_append], axis=0)

#     wine_dataframe.drop_duplicates(subset=["Name"], inplace=True)

#     geographies = ["Subregion", "Region", "Province", "Country"]

#     for geo in geographies:
#         wine_dataframe[geo] = wine_dataframe[geo].apply(lambda x: str(x).strip())

#     return wine_dataframe


In [None]:
# wine_dataframe = merge_all_wine_files()

In [None]:
# Disabled export step, kept for reference:
# wine_dataframe.to_csv('../data/produce/wine_data.csv', index=False)

# Load the wine table from CSV (presumably the file produced by the export
# above -- confirm before relying on its schema).
wine_dataframe = pd.read_csv('../data/produce/wine_data.csv')

In [None]:
# Preview the first three rows of the wine table.
wine_dataframe.head(3)

In [None]:
# List the column labels of the wine table.
wine_dataframe.columns

In [None]:
# Show every field of the row at positional index 2 as a sample record.
wine_dataframe.iloc[2]

In [None]:
def tokenize_corpus(all_wine_corpus):
    """Split a text blob into sentences of filtered, normalized terms.

    The text is sentence-tokenized, each sentence is word-tokenized, and
    tokens that are English stop words or not purely alphabetic are dropped.
    The surviving tokens of all sentences are normalized in a single
    ``RecipeNormalizer.normalize_ingredients`` call and then cut back into
    per-sentence lists using the filtered sentence lengths.

    Args:
        all_wine_corpus: The corpus as one string.

    Returns:
        A list with one entry per sentence, each entry being the slice of
        normalized terms that corresponds to that sentence.
    """
    sentences = sent_tokenize(all_wine_corpus)
    english_stop_words = set(stopwords.words("english"))
    normalizer = RecipeNormalizer()

    # Keep alphabetic, non-stop-word tokens; sentence boundaries are preserved.
    filtered_sentences = [
        [
            token
            for token in word_tokenize(sentence)
            if token.isalpha() and token not in english_stop_words
        ]
        for sentence in sentences
    ]

    # Normalize everything in one batch call.
    flat_tokens = [token for tokens in filtered_sentences for token in tokens]
    normalized_terms = normalizer.normalize_ingredients(flat_tokens)

    # Slice the flat result back into sentences.
    # NOTE(review): this assumes normalize_ingredients returns exactly one
    # item per input token, in input order -- confirm against RecipeNormalizer.
    regrouped = []
    start = 0
    for tokens in filtered_sentences:
        stop = start + len(tokens)
        regrouped.append(normalized_terms[start:stop])
        start = stop

    return regrouped

In [None]:
# Join the first 5000 descriptions into one lower-cased text blob.
# str() is applied to every entry, so non-string values end up in the corpus
# as their string form.
all_wine_corpus = ' '.join(
    str(description).lower()
    for description in wine_dataframe.Description.to_numpy()[:5000]
)
wine_sentences_tokenized = sent_tokenize(all_wine_corpus)
stop_words = set(stopwords.words('english'))
term_normalizer = RecipeNormalizer()

# One token list per sentence, keeping alphabetic tokens that are not stop words.
all_corpus_by_word = [
    [
        token
        for token in word_tokenize(sentence)
        if token.isalpha() and token not in stop_words
    ]
    for sentence in wine_sentences_tokenized
]

# Flatten so the normalizer is called once for the whole corpus.
words_in_corpus = [token for tokens in all_corpus_by_word for token in tokens]
terms = term_normalizer.normalize_ingredients(words_in_corpus)


In [None]:
# Cut the flat `terms` list back into per-sentence lists, using the number of
# tokens each sentence contributed to the flattened input.
# NOTE(review): this assumes `terms` is aligned one-to-one with the flattened
# words -- confirm against RecipeNormalizer.normalize_ingredients.
sentence_lengths = [len(tokens) for tokens in all_corpus_by_word]

normalized_corpus_by_words = []
offset = 0
for sentence_length in sentence_lengths:
    normalized_corpus_by_words.append(terms[offset:offset + sentence_length])
    offset += sentence_length


In [None]:
# Sanity check: the regrouped corpus has one entry per tokenized sentence.
# NOTE(review): the regrouping loop appends exactly once per sentence, so this
# holds by construction; it does not check that `terms` contains one item per
# word in `words_in_corpus`.
assert len(normalized_corpus_by_words) == len(all_corpus_by_word)

In [None]:
# Learn bigram collocations from the normalized sentences; min_count=10 makes
# the model ignore words and bigrams seen fewer than 10 times.
wine_bigram_model = Phrases(normalized_corpus_by_words, min_count=10)

# Apply the model to the same normalized sentences it was trained on. The
# un-normalized tokens in `all_corpus_by_word` do not share the model's
# vocabulary, so collocations learned above would be missed and the
# normalization step would be discarded.
wine_bigrams = [wine_bigram_model[line] for line in normalized_corpus_by_words]

In [None]:
# Inspect the bigram-merged sentences.
# NOTE(review): this renders the entire list; a slice would keep the saved
# notebook smaller.
wine_bigrams

In [None]:
def extract_term_frequeuncies_from_bigrams(wine_bigrams):
    """Count how often each term occurs across all sentences.

    Args:
        wine_bigrams: Iterable of sentences, each an iterable of hashable terms.

    Returns:
        A list of ``(term, count)`` tuples ordered by descending count; terms
        with equal counts keep the order in which they first appeared.
    """
    occurrences = {}
    for sentence in wine_bigrams:
        for term in sentence:
            occurrences[term] = occurrences.get(term, 0) + 1

    # sorted() is stable, also with reverse=True, so ties stay in first-seen order.
    return sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)


# (term, count) pairs for the bigram-merged corpus, most frequent first.
term_frequencies = extract_term_frequeuncies_from_bigrams(wine_bigrams)

In [None]:
# Inspect the sorted (term, count) pairs.
# NOTE(review): this renders the entire list; a slice would keep the saved
# notebook smaller.
term_frequencies

In [None]:
# Count how many terms fall into each frequency bucket.
# The first bucket excludes a count of exactly 15, while the other two include
# their lower bound (100 and 300); every bucket excludes its upper bound.
term_counts = [elem[1] for elem in term_frequencies]
found_15_to_100 = sum(1 for count in term_counts if 15 < count < 100)
found_100_to_300 = sum(1 for count in term_counts if 99 < count < 300)
found_300_to_600 = sum(1 for count in term_counts if 299 < count < 600)

print(f'In total found from 15 to 100: {found_15_to_100} ingredients')
print(f'In total found from 100 to 300: {found_100_to_300} ingredients')
print(f'In total found from 300 to 600: {found_300_to_600} ingredients')

In [None]:
# Print the (term, count) pairs whose count lies strictly between 15 and 100.
terms_15_to_100 = [elem for elem in term_frequencies if 15 < elem[1] < 100]
print(f'In total found from 15 to 100: {terms_15_to_100} ingredients')

In [None]:

def normalize_wine_reviews(reviews):
    """Run every review through ``RecipeNormalizer.normalize_instruction``.

    Args:
        reviews: Sized iterable of reviews (``len()`` is used for the progress
            bar). Each entry is a string, a missing value (``None`` or a float
            NaN), or another object that is handed to ``eval`` to obtain an
            iterable of step strings.

    Returns:
        A list with one entry per review, in input order: ``None`` for missing
        reviews, otherwise the value returned by ``normalize_instruction`` for
        the list of text steps built from the review.
    """
    instruction_normalizer = RecipeNormalizer()
    normalized_instructions = []

    for instructions in tqdm(reviews, total=len(reviews)):
        # Missing text is a float NaN that need not be the `np.nan` singleton,
        # so test by value instead of identity; None counts as missing too.
        is_missing = instructions is None or (
            isinstance(instructions, float) and np.isnan(instructions)
        )
        if is_missing:
            normalized_instructions.append(None)
            continue

        # isinstance() also accepts str subclasses, which the previous exact
        # type comparison sent into the eval() branch below.
        if isinstance(instructions, str):
            instruction_text = [instructions.lower()]
        else:
            # SECURITY NOTE(review): eval() executes arbitrary code. It only
            # accepts str, bytes or code objects, and str inputs never reach
            # this branch, so it looks like a leftover; if serialized lists are
            # really expected here, ast.literal_eval would be the safe choice.
            instruction_text = [step.strip() for step in eval(instructions)]

        normalized_instructions.append(
            instruction_normalizer.normalize_instruction(instruction_text)
        )

    return normalized_instructions


In [None]:
# Normalize only the first 500 descriptions as a sample.
reviews = wine_dataframe.Description.to_numpy()[:500]
clean_reviews = normalize_wine_reviews(reviews)

In [None]:
# Spot-check every 100th normalized review.
clean_reviews[::100]

In [7]:
# Load the descriptor mapping and discard the 'level_2', 'level_1' and 'type'
# columns.
wine_descriptors_df = pd.read_csv(
    '../data/wine_extracts/descriptor_mapping.csv'
).drop(columns=['level_2', 'level_1', 'type'])

In [9]:
raw_descriptor = wine_descriptors_df['raw descriptor'].to_numpy()
level_3 = wine_descriptors_df.level_3.to_numpy()

# Map every level_3 label (underscores replaced by spaces) to itself.
# NOTE(review): `raw_descriptor` does not contribute to the mapping. If the
# intent was raw descriptor -> level_3 label, the dict key should come from
# `raw_descriptor` instead -- confirm before changing, since the mapping is
# written to wine_mapping_values.py further down.
descriptors = {
    label: label
    for label in (value.replace('_', ' ') for value in level_3)
}

In [10]:
# Inspect the descriptor mapping built above.
descriptors

{'abrasive': 'abrasive',
 'acacia': 'acacia',
 'acid driven': 'acid driven',
 'aggressive': 'aggressive',
 'airy': 'airy',
 'allspice': 'allspice',
 'almond': 'almond',
 'alpine herbs': 'alpine herbs',
 'american oak': 'american oak',
 'angular': 'angular',
 'anise': 'anise',
 'apple': 'apple',
 'apple blossom': 'apple blossom',
 'apple pie': 'apple pie',
 'apple sauce': 'apple sauce',
 'apricot': 'apricot',
 'ash': 'ash',
 'asian spice': 'asian spice',
 'asparagus': 'asparagus',
 'asphalt': 'asphalt',
 'assertive': 'assertive',
 'astringent': 'astringent',
 'austere': 'austere',
 'bacon': 'bacon',
 'baked': 'baked',
 'baked apple': 'baked apple',
 'baked bread': 'baked bread',
 'baking spices': 'baking spices',
 'balsamic': 'balsamic',
 'banana': 'banana',
 'band-aid': 'band-aid',
 'barbecue': 'barbecue',
 'bark': 'bark',
 'barnyard': 'barnyard',
 'basil': 'basil',
 'bay leaf': 'bay leaf',
 'beef': 'beef',
 'beef jerky': 'beef jerky',
 'beefy': 'beefy',
 'beeswax': 'beeswax',
 'beet':

In [11]:
# Fresh normalizer instance; this rebinds the `term_normalizer` name that the
# corpus-preparation cell also assigns.
term_normalizer = RecipeNormalizer()
# Pass the descriptor mapping to the normalizer together with a target module
# path (relative to the current working directory).
# NOTE(review): presumably this writes the mapping into that file and, with
# append_ingredients=False, replaces rather than extends existing entries --
# confirm against RecipeNormalizer.read_and_write_ingredients.
term_normalizer.read_and_write_ingredients(descriptors, './helpers/prep/wine_mapping_values.py', append_ingredients=False)

{'abrasive': 'abrasive',
 'acacia': 'acacia',
 'acid driven': 'acid driven',
 'aggressive': 'aggressive',
 'airy': 'airy',
 'allspice': 'allspice',
 'almond': 'almond',
 'alpine herbs': 'alpine herbs',
 'american oak': 'american oak',
 'angular': 'angular',
 'anise': 'anise',
 'apple': 'apple',
 'apple blossom': 'apple blossom',
 'apple pie': 'apple pie',
 'apple sauce': 'apple sauce',
 'apricot': 'apricot',
 'ash': 'ash',
 'asian spice': 'asian spice',
 'asparagus': 'asparagus',
 'asphalt': 'asphalt',
 'assertive': 'assertive',
 'astringent': 'astringent',
 'austere': 'austere',
 'bacon': 'bacon',
 'baked': 'baked',
 'baked apple': 'baked apple',
 'baked bread': 'baked bread',
 'baking spices': 'baking spices',
 'balsamic': 'balsamic',
 'banana': 'banana',
 'band-aid': 'band-aid',
 'barbecue': 'barbecue',
 'bark': 'bark',
 'barnyard': 'barnyard',
 'basil': 'basil',
 'bay leaf': 'bay leaf',
 'beef': 'beef',
 'beef jerky': 'beef jerky',
 'beefy': 'beefy',
 'beeswax': 'beeswax',
 'beet':