In [1]:
import os
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

from helpers.prep.normalization import RecipeNormalizer


# nltk.download('stopwords')

BASE_PATH = "../data/wine_extracts"



In [None]:
# def merge_all_wine_files():
#     i = 0
#     wine_dataframe = pd.DataFrame()
#     for file in os.listdir(BASE_PATH):
#         file_location = BASE_PATH + "/" + str(file)
#         if i == 0:
#             wine_dataframe = pd.read_csv(file_location)
#             i += 1
#         else:
#             df_to_append = pd.read_csv(
#                 file_location, low_memory=False, encoding="latin-1"
#             )
#             wine_dataframe = pd.concat([wine_dataframe, df_to_append], axis=0)

#     wine_dataframe.drop_duplicates(subset=["Name"], inplace=True)

#     geographies = ["Subregion", "Region", "Province", "Country"]

#     for geo in geographies:
#         wine_dataframe[geo] = wine_dataframe[geo].apply(lambda x: str(x).strip())

#     return wine_dataframe


In [None]:
# wine_dataframe = merge_all_wine_files()

In [2]:
# Export of the merged frame to the CSV read below; left commented out
# (presumably a one-off step -- confirm before re-enabling).
# wine_dataframe.to_csv('../data/produce/wine_data.csv', index=False)

# Load the wine data from the CSV under ../data/produce.
wine_dataframe = pd.read_csv('../data/produce/wine_data.csv')

In [None]:
wine_dataframe.head(3)

In [3]:
wine_dataframe.columns

Index(['Alcohol', 'Appellation', 'Bottle Size', 'Category', 'Country',
       'Date Published', 'Description', 'Designation', 'Importer', 'Name',
       'Price', 'Province', 'Rating', 'Region', 'Reviewer',
       'Reviewer Twitter Handle', 'Subregion', 'User Avg Rating', 'Variety',
       'Vintage', 'Winery'],
      dtype='object')

In [None]:
wine_dataframe.iloc[2]

In [None]:
def tokenize_corpus(all_wine_corpus):
    """Tokenize a text blob into per-sentence lists of normalized terms.

    The text is split into sentences, each sentence into word tokens, and
    tokens that are English stop words or not purely alphabetic are dropped.
    The surviving tokens of all sentences are flattened, passed through
    ``RecipeNormalizer.normalize_ingredients`` in a single call, and the
    result is sliced back into chunks matching each sentence's token count.

    Args:
        all_wine_corpus: The text to tokenize, as a single string. Stop-word
            matching is case-sensitive and no lower-casing happens here.

    Returns:
        A list with one entry per sentence; each entry is the slice of the
        normalizer output that corresponds to that sentence's tokens.
        NOTE(review): the slicing assumes ``normalize_ingredients`` returns
        exactly one term per input word, in order -- confirm.
    """
    raw_sentences = sent_tokenize(all_wine_corpus)
    english_stop_words = set(stopwords.words("english"))
    normalizer = RecipeNormalizer()

    # One token list per sentence, keeping only alphabetic non-stop-words.
    tokens_per_sentence = [
        [
            token
            for token in word_tokenize(raw_sentence)
            if token not in english_stop_words and token.isalpha()
        ]
        for raw_sentence in raw_sentences
    ]

    # Normalize everything in one call, then restore the sentence grouping.
    flat_tokens = [token for tokens in tokens_per_sentence for token in tokens]
    normalized_terms = normalizer.normalize_ingredients(flat_tokens)

    regrouped = []
    chunk_start = 0
    for tokens in tokens_per_sentence:
        chunk_end = chunk_start + len(tokens)
        regrouped.append(normalized_terms[chunk_start:chunk_end])
        chunk_start = chunk_end

    return regrouped

In [None]:
# Join the first 5000 descriptions into one lower-cased text blob.
all_wine_corpus = ' '.join(
    str(description).lower()
    for description in wine_dataframe.Description.to_numpy()[:5000]
)
wine_sentences_tokenized = sent_tokenize(all_wine_corpus)
stop_words = set(stopwords.words('english'))
term_normalizer = RecipeNormalizer()

# One token list per sentence: alphabetic tokens that are not stop words.
all_corpus_by_word = [
    [
        token
        for token in word_tokenize(tokenized_sentence)
        if token not in stop_words and token.isalpha()
    ]
    for tokenized_sentence in wine_sentences_tokenized
]

# Flatten so the normalizer sees the whole vocabulary in a single call.
words_in_corpus = [token for tokens in all_corpus_by_word for token in tokens]
terms = term_normalizer.normalize_ingredients(words_in_corpus)


In [None]:
# Slice the flat `terms` list back into chunks whose sizes match the token
# count of each sentence in `all_corpus_by_word`.
normalized_corpus_by_words = []
chunk_start = 0
for sentence_tokens in all_corpus_by_word:
    chunk_end = chunk_start + len(sentence_tokens)
    normalized_corpus_by_words.append(terms[chunk_start:chunk_end])
    chunk_start = chunk_end


In [None]:
# Sanity check: the regrouped corpus has one entry per tokenized sentence.
# NOTE(review): the regrouping loop appends once per sentence, so this holds
# by construction; it does not verify that the normalizer returned one term
# per input word.
assert len(normalized_corpus_by_words) == len(all_corpus_by_word)

In [None]:
# Learn frequent two-word collocations from the normalized sentences
# (min_count=10 is passed to gensim's Phrases).
wine_bigram_model = Phrases(normalized_corpus_by_words, min_count=10)

# Apply the model to the same normalized sentences it was trained on.
# Previously it was applied to `all_corpus_by_word` (the un-normalized
# tokens), so any phrase learned from a normalized spelling could not match
# wherever normalization had changed a token.
wine_bigrams = [wine_bigram_model[line] for line in normalized_corpus_by_words]

In [None]:
wine_bigrams

In [None]:
def extract_term_frequeuncies_from_bigrams(wine_bigrams):
    """Count how often each term occurs across all sentences.

    Args:
        wine_bigrams: Iterable of sentences, each an iterable of hashable
            terms.

    Returns:
        A list of ``(term, count)`` tuples sorted by count, highest first.
        Terms with equal counts keep the order in which they were first
        encountered. An empty input yields an empty list.
    """
    # Counter does the counting in one pass; most_common() with no argument
    # returns every item sorted by descending count with first-seen
    # tie-breaking, which matches a stable sorted(..., reverse=True).
    term_counts = Counter(term for sentence in wine_bigrams for term in sentence)
    return term_counts.most_common()


# (term, count) pairs for the phrase-merged corpus, most frequent first.
term_frequencies = extract_term_frequeuncies_from_bigrams(wine_bigrams)

In [None]:
term_frequencies

In [None]:
# Report how many terms fall into each frequency bucket.
# Each triple is (label low, label high / exclusive ceiling, exclusive floor).
# NOTE(review): the first bucket excludes a count of exactly 15, whereas the
# other two include their labelled lower bound (100 and 300) -- confirm
# whether that asymmetry is intended.
for label_low, label_high, exclusive_floor in ((15, 100, 15), (100, 300, 99), (300, 600, 299)):
    bucket_size = len(
        [elem for elem in term_frequencies if exclusive_floor < elem[1] < label_high]
    )
    print(f'In total found from {label_low} to {label_high}: {bucket_size} ingredients')

In [None]:
# Print the full list of (term, count) pairs whose count is strictly between
# 15 and 100 (both bounds exclusive), embedded in the message text.
print(f'In total found from 15 to 100: {[elem for elem in term_frequencies if elem[1] > 15 and elem[1] < 100]} ingredients')

In [None]:

def normalize_wine_reviews(reviews):
    """Normalize each review with ``RecipeNormalizer.normalize_instruction``.

    Args:
        reviews: Sized iterable of reviews (``len()`` is used for the
            progress bar). Each item may be:
            * a string -- treated as one review and lower-cased;
            * a missing scalar (``None``, NaN, ``pd.NA``) -- kept as ``None``;
            * any other iterable of strings -- each element is stripped and
              the elements are passed on as separate steps.

    Returns:
        A list with one entry per input review, in order: ``None`` for
        missing values, otherwise whatever ``normalize_instruction`` returns
        for that review's list of text steps.
    """
    normalized_instructions = []
    instruction_normalizer = RecipeNormalizer()
    for instructions in tqdm(reviews, total=len(reviews)):
        # `x is np.nan` only matches the np.nan singleton; pd.isna also
        # catches None and NaN objects created elsewhere. The is_scalar guard
        # keeps list-like items from producing an element-wise array here.
        if pd.api.types.is_scalar(instructions) and pd.isna(instructions):
            normalized_instructions.append(None)
            continue

        if isinstance(instructions, str):
            instruction_text = [instructions.lower()]
        else:
            # The previous code called eval() here, but this branch is only
            # reached for non-string items, for which eval() raises TypeError
            # (it accepts only str, bytes or code objects). Iterate the steps
            # directly instead.
            instruction_text = [step.strip() for step in instructions]

        normalized_instructions.append(
            instruction_normalizer.normalize_instruction(instruction_text)
        )
    return normalized_instructions


In [None]:
# Normalize only the first 500 descriptions (a sample, not the full column).
reviews = wine_dataframe.Description.to_numpy()[:500]
clean_reviews = normalize_wine_reviews(reviews)

In [None]:
clean_reviews[::100]

In [2]:
# Load the descriptor-to-taste mapping (latin1-encoded CSV) and drop the
# level_2 and level_1 columns in the same expression.
wine_descriptors_df = (
    pd.read_csv('../data/produce/descriptor_mapping_tastes.csv', encoding='latin1')
    .drop(columns=['level_2', 'level_1'])
)

In [3]:
# Pull the relevant columns out as arrays.
# NOTE(review): `raw_descriptor` is extracted but never used to build the
# mapping below -- the keys come from `level_3`. Confirm that keying on the
# level_3 term rather than the raw descriptor is intended.
raw_descriptor = wine_descriptors_df['raw descriptor'].to_numpy()
level_3 = wine_descriptors_df.level_3.to_numpy()
types = wine_descriptors_df.type.to_numpy()
tastes = wine_descriptors_df['primary taste'].to_numpy()

# Map each level_3 term (underscores replaced by spaces) to a tuple of
# (term, type, primary taste). Later rows overwrite earlier rows that share
# the same term.
descriptors = {}
for value, term_type, taste in zip(level_3, types, tastes):
    term = value.replace('_', ' ')
    # pd.isna instead of `is not np.nan`: the identity test only recognises
    # the np.nan singleton and would let other missing markers through.
    descriptors[term] = (term, term_type, 'aroma' if pd.isna(taste) else taste)

In [4]:
descriptors

{'abrasive': ('abrasive', 'nonaroma', 'bitter'),
 'acacia': ('acacia', 'aroma', 'aroma'),
 'acid driven': ('acid driven', 'nonaroma', 'acid'),
 'aggressive': ('aggressive', 'nonaroma', 'acid'),
 'airy': ('airy', 'nonaroma', 'weight'),
 'allspice': ('allspice', 'aroma', 'aroma'),
 'almond': ('almond', 'aroma', 'aroma'),
 'alpine herbs': ('alpine herbs', 'aroma', 'aroma'),
 'american oak': ('american oak', 'aroma', 'aroma'),
 'angular': ('angular', 'nonaroma', 'bitter'),
 'anise': ('anise', 'aroma', 'aroma'),
 'apple': ('apple', 'aroma', 'aroma'),
 'apple blossom': ('apple blossom', 'aroma', 'aroma'),
 'apple pie': ('apple pie', 'aroma', 'aroma'),
 'apple sauce': ('apple sauce', 'aroma', 'aroma'),
 'apricot': ('apricot', 'aroma', 'aroma'),
 'ash': ('ash', 'aroma', 'aroma'),
 'asian spice': ('asian spice', 'aroma', 'aroma'),
 'asparagus': ('asparagus', 'aroma', 'aroma'),
 'asphalt': ('asphalt', 'aroma', 'aroma'),
 'assertive': ('assertive', 'nonaroma', 'acid'),
 'astringent': ('astringent

In [6]:
from helpers.prep.wine_mapping_values import wine_terms_mappings

# Rebuild the mapping as term -> (first, second, third) element of each
# descriptor tuple.
updated_terms_mappings = {
    term: (term_info[0], term_info[1], term_info[2])
    for term, term_info in descriptors.items()
}



In [7]:
# Hand the rebuilt mapping to the project helper together with the target
# module path and variable name. The call is the cell's last expression, so
# its return value is what the notebook displays.
# NOTE(review): presumably this rewrites ./helpers/prep/wine_mapping_values.py
# (append_ingredients=False suggests overwrite rather than append) -- confirm
# against RecipeNormalizer.read_and_write_ingredients. The path is relative
# to the kernel's working directory.
term_normalizer = RecipeNormalizer()
term_normalizer.read_and_write_ingredients(updated_terms_mappings, './helpers/prep/wine_mapping_values.py', append_ingredients=False, variable_name='wine_terms_mappings')

{'abrasive': ('abrasive', 'nonaroma', 'bitter'),
 'acacia': ('acacia', 'aroma', 'aroma'),
 'acid driven': ('acid driven', 'nonaroma', 'acid'),
 'aggressive': ('aggressive', 'nonaroma', 'acid'),
 'airy': ('airy', 'nonaroma', 'weight'),
 'allspice': ('allspice', 'aroma', 'aroma'),
 'almond': ('almond', 'aroma', 'aroma'),
 'alpine herbs': ('alpine herbs', 'aroma', 'aroma'),
 'american oak': ('american oak', 'aroma', 'aroma'),
 'angular': ('angular', 'nonaroma', 'bitter'),
 'anise': ('anise', 'aroma', 'aroma'),
 'apple': ('apple', 'aroma', 'aroma'),
 'apple blossom': ('apple blossom', 'aroma', 'aroma'),
 'apple pie': ('apple pie', 'aroma', 'aroma'),
 'apple sauce': ('apple sauce', 'aroma', 'aroma'),
 'apricot': ('apricot', 'aroma', 'aroma'),
 'ash': ('ash', 'aroma', 'aroma'),
 'asian spice': ('asian spice', 'aroma', 'aroma'),
 'asparagus': ('asparagus', 'aroma', 'aroma'),
 'asphalt': ('asphalt', 'aroma', 'aroma'),
 'assertive': ('assertive', 'nonaroma', 'acid'),
 'astringent': ('astringent

In [10]:
updated_terms_mappings.keys()

dict_keys(['abrasive', 'acacia', 'acid driven', 'aggressive', 'airy', 'allspice', 'almond', 'alpine herbs', 'american oak', 'angular', 'anise', 'apple', 'apple blossom', 'apple pie', 'apple sauce', 'apricot', 'ash', 'asian spice', 'asparagus', 'asphalt', 'assertive', 'astringent', 'austere', 'bacon', 'baked', 'baked apple', 'baked bread', 'baking spices', 'balsamic', 'banana', 'band-aid', 'barbecue', 'bark', 'barnyard', 'basil', 'bay leaf', 'beef', 'beef jerky', 'beefy', 'beeswax', 'beet', 'beetroot', 'bell pepper', 'bergamot', 'berry', 'big-boned', 'bite', 'bitter', 'bitter almond', 'bittersweet', 'black cherry', 'black currant', 'black fruit', 'black pepper', 'black tea', 'blackberry', 'blackberry jam', 'blackberry-cassis', 'blackcurrant', 'bland', 'blood orange', 'blossom', 'blue', 'blue black', 'blue flower', 'blue fruit', 'blueberry', 'blueberry pie', 'bold', 'bone dry', 'botrytis', 'boysenberry', 'bramble', 'brawny', 'bread', 'bread crust', 'brick', 'bright', 'brilliant', 'brine'

In [1]:

import numpy as np

# Scratch check: the element-wise average of a list of equally shaped arrays
# is itself a numpy array.
a = [np.array(row) for row in ([1, 2, 3], [4, 5, 6], [7, 8, 9])]

b = np.mean(a, axis=0)
type(b)

numpy.ndarray

In [4]:
b[0]


4.0