## Cleaning and Joining Data Sets

### Import Packages

In [None]:
import itertools
import pickle
from collections import Counter
from datetime import date
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Open and Join Data Sources

In [None]:
# Load the scraped recipe records from disk.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open('Data Sets/Recipe_Data_Scrape.pickle', 'rb') as scrape_file:
    recipe_dictionary = pickle.load(scrape_file)

In [None]:
# Fields taken from the scraped records for the main recipe table.
recipe_columns = [
    'name',
    'number_of_steps',
    'number_of_ratings',
    'rating_value',
    'author',
    'recipe_time',
    'number_of_ingredients',
    'number_of_servings',
    'image_link',
]
recipe_data = pd.DataFrame(recipe_dictionary, columns=recipe_columns)

# Print column dtypes and non-null counts as a quick sanity check.
recipe_data.info()

In [None]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result has to be assigned back for the image_link column to actually go away.
recipe_data = recipe_data.drop(columns=['image_link'])

# Show how many recipes fall under each distinct rating value.
recipe_data['rating_value'].value_counts()

In [None]:
# Load the previously cleaned recipe start-date data.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open('Data Sets/Cleaned_Date_Data.pickle', 'rb') as dates_file:
    recipe_start_dates = pickle.load(dates_file)


In [None]:
# Attach the start dates to the recipe table. DataFrame.join aligns rows on the
# index; any column name present in both frames gets an '_other' suffix on the
# date-table side.
full_recipe_data = recipe_data.join(
    recipe_start_dates,
    rsuffix='_other',
)

# Preview the first five joined rows.
full_recipe_data.head(n=5)

### Find Top Categories and Keywords

In [None]:
# One row per recipe: its name plus the raw comma-separated category string.
categories = pd.DataFrame(recipe_dictionary, columns=['name', 'recipe_categories'])

# Split every recipe's category string on commas and trim surrounding
# whitespace, giving one flat, alphabetically sorted list of tags
# (duplicates kept so they can be counted).
category_tags = sorted(
    tag.strip()
    for entry in categories['recipe_categories'].tolist()
    for tag in entry.split(',')
)

# Alphabetical list of the distinct category tags.
res = sorted(set(category_tags))

# (tag, occurrence count) pairs. Counter tallies in a single pass instead of
# calling list.count once per distinct tag, and because it is fed the sorted
# list the pairs come out in alphabetical order, so ties keep a reproducible
# order when the list is later sorted by count.
test = list(Counter(category_tags).items())


In [None]:
# Function to find top categories and keywords
def Sort_Tuple(tup):
    """Sort a list of tuples in place by their second item, largest first.

    The list passed in is reordered and that same list object is returned.
    Items with equal second elements keep their original relative order.

    Adapted from
    https://www.geeksforgeeks.org/python-program-to-sort-a-list-of-tuples-by-second-item/
    """
    def second_item(pair):
        return pair[1]

    tup.sort(key=second_item, reverse=True)
    return tup

# Show the 26 most frequent categories as (category, count) pairs, highest
# count first. Note that Sort_Tuple also reorders `test` itself in place.
Sort_Tuple(test)[:26]

In [None]:
# One row per recipe: its name plus the raw comma-separated keyword string.
keywords = pd.DataFrame(recipe_dictionary, columns=['name', 'recipe_keywords'])

# Split every recipe's keyword string on commas and trim surrounding
# whitespace, giving one flat, alphabetically sorted list of keywords
# (duplicates kept so they can be counted).
keyword_tags = sorted(
    tag.strip()
    for entry in keywords['recipe_keywords'].tolist()
    for tag in entry.split(',')
)

# Alphabetical list of the distinct keywords.
kw = sorted(set(keyword_tags))

# (keyword, occurrence count) pairs. Counter tallies in a single pass instead
# of calling list.count once per distinct keyword, and because it is fed the
# sorted list the pairs come out in alphabetical order, so ties keep a
# reproducible order after sorting by count.
test2 = list(Counter(keyword_tags).items())

# Show the 26 most frequent keywords, highest count first.
Sort_Tuple(test2)[:26]

### Add Top Categories and Keywords to Data Set

In [None]:
# New boolean column name -> text to look for inside each recipe's category
# string. One flag column is added per entry, in this order.
category_flags = {
    'main_course': 'main course',
    'dinner': 'dinner',
    'side_dish': 'side dish',
    'easy': 'easy',
    'dessert': 'dessert',
    'quick': 'quick',
    'weekday': 'weekday',
    'appetizer': 'appetizer',
    'lunch': 'lunch',
}
for flag_column, search_text in category_flags.items():
    categories[flag_column] = categories['recipe_categories'].str.contains(search_text)

# Preview the new flag columns.
categories.head()

In [None]:
# Add one boolean column per keyword of interest; each is True where the
# recipe's keyword string contains that word. Column name and search text
# are the same here.
for keyword_flag in ('vegetarian', 'fall', 'winter', 'summer'):
    keywords[keyword_flag] = keywords['recipe_keywords'].str.contains(keyword_flag)

# Preview the new flag columns.
keywords.head()

### Join and Clean Data Sets 

In [None]:
# Add the category columns to the recipe table (index-aligned join). Columns
# whose names clash, such as 'name', come through from `categories` with a
# '_cat' suffix.
full_recipe_data_cat = full_recipe_data.join(
    categories,
    rsuffix='_cat',
)
full_recipe_data_cat.info()

In [None]:
# Add the keyword columns on top of the category-enriched table (index-aligned
# join). Columns whose names clash, such as 'name', come through from
# `keywords` with a '_key' suffix.
full_recipe_data_key = full_recipe_data_cat.join(
    keywords,
    rsuffix='_key',
)
full_recipe_data_key.info()

In [None]:
# Age of each recipe in whole days, measured from the moment this cell runs.
# pd.Timestamp.now() is used instead of pd.to_datetime("now") because parsing
# the string "now" has meant different things across pandas releases (UTC vs.
# local time), while Timestamp.now() is always the current local time.
# NOTE(review): assumes recipe_start_date holds timezone-naive datetimes — confirm.
full_recipe_data_key['days_ago'] = (
    pd.Timestamp.now() - full_recipe_data_key['recipe_start_date']
).dt.days
full_recipe_data_key['days_ago']

In [None]:
# Display the distinct raw recipe_time values (inspected before the parsing
# step in the next cell).
full_recipe_data_key['recipe_time'].unique()

In [None]:
# Convert the raw recipe_time strings into a total number of minutes.
# NOTE(review): the values are presumably ISO 8601 style durations such as
# 'PT1H30M' or 'PT45M' — confirm against the unique() output above.

# Everything after the 'T' separator, e.g. '1H30M' (NaN when there is no 'T').
time_part = full_recipe_data_key['recipe_time'].str.split('T').str[1]

# Pull the optional "<n>H" and "<n>M" components out in one pass. A component
# that is absent comes back as NaN; anything other than hours and minutes is
# ignored. Parsing this way avoids building mixed str/int columns with
# np.where, on which the .str accessor fails when a column ends up holding no
# strings (for example when no recipe has an hour component).
duration_parts = time_part.str.extract(r'^(?:(\d+(?:\.\d+)?)H)?(?:(\d+(?:\.\d+)?)M)?')
hours = pd.to_numeric(duration_parts[0])
minutes = pd.to_numeric(duration_parts[1])
has_hours = hours.notna()

# The intermediate columns are still written because a later cleanup cell
# drops them by name.
full_recipe_data_key['recipe_time_new'] = time_part
# Hours default to 0 when only minutes are given, but stay NaN when the
# duration itself is missing so the total is NaN rather than a misleading 0.
full_recipe_data_key['recipe_time_hour'] = hours.fillna(0).where(time_part.notna())
# Minutes that accompany an hour component (0 otherwise).
full_recipe_data_key['recipe_time_min_w_hour'] = minutes.where(has_hours, 0).fillna(0)
# Minutes from durations that have no hour component (0 otherwise).
full_recipe_data_key['recipe_time_min_no_hour'] = minutes.where(~has_hours, 0).fillna(0)

full_recipe_data_key['recipe_time_in_min'] = (
    60 * full_recipe_data_key['recipe_time_hour']
    + full_recipe_data_key['recipe_time_min_w_hour']
    + full_recipe_data_key['recipe_time_min_no_hour']
)


In [None]:
# Remove the duplicate name columns produced by the '_cat' / '_key' joins and
# the scratch columns that were only needed while computing recipe_time_in_min.
columns_to_remove = [
    'name_cat',
    'name_key',
    'recipe_time_new',
    'recipe_time_hour',
    'recipe_time_min_w_hour',
    'recipe_time_min_no_hour',
]
full_recipe_data_key.drop(columns=columns_to_remove, inplace=True)
full_recipe_data_key.info()

### Export Data Set

In [None]:
# Write the fully joined and cleaned table to disk as a pickle.
with open('Data Sets/Final_Data_Set.pickle', 'wb') as output_file:
    pickle.dump(full_recipe_data_key, output_file)