In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np
import pickle
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
user_data = pd.read_csv('data/RAW_interactions.csv')

recipe_data = pd.read_csv('data/RAW_recipes.csv')

recipes_PP = pd.read_csv('data/PP_recipes.csv')

users_PP = pd.read_csv('data/PP_users.csv')

pickled_map = pd.read_pickle('data/ingr_map.pkl')

In [3]:
# Creating a function to perform cleaning steps at once
stopwords_list = stopwords.words('english')

no_bad_chars = re.compile('[!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n - ]')
no_nums = re.compile('[\d-]')

def clean_text(text):
    #text = no_nums.sub('', text) 
    text = no_bad_chars.sub(' ', text) 
    text = text.lower() 
    text = ' '.join(word for word in text.split() if word not in stopwords_list)
    return text


In [4]:
recipe_data['description'] = recipe_data['description'].astype(str)
descr_cleaned = recipe_data['description'].apply(clean_text)

recipe_data['name'] = recipe_data['name'].astype(str)
names_cleaned = recipe_data['name'].apply(clean_text)


In [5]:
# how many recipe names include the word 'vegetarian'?
len(names_cleaned[names_cleaned.str.contains('vegetarian')])


928

In [6]:
# how many recipe descriptions include the word 'vegetarian'?
len(descr_cleaned[descr_cleaned.str.contains('vegetarian')])

3784

In [7]:
# how many recipe names include the word 'vegan'?
len(names_cleaned[names_cleaned.str.contains('vegan')])

1362

In [8]:
# how many recipe descriptions include the word 'vegan'?
len(descr_cleaned[descr_cleaned.str.contains('vegan')])

1909

In [9]:
len(recipe_data[recipe_data['ingredients'].str.contains('vegetarian')])


293

In [10]:
len(recipe_data[recipe_data['ingredients'].str.contains('vegan')])


367

In [11]:
len(recipe_data[recipe_data['ingredients'].str.contains('gluten-free')])


305

In [12]:
len(recipe_data[recipe_data['ingredients'].str.contains('gluten free')])


54

In [13]:
len(recipe_data[recipe_data['name'].str.contains('gluten-free')])


0

In [14]:
len(recipe_data[recipe_data['name'].str.contains('gluten free')])


939

In [15]:
len(recipe_data[recipe_data['description'].str.contains('gluten free')])


571

In [16]:
len(recipe_data[recipe_data['tags'].str.contains('gluten-free')])

5743

In [17]:
len(recipe_data[recipe_data['tags'].str.contains('vegan')])

10012

In [35]:
recipe_data[recipe_data['tags'].str.contains('vegan')]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
6,aww marinated olives,25274,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seeds', 'green olives', 'ripe olives'...",9
19,cream of cauliflower soup vegan,23850,110,3288,2002-03-28,"['lactose', 'weeknight', 'time-to-make', 'cour...","[174.2, 4.0, 24.0, 1.0, 15.0, 1.0, 10.0]",10,['heat the oil or margarine in a soup pot and ...,this is a dairy free,"['canola oil', 'onion', 'garlic', 'cauliflower...",16
21,cream of spinach soup vegan,24701,55,3288,2002-04-08,"['60-minutes-or-less', 'time-to-make', 'course...","[64.8, 3.0, 13.0, 54.0, 4.0, 2.0, 3.0]",10,"['in a 3 qt saucepan over medium high heat , s...",thickened with a mix of cooked oats and vegies...,"['onion', 'scallion', 'apple juice', 'olive oi...",12
26,fool the meat eaters chili,54272,40,40525,2003-02-17,"['60-minutes-or-less', 'time-to-make', 'course...","[295.6, 3.0, 56.0, 76.0, 32.0, 1.0, 18.0]",9,"['rehydrate tvp if needed', 'spray or oil a la...",this recipe was adapted by my mother and mysel...,"['vegetarian ground beef', 'garlic', 'onion', ...",12
61,mock a mole low fat guacamole,81185,15,67026,2004-01-15,"['15-minutes-or-less', 'time-to-make', 'course...","[115.9, 7.0, 21.0, 19.0, 9.0, 3.0, 5.0]",7,['thaw peas by running hot water over them in ...,from crescent dragonwagon's,"['frozen peas', 'hass avocado', 'salt', 'lemon...",7
...,...,...,...,...,...,...,...,...,...,...,...,...
231559,zucchini courgettes sauteed with sun dried to...,413205,40,485109,2010-02-16,"['weeknight', '60-minutes-or-less', 'time-to-m...","[115.4, 7.0, 33.0, 6.0, 8.0, 3.0, 5.0]",12,['place the sundried tomatoes in the 3 / 4 cup...,"this is a lovely mediterranean side dish, and ...","['sun-dried tomatoes', 'warm water', 'olive oi...",8
231566,zucotte,96811,65,65056,2004-08-03,"['time-to-make', 'course', 'main-ingredient', ...","[145.7, 13.0, 10.0, 2.0, 3.0, 27.0, 5.0]",10,"['in a large , heavy saucpan , melt 1 tbs of t...",a garlicky braised winter squash dish that cou...,"['butter', 'white wine', 'garlic', 'butternut ...",6
231572,zulu cabbage,455000,40,1058097,2011-05-04,"['60-minutes-or-less', 'time-to-make', 'course...","[94.0, 7.0, 25.0, 4.0, 4.0, 3.0, 4.0]",5,"['heat the oil in a large skillet', 'add onion...",adapted from the african kitchen.,"['sunflower oil', 'onion', 'green bell pepper'...",9
231610,zurie s overnight no knead bread,196201,70,200862,2006-11-18,"['time-to-make', 'course', 'main-ingredient', ...","[1379.6, 5.0, 12.0, 194.0, 79.0, 3.0, 96.0]",82,"[""since first making this bread according to t...","after posting this recipe, and making the brea...","['white bread flour', 'instant yeast', 'sugar'...",5


In [18]:
len(recipe_data[recipe_data['tags'].str.contains('vegetarian')])

35651

In [19]:
#print(recipe_data[recipe_data['name'].str.contains('gluten free')])

In [20]:
len(recipe_data[recipe_data['description'].str.contains('gluten-free')])


473

In [21]:
meat = ['ham', 'beef', 'meat', 'chicken', 'pork', 'bacon', 'sausage', 'lamb', 'veal', 'turkey', 'steak', 'rib', 'frankfurter', 'duck', 'poultry', 'goat', 'liver', 'hen', 'quail', 'brisket', 'goose']
seafood = ['fish', 'shrimp', 'seafood', 'crab', 'lobster', 'clam', 'oyster', 'scallop', 'mussel', 'cod', 'salmon', 'halibut', 'shellfish', 'roe', 'tuna', 'caviar', 'pollock', 'yellowtail', 'squid', 'calamari', 'octopus', 'crawfish', 'crayfish', 'sardine', 'trout', 'flounder', 'anchovy', 'bass', 'haddock', 'sole' ]
animal_other = ['egg', 'honey']
dairy = ['milk', 'cheese', 'yogurt', 'mayonnaise', 'butter', 'margarine', 'cream']
gluten_free = ['gluten free', 'gluten-free']


In [None]:
vegan = ['ham', 'beef', 'meat', 'chicken', 'pork', 'bacon', 'sausage', 'lamb', 'veal', 'turkey', 'steak', 'rib', 'frankfurter', 'duck', 'poultry', 'goat', 'liver', 'hen', 'quail', 'brisket', 'goose','fish', 'shrimp', 'seafood', 'crab', 'lobster', 'clam', 'oyster', 'scallop', 'mussel', 'cod', 'salmon', 'halibut', 'shellfish', 'roe', 'tuna', 'caviar', 'pollock', 'yellowtail', 'squid', 'calamari', 'octopus', 'crawfish', 'crayfish', 'sardine', 'trout', 'flounder', 'anchovy', 'bass', 'haddock', 'sole','egg', 'honey','milk', 'cheese', 'yogurt', 'mayonnaise', 'butter', 'margarine', 'cream']

vegetarian = ['ham', 'beef', 'meat', 'chicken', 'pork', 'bacon', 'sausage', 'lamb', 'veal', 'turkey', 'steak', 'rib', 'frankfurter', 'duck', 'poultry', 'goat', 'liver', 'hen', 'quail', 'brisket', 'goose','fish', 'shrimp', 'seafood', 'crab', 'lobster', 'clam', 'oyster', 'scallop', 'mussel', 'cod', 'salmon', 'halibut', 'shellfish', 'roe', 'tuna', 'caviar', 'pollock', 'yellowtail', 'squid', 'calamari', 'octopus', 'crawfish', 'crayfish', 'sardine', 'trout', 'flounder', 'anchovy', 'bass', 'haddock', 'sole']


In [22]:
len(recipe_data['description'].isin(gluten_free))

231637

In [32]:
len(recipe_data[recipe_data['description'].str.contains("vegetarian")])

3784

In [33]:
len(recipe_data[recipe_data['description'].str.contains("gluten free")])

571

In [34]:
len(recipe_data[recipe_data['description'].str.contains("gluten-free")])

473

In [36]:
veg_result = []
def vege(tag):
    if ([recipe_data['tags'].str.contains("vegetarian")]):
        veg_result.append("True")
    else:
        veg_result.append("False")
recipe_data["vegetarian"] = recipe_data['tags'].apply(vege)   
print(recipe_data)

KeyboardInterrupt: 

In [41]:
#Create ratings distribution

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

data = user_data['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / user_data.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} recipe ratings'.format(user_data.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
#view raw

In [42]:
# Number of ratings per book
data = user_data.groupby('recipe_id')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Recipe (Clipped at 100)',
                   xaxis = dict(title = 'Number of Ratings Per Recipe'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [44]:
user_data.groupby('recipe_id')['rating'].count().reset_index().sort_values('rating', ascending=False)[:10]

Unnamed: 0,recipe_id,rating
833,2886,1613
14424,27208,1601
51312,89204,1579
21639,39087,1448
38631,67256,1322
30638,54257,1305
11695,22782,1234
17473,32204,1220
39713,69173,997
39588,68955,904


In [45]:
# Number of ratings per user
data = user_data.groupby('user_id')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per User (Clipped at 50)',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [47]:
user_data.groupby('user_id')['rating'].count().reset_index().sort_values('rating', ascending=False)[:10]

Unnamed: 0,user_id,rating
46329,424680,7671
3826,37449,5603
43248,383346,4628
20019,169430,4076
15168,128473,3917
10551,89831,3353
6942,58104,3288
15636,133174,3107
23735,199848,3018
35099,305531,2902


In [None]:
most common tags?
top_tags = tags_tokenized.value_counts()[:10].sort_values(ascending=False)
top_tags

In [None]:
# from medium post
import nltk
vocabulary = nltk.FreqDist()
# This was done once I had already preprocessed the ingredients
for ingredients in recipe_df['ingredients']:
    ingredients = ingredients.split()
    vocabulary.update(ingredients)
for word, frequency in vocabulary.most_common(200):
    print(f'{word};{frequency}')