In [1]:
import gzip
import math
import numpy as np
import pandas as pd
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from collections import Counter
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you likepip

In [2]:
interactions_train = pd.read_csv("interactions_train.csv")
interactions_test = pd.read_csv("interactions_test.csv")
interactions_validation = pd.read_csv("interactions_validation.csv")
recipes = pd.read_csv("RAW_recipes.csv")
interactions = pd.read_csv("RAW_interactions.csv")

In [3]:
def clean_reviews():

    clean = interactions.merge(recipes, left_on='recipe_id', right_on='id')
    clean = clean.drop(columns=['id'])
    clean['date'] = pd.to_datetime(clean['date'])
    clean['submitted'] = pd.to_datetime(clean['submitted'])
    clean.rename(columns={
        'date': 'review_date',
        'review': 'review_text',
        'name': 'recipe_name',
        'submitted': 'recipe_date',  
    })
    return clean

In [4]:
reviews_clean = clean_reviews()
N = reviews_clean.shape[0]

train = reviews_clean[:int(0.8 * N)]
test = reviews_clean[int(0.8 * N): int(0.9 * N)]
validation = reviews_clean[int(0.9 * N):]

In [5]:
len(train), len(test), len(validation)

(905893, 113237, 113237)

In [6]:
def eda():
    stats = {}

    stats['num_reviews'] = len(reviews_clean)
    stats['num_users'] = reviews_clean.groupby('user_id')['recipe_id'].count().shape[0]
    stats['num_recipes'] = len(recipes)
    stats['num_contributors'] = recipes.groupby('contributor_id')['id'].count().shape[0]
    
    stats['median_recipe_time'] = recipes['minutes'].median()
    stats['avg_num_steps'] = recipes['n_steps'].mean()
    stats['avg_num_ingredients'] = recipes['n_ingredients'].mean()

    stats['avg_rating'] = reviews_clean['rating'].mean()

    return stats
eda()

{'num_reviews': 1132367,
 'num_users': 226570,
 'num_recipes': 231637,
 'num_contributors': 27926,
 'median_recipe_time': 40.0,
 'avg_num_steps': 9.7654994668382,
 'avg_num_ingredients': 9.051153313158087,
 'avg_rating': 4.411016039852804}

In [7]:
train

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,white bean green chile pepper soup,495,1533,2002-09-21,"['weeknight', 'time-to-make', 'course', 'main-...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",4,"['combine beans , onion , chilies , 1 / 2 teas...",easy soup for the crockpot.,"['great northern beans', 'yellow onion', 'dice...",9
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",white bean green chile pepper soup,495,1533,2002-09-21,"['weeknight', 'time-to-make', 'course', 'main-...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",4,"['combine beans , onion , chilies , 1 / 2 teas...",easy soup for the crockpot.,"['great northern beans', 'yellow onion', 'dice...",9
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,devilicious cookie cake delights,20,56824,2002-10-27,"['30-minutes-or-less', 'time-to-make', 'course...","[132.3, 11.0, 39.0, 5.0, 4.0, 11.0, 5.0]",5,"['blend together cake mix , oil and eggs', 'ad...",,"[""devil's food cake mix"", 'vegetable oil', 'eg...",4
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,baked potato toppings,10,64342,2004-02-25,"['15-minutes-or-less', 'time-to-make', 'course...","[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]",3,['pick whichever topping you want to use and c...,these toppings sure makes a nice change from p...,"['mayonnaise', 'salsa', 'cheddar cheese', 'ref...",13
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",baked potato toppings,10,64342,2004-02-25,"['15-minutes-or-less', 'time-to-make', 'course...","[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]",3,['pick whichever topping you want to use and c...,these toppings sure makes a nice change from p...,"['mayonnaise', 'salsa', 'cheddar cheese', 'ref...",13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905888,32946,14325,2002-04-25,5,I made a batch for the ladies at work on Secre...,fresh orange muffins,35,20371,2001-11-15,"['60-minutes-or-less', 'time-to-make', 'course...","[198.5, 12.0, 57.0, 8.0, 5.0, 25.0, 9.0]",10,"['preheat oven to 400f', 'spray a muffin tin w...",i got this recipe from one of my sils after ha...,"['orange', 'orange juice', 'egg', 'butter', 'a...",8
905889,47183,14325,2002-07-07,5,I have never tried making orange muffins befor...,fresh orange muffins,35,20371,2001-11-15,"['60-minutes-or-less', 'time-to-make', 'course...","[198.5, 12.0, 57.0, 8.0, 5.0, 25.0, 9.0]",10,"['preheat oven to 400f', 'spray a muffin tin w...",i got this recipe from one of my sils after ha...,"['orange', 'orange juice', 'egg', 'butter', 'a...",8
905890,52381,14325,2002-11-06,4,Amazing that something smelling and tasting so...,fresh orange muffins,35,20371,2001-11-15,"['60-minutes-or-less', 'time-to-make', 'course...","[198.5, 12.0, 57.0, 8.0, 5.0, 25.0, 9.0]",10,"['preheat oven to 400f', 'spray a muffin tin w...",i got this recipe from one of my sils after ha...,"['orange', 'orange juice', 'egg', 'butter', 'a...",8
905891,47341,14325,2003-03-10,5,these were so easy to make that i doubled it a...,fresh orange muffins,35,20371,2001-11-15,"['60-minutes-or-less', 'time-to-make', 'course...","[198.5, 12.0, 57.0, 8.0, 5.0, 25.0, 9.0]",10,"['preheat oven to 400f', 'spray a muffin tin w...",i got this recipe from one of my sils after ha...,"['orange', 'orange juice', 'egg', 'butter', 'a...",8
