In [1]:
# Import the pandas package, then use its "read_json" function to read
# the labeled training data (the path is relative to the working directory)
import pandas as pd       
train = pd.read_json("cooking/train.json")

In [2]:
# Quick look at the training frame: (rows, columns), the column names,
# and summary statistics for the numeric columns.
print(train.shape)
print(train.columns.values)
train.describe()

(39774, 3)
[u'cuisine' u'id' u'ingredients']


Unnamed: 0,id
count,39774.0
mean,24849.536959
std,14360.035505
min,0.0
25%,12398.25
50%,24887.0
75%,37328.5
max,49717.0


In [4]:
# Count missing values per column.
train.isnull().sum()

cuisine        0
id             0
ingredients    0
dtype: int64

In [5]:
# Show the first row (by position) with all of its columns.
train.iloc[0,:]

cuisine                                                    greek
id                                                         10259
ingredients    [romaine lettuce, black olives, grape tomatoes...
Name: 0, dtype: object

In [6]:
# List the distinct values of the "cuisine" column.
train.cuisine.unique()

array([u'greek', u'southern_us', u'filipino', u'indian', u'jamaican',
       u'spanish', u'italian', u'mexican', u'chinese', u'british', u'thai',
       u'vietnamese', u'cajun_creole', u'brazilian', u'french',
       u'japanese', u'irish', u'korean', u'moroccan', u'russian'], dtype=object)

In [25]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
_ENGLISH_STOP_WORDS = None  # cache for _english_stop_words(); filled on first use


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Membership tests on a set are much faster than on a list.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def ingredients_to_words( ingredients_list ):
    """Turn one recipe's ingredient list into a single cleaned string of words.

    Parameters
    ----------
    ingredients_list : iterable of str
        The ingredient names of one recipe,
        e.g. ["romaine lettuce", "black olives"].

    Returns
    -------
    str
        Lower-cased words separated by single spaces, with markup, every
        character that is not an ASCII letter, and English stop words
        removed. An empty list yields an empty string.
    """
    # Flatten the list into one piece of text; ingredient boundaries are
    # not preserved beyond the separating space.
    ingredients_text = " ".join(ingredients_list)
    #
    # 1. Strip any HTML markup. The parser is named explicitly so the result
    #    does not depend on which optional parsers happen to be installed.
    ingredients_text = BeautifulSoup(ingredients_text, "html.parser").get_text()
    #
    # 2. Replace everything that is not an ASCII letter with a space.
    # NOTE(review): this also blanks out digits and accented letters, which
    # can split a word in two -- confirm that is intended.
    letters_only = re.sub("[^a-zA-Z]", " ", ingredients_text)
    #
    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()
    #
    # 4. Drop stop words. The stop-word set is loaded once and reused across
    #    calls instead of being rebuilt for every recipe.
    stops = _english_stop_words()
    meaningful_words = [w for w in words if w not in stops]
    #
    # 5. Join the words back into one space-separated string.
    return " ".join(meaningful_words)

In [27]:
# Sanity-check the cleaner on the first training recipe.
# .iloc[0] selects by position, so this does not depend on the index labels.
clean_ingredients = ingredients_to_words( train["ingredients"].iloc[0] )
# print(...) with a single argument behaves the same on Python 2 and 3.
print(clean_ingredients)

romaine lettuce black olives grape tomatoes garlic pepper purple onion seasoning garbanzo beans feta cheese crumbles


In [28]:
# Clean every training recipe into one space-separated string of words.
num_ingredients = train["ingredients"].size
print(num_ingredients)
clean_ingredients = []
# Iterate over the column's values directly instead of train["ingredients"][i]:
# that form is a label lookup and only works when the index is exactly 0..n-1.
for position, ingredients_list in enumerate( train["ingredients"], 1 ):
    # Report progress every 1000 recipes (position is 1-based).
    if position % 1000 == 0:
        print("Review %d of %d\n" % ( position, num_ingredients ))
    clean_ingredients.append( ingredients_to_words( ingredients_list ) )

39774
Review 1000 of 39774

Review 2000 of 39774

Review 3000 of 39774

Review 4000 of 39774

Review 5000 of 39774

Review 6000 of 39774

Review 7000 of 39774

Review 8000 of 39774

Review 9000 of 39774

Review 10000 of 39774

Review 11000 of 39774

Review 12000 of 39774

Review 13000 of 39774

Review 14000 of 39774

Review 15000 of 39774

Review 16000 of 39774

Review 17000 of 39774

Review 18000 of 39774

Review 19000 of 39774

Review 20000 of 39774

Review 21000 of 39774

Review 22000 of 39774

Review 23000 of 39774

Review 24000 of 39774

Review 25000 of 39774

Review 26000 of 39774

Review 27000 of 39774

Review 28000 of 39774

Review 29000 of 39774

Review 30000 of 39774

Review 31000 of 39774

Review 32000 of 39774

Review 33000 of 39774

Review 34000 of 39774

Review 35000 of 39774

Review 36000 of 39774

Review 37000 of 39774

Review 38000 of 39774

Review 39000 of 39774



In [29]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model over the cleaned recipe strings, keeping at most the
# 5000 most frequent words. No extra tokenizing, preprocessing or stop-word
# filtering is requested here.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
# Learn the vocabulary and build the document-term counts in one step,
# then convert the result to a dense array.
train_data_features = vectorizer.fit_transform(clean_ingredients).toarray()

In [30]:
# Dimensions of the training feature array produced by the vectorizer.
print(train_data_features.shape)

(39774, 2978)


In [31]:
# Words the vectorizer learned. Newer scikit-learn releases expose them via
# get_feature_names_out() (an array, converted to a list here); older releases
# only have get_feature_names(), so fall back to it when needed.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

[u'abalone', u'abbamele', u'absinthe', u'abura', u'acai', u'accent', u'accompaniment', u'achiote', u'acid', u'acini', u'ackee', u'acorn', u'acting', u'activ', u'active', u'added', u'adobo', u'adzuki', u'agar', u'agave', u'age', u'aged', u'ahi', u'ai', u'aioli', u'ajinomoto', u'ajwain', u'aka', u'alaskan', u'albacore', u'alcohol', u'ale', u'aleppo', u'alexia', u'alfalfa', u'alfredo', u'allspice', u'almond', u'almondmilk', u'almonds', u'aloe', u'alphabet', u'alum', u'amaranth', u'amarena', u'amaretti', u'amaretto', u'amba', u'amber', u'amberjack', u'amchur', u'america', u'american', u'aminos', u'ammonium', u'amontillado', u'ampalaya', u'anaheim', u'anasazi', u'ancho', u'anchovies', u'anchovy', u'andouille', u'anejo', u'angel', u'anglaise', u'angled', u'angostura', u'angus', u'anise', u'anisette', u'anjou', u'annatto', u'ao', u'aonori', u'apple', u'apples', u'applesauce', u'applewood', u'apricot', u'apricots', u'aquavit', u'arak', u'arame', u'arbol', u'arborio', u'arctic', u'arepa', u'arg

In [32]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees. random_state fixes the forest's internal
# randomness so that re-running the notebook reproduces the same model and
# therefore the same submission file.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit on the bag-of-words features, using the cuisine column as the label.
forest = forest.fit( train_data_features, train["cuisine"] )

In [33]:
# Read the test set (path relative to the working directory) and show its
# (rows, columns) shape.
test = pd.read_json("cooking/test.json")
print(test.shape)

(9944, 2)


In [34]:
# Clean every test recipe with the same function used for the training set.
num_ingredients = test["ingredients"].size
clean_test_ingredients = []
# Iterate over the column's values directly instead of test["ingredients"][i]:
# that form is a label lookup and only works when the index is exactly 0..n-1.
for position, ingredients_list in enumerate( test["ingredients"], 1 ):
    # Report progress every 1000 recipes (position is 1-based).
    if position % 1000 == 0:
        print("Review %d of %d\n" % ( position, num_ingredients ))
    clean_test_ingredients.append( ingredients_to_words( ingredients_list ) )

Review 1000 of 9944

Review 2000 of 9944

Review 3000 of 9944

Review 4000 of 9944

Review 5000 of 9944

Review 6000 of 9944

Review 7000 of 9944

Review 8000 of 9944

Review 9000 of 9944



In [35]:
# Encode the test recipes with the vocabulary already learned by the
# vectorizer (transform only, no refitting), then convert to a dense array.
test_data_features = vectorizer.transform(clean_test_ingredients).toarray()

In [36]:
# Predict a cuisine label for every test recipe.
result = forest.predict(test_data_features)
# Name the columns explicitly: when built from a dict alone, the column order
# depends on the Python/pandas version (sorted keys vs. insertion order).
output = pd.DataFrame( data={"id":test["id"], "cuisine":result}, columns=["id", "cuisine"] )
# quoting=3 is csv.QUOTE_NONE: fields are written without quote characters.
output.to_csv( "whats_cooking.csv", index=False, quoting=3 )