# Project 3 CS5293 SP20
## Matthew Huson

#### Note: as this was done for a class, the flow (define function, test function, with actual execution at the end) is to meet the requirements of the prompt

## Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn

ModuleNotFoundError: No module named 'seaborn'

## Functions

### Read In Json

In [3]:
def read_in(filename='yummly.json'):
    '''
    loads a json file of recipes into a pandas dataframe

    filename: path of the json file to load (defaults to 'yummly.json')

    returns the dataframe produced by pd.read_json; for yummly.json this is
    one row per recipe with id, cuisine and ingredients columns
    '''

    return pd.read_json(filename) #pandas parses the json records straight into dataframe rows

#### TEST

In [4]:
test_df = read_in() #get test dataframe

#fail loudly instead of relying on someone reading the printed shape
assert set(test_df.columns) == {'id', 'cuisine', 'ingredients'}, f'unexpected columns {list(test_df.columns)}'
assert test_df.shape == (39774, 3), f'unexpected shape {test_df.shape}'
print(test_df.shape) #dataframe should have 3 columns and 39774 rows

test_df.head() #check contents of dataframe

(39774, 3)


Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


### De-Tokenize Ingredients

In [5]:
def de_tokenize(df):
    '''
    adds an 'ing_join' column holding each recipe's ingredients as one string

    df: dataframe with an 'ingredients' column whose entries are lists of strings

    returns the very same dataframe object (it is modified in place), now
    carrying the extra 'ing_join' column
    '''

    joined = [] #one space-separated string per recipe, in row order
    for ingredient_list in df['ingredients']:
        joined.append(' '.join(ingredient_list))

    df['ing_join'] = joined #written onto the caller's frame, which is also what gets returned

    return df

#### TEST

In [6]:
test_df = de_tokenize(test_df) #get de-tokenized dataframe

#assert what the printed output is meant to show, so a regression stops the run
assert 'ing_join' in test_df.columns, 'de_tokenize did not add the ing_join column'
assert test_df['ing_join'].map(lambda text: isinstance(text, str)).all(), 'ing_join must hold strings, not lists'
print(test_df.shape) #data frame should have 4 columns and 39774 rows

test_df.head() #check that new column is a string, not a list

(39774, 4)


Unnamed: 0,id,cuisine,ingredients,ing_join
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...
3,22213,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...


### Vectorize

In [7]:
def vectorize(df, train=True, min_df=2):

    '''
    turns the joined ingredient strings into tf-idf vectors

    df: dataframe with an 'ing_join' column (and a 'cuisine' column when train is True)
    train: True  -> vectorize every row and also return the cuisine labels
           False -> the last row of df is the user input; return only its vector
    min_df: minimum number of recipes a term must appear in to be kept
            (the default of 2 eliminates typos and one-offs)

    returns (X, y) when train is True: the tf-idf matrix and an array of cuisine labels
    returns X when train is False: a single-row matrix for the last row of df

    raises ValueError when train is False and df has fewer than two rows,
    because there is then nothing to fit the vocabulary on
    '''

    texts = df['ing_join']
    vectorizer = TfidfVectorizer(min_df=min_df)

    if train: #training needs the full matrix of vectors and the cuisine labels
        X = vectorizer.fit_transform(texts)
        y = np.array(df['cuisine']) #get array of cuisine labels

        return X,y #return matrix and labels

    if len(texts) < 2:
        raise ValueError('need at least one recipe row before the user input row')

    #fit on every row except the user's, then transform only the user's row.
    #fitting on the user row as well could change the vocabulary (a term seen once
    #in the recipes would reach min_df) and the idf weights, so the resulting vector
    #would not line up with a matrix built from the recipe rows alone
    vectorizer.fit(texts.iloc[:-1])
    X = vectorizer.transform(texts.iloc[-1:])

    return X #return single row vector
    
    

#### TEST

In [8]:
test_X,test_y = vectorize(test_df) #get matrix and array

#one vector and one label per recipe; asserted so a mismatch stops the run
assert test_X.shape[0] == len(test_df), f'expected {len(test_df)} rows, got {test_X.shape[0]}'
assert test_y.shape == (len(test_df),), f'unexpected label shape {test_y.shape}'
print(test_X.shape) #matrix should have 39774 rows and some number of columns
print(test_y.shape) #array should have 39774 rows and no columns

test_X_vec = vectorize(test_df, False) #get vector

assert test_X_vec.shape[0] == 1, f'expected a single row, got {test_X_vec.shape[0]}'
print(test_X_vec.shape) #vector should have 1 row and the same number of columns as matrix

(39774, 2459)
(39774,)
(1, 2459)


### Train Classifier

In [9]:
def train_classifier(X,y,n_neighbors=11):
    '''
    fits a k-nearest-neighbours classifier to the vectorized recipes

    X: matrix of recipe vectors, one row per recipe
    y: array of cuisine labels, one per row of X
    n_neighbors: how many neighbours vote on a prediction (defaults to 11,
                 the value this notebook was built with)

    returns the fitted KNeighborsClassifier
    '''

    classifier = KNeighborsClassifier(n_neighbors=n_neighbors) #initialize classifier instance
    classifier.fit(X,y) #fit to data

    return classifier #return classifier

#### TEST

In [10]:
test_classifier = train_classifier(test_X,test_y) #get classifier

#assert the two facts the printed repr is meant to confirm
assert isinstance(test_classifier, KNeighborsClassifier), type(test_classifier)
assert test_classifier.n_neighbors == 11, test_classifier.n_neighbors
print(test_classifier) #check that classifier is KNeighborsClassifier with n_neighbors=11

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')


### Parse User Input

In [11]:
def parse_user_input(input_ingredients, df):

    '''
    appends the user's ingredients to a copy of the recipe dataframe

    input_ingredients: list of ingredient strings; a single bare string is
                       treated as a one-ingredient list
    df: de-tokenized recipe dataframe; must have 'ingredients' and 'ing_join' columns

    returns a new dataframe (df itself is left untouched) indexed 0..n whose
    last row holds the user's ingredient list and its joined string; every
    other column of that row is empty

    raises ValueError if df lacks the 'ingredients' or 'ing_join' column
    '''

    if isinstance(input_ingredients, str): #a bare string would otherwise be joined character by character
        input_ingredients = [input_ingredients]
    ingredients = list(input_ingredients) #private copy, so later edits to the caller's list don't reach the frame

    missing = {'ingredients', 'ing_join'} - set(df.columns)
    if missing:
        raise ValueError(f'dataframe is missing required columns: {sorted(missing)}')

    joined_input = ' '.join(ingredients).strip() #join the list of ingredients into a string

    #reset_index gives a fresh frame labelled 0..n-1, so the label n used below
    #can never collide with an existing row and the new row always lands last
    temp_df = df.reset_index(drop=True)

    #build the row by column name rather than by position, so column order
    #and any additional columns don't matter
    values = {'ingredients': ingredients, 'ing_join': joined_input}
    new_row = [values.get(column) for column in temp_df.columns]
    temp_df.loc[len(temp_df)] = new_row #add row to bottom of df

    return temp_df #return new df

#### TEST

In [12]:
test_input = ['tomatoes', 'garlic', 'spaghetti'] #list of user input ingredients
test_t_df = parse_user_input(test_input, test_df) #get new dataframe

#assert that exactly one row was appended and that it carries the user input
assert len(test_t_df) == len(test_df) + 1, 'expected exactly one extra row'
assert test_t_df['ingredients'].iloc[-1] == test_input
assert test_t_df['ing_join'].iloc[-1] == ' '.join(test_input)

test_t_df.tail() #check that ingredients list has been added to the end

Unnamed: 0,id,cuisine,ingredients,ing_join
39770,11462.0,italian,"[KRAFT Zesty Italian Dressing, purple onion, b...",KRAFT Zesty Italian Dressing purple onion broc...
39771,2238.0,irish,"[eggs, citrus fruit, raisins, sourdough starte...",eggs citrus fruit raisins sourdough starter fl...
39772,41882.0,chinese,"[boneless chicken skinless thigh, minced garli...",boneless chicken skinless thigh minced garlic ...
39773,2362.0,mexican,"[green chile, jalapeno chilies, onions, ground...",green chile jalapeno chilies onions ground bla...
39774,,,"[tomatoes, garlic, spaghetti]",tomatoes garlic spaghetti


### Classify Cuisine for Ingredients

In [26]:
def classify_ingredients(classifier,X):

    '''
    predicts the cuisine for one vectorized ingredient list

    classifier: fitted classifier exposing predict and predict_proba
    X: single-row vector of the user's ingredients

    returns a two-line string: the predicted cuisine (capitalized) and the
    largest class probability rounded to 2 places, each followed by a newline
    '''

    predicted_labels = classifier.predict(X) #one label per input row
    class_shares = classifier.predict_proba(X) #one row of per-class probabilities per input row

    cuisine = predicted_labels[0] #only the first (single) row is reported
    cuis_prob = class_shares[0].max() #share held by the most common cuisine

    return f'Cuisine: {cuisine.capitalize()}\nProbability: {round(cuis_prob,2)}\n'

#### TEST

In [28]:
test_user_vec = vectorize(test_t_df, False) #vectorize the user input from before

test_prediction = classify_ingredients(test_classifier, test_user_vec)
#tomatoes + garlic + spaghetti should come out Italian; assert it rather than eyeballing the print
assert test_prediction.startswith('Cuisine: Italian'), test_prediction
print(test_prediction) #check that the predicted cuisine is Italian

Cuisine: Italian
Probability: 0.91



### Get Closest Recipes

In [15]:
def closest_recipes(classifier,X,df,count=7):

    '''
    lists the recipes closest to the user's ingredient vector

    classifier: fitted nearest-neighbours classifier exposing kneighbors
    X: single-row vector of the user's ingredients
    df: recipe dataframe with an 'id' column; row i must correspond to row i
        of the matrix the classifier was fitted on
    count: how many neighbours to report (defaults to 7)

    returns a one-line string naming the count closest recipe ids, each
    followed by its distance rounded to 2 places; every entry, including the
    last, ends with ', '
    '''

    distances, positions = classifier.kneighbors(X, n_neighbors=count) #both are 2-d: one row per query vector
    recipe_ids = df['id']

    out_string = f'Closest {count} recipies (Euclidean dist.): ' #initialize string to return
    for distance, position in zip(distances[0], positions[0]): #pair each neighbour's distance with its row position
        #kneighbors reports row positions within the training matrix, not index
        #labels, so look the id up positionally; this also works when df does
        #not have a default 0..n-1 index
        rid = recipe_ids.iloc[position]
        out_string += f'{rid} ({round(distance,2)}), ' #concatenate string

    return out_string #return string

#### TEST

In [16]:
test_closest = closest_recipes(test_classifier,test_user_vec,test_df)
#each listed recipe contributes one '(' and the header '(Euclidean dist.)' contributes one more
assert test_closest.count('(') - 1 == 7, test_closest
print(test_closest) #check that the correct number of recipes are printed

Closest 7 recipies (Euclidean dist.): 41633 (0.77), 34842 (0.9), 32897 (0.91), 42475 (0.91), 28639 (0.93), 3050 (0.94), 24276 (0.94), 


### Main Function

In [22]:
def main(user_input):

    '''
    runs the whole pipeline for one list of user ingredients

    user_input: list of ingredient strings

    returns a string holding the predicted cuisine with its probability,
    followed by the closest recipes and their distances
    '''

    df=read_in() #get yummly dataframe
    df=de_tokenize(df) #add de-tokenized column to dataframe
    X,y = vectorize(df) #get training matrix and cuisine labels
    classifier = train_classifier(X,y) #train classifier

    #built once, right before it is needed; building it a second time only
    #repeated the copy of the whole recipe frame
    t_df = parse_user_input(user_input,df) #get new dataframe with user data as final row
    uX = vectorize(t_df, False) #get user input vector

    report = classify_ingredients(classifier,uX) #get predicted cuisine
    report += closest_recipes(classifier,uX,df) #concatenate nearest recipes; ids come from the original frame

    return report #return string

## Execution

In [27]:
us_in = ['biryani masala', 'tortilla', 'cheese'] #example ingredient list standing in for user input

#%time is an IPython line magic, so this line only runs inside a notebook/IPython session;
#it executes the statement once and reports how long it took (see the 'Wall time' line in the output)
%time print(main(us_in)) #run the whole pipeline and print the prediction plus the closest recipes

Cuisine: Indian
Probability: 0.91
Closest 7 recipies (Euclidean dist.): 11272 (1.08), 46971 (1.1), 15491 (1.12), 33920 (1.15), 40306 (1.15), 43221 (1.16), 30678 (1.16), 
Wall time: 2.65 s
