In [1]:
import pandas as pd
import numpy as np
import re 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [97]:
from sklearn.svm import SVC

## Read in data

In [2]:
df_1 = pd.read_csv('brownie_ingred_reduced.csv')

In [3]:
df_2 = pd.read_csv('recipe_info.csv')

## Clean data

In [4]:
df_1.drop('Unnamed: 0', inplace=True, axis = 1)
df_1.set_index('Unnamed: 0.1', inplace=True)

In [5]:
df_2.drop('Unnamed: 0', inplace=True, axis = 1)
df_2.set_index('Unnamed: 0.1', inplace=True)

# Join and clean joined df

In [6]:
df_3 = df_2.join(df_1)

In [7]:
df_3.drop_duplicates(inplace=True)

In [8]:
df_3.dropna(axis=0, inplace=True)

In [9]:
df_3.reset_index(inplace=True, drop=True)
df_3.head(2)

Unnamed: 0,rec_name,tot_time_seconds,rec_source,rating,1% low-fat milk,2% lowfat greek yogurt,2% reduced-fat milk,active dry yeast,adobo sauce,adzuki beans,...,wilton candy eyeballs,xanthan gum,xylitol sweetener,yams,yellow cake mix,yellow food coloring,yoghurt,yolk,yoplait,zucchini
0,1 Minute Chocolate Frosting,60.0,Spend with Pennies,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1 Minute Microwave Brownie-Microwave Mug Meals,360.0,Gemma's Bigger Bolder Baking,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature engineering - recipe sources that consistently score a certain way

In [10]:
solid_fives = ['My Baking Addiction', 'Recipe Girl', 'La Tartine Gourmande',
              'Closet Cooking', 'Our Best Bites', 'David Lebovitz',
              'I Adore Food!']
solid_threes = ['Tablespoon', 'Tasty Kitchen', 'Southern Food About.com',
               'Cooking Channel', 'Dinner Then Dessert', 'Diabetic Connect']
solid_fours = ['Brown Eyed Baker', 'Recipe for Perfection', 'Something Swanky',
              'Broma Bakery', 'a trEATs affair', 'The Domestic Rebel',
              'Dinners, Dishes and Desserts', 'Love and Olive Oil', 
              'Life Made Simple', "Roxana's Home Baking"]

In [11]:
# Create column of zeros and ones for rec_source membership in solid_fives list
fives_boolean = df_3.rec_source.isin(solid_fives)
fives_dummy = [1 if z==True else 0 for z in fives_boolean]
df_3['solid_fives'] = fives_dummy

# Do same for solid_fours and solid_threes
fours_boolean = df_3.rec_source.isin(solid_fours)
fours_dummy = [1 if z==True else 0 for z in fours_boolean]
df_3['solid_fours'] = fours_dummy

threes_boolean = df_3.rec_source.isin(solid_threes)
threes_dummy = [1 if z==True else 0 for z in threes_boolean]
df_3['solid_threes'] = threes_dummy

## Add number of ingredients column, also remove rare ratings

In [12]:
# Create column for number of ingredients
df_3['num_ingredients'] = df_3.iloc[:, 4:-9].sum(axis=1)

In [13]:
#quickly drop zeros and twos, since there are so few
df_3 = df_3.loc[df_3.rating != 0]
df_3 = df_3.loc[df_3.rating != 2]

## Process text data for bag of words

In [17]:
# strip non alpha characters
name_string = [re.sub('[^A-Za-z]', ' ', z).strip().lower() for z in df_3['rec_name']]

# convert to list so you can iterate through with lemmatization
convert_to_list = [z.split() for z in name_string]

# lemmatize each word in list
lemmatized = []
for _list in convert_to_list:
    sub_list = []
    for word in _list:
        sub_list.append(WordNetLemmatizer().lemmatize(word))
    lemmatized.append(sub_list)
    
df_3['name_str'] = [' '.join(z) for z in lemmatized]

In [85]:
# Instantiate TfidfVectorizer
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=.5)

# create document term matrix
dtm = vect.fit_transform(df_3.name_str).toarray()

# convert to dataframe
clean_df = pd.DataFrame(dtm, columns = vect.get_feature_names())

# add columns to dataframe
clean_df['num_ingred'] = df_3.num_ingredients
clean_df['solid_five'] = df_3.solid_fives
clean_df['solid_four'] = df_3.solid_fours
clean_df['solid_three'] = df_3.solid_threes
clean_df['rating'] = df_3.rating

# drop any NaNs from dataframe
clean_df.dropna(axis = 0, inplace=True)

In [91]:
# create features and target
X, y = clean_df.drop(['rating'], axis=1), clean_df.rating

## Reduce features L1-based feature selection

In [122]:
# Fit LinearSVC
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)

# use SelectFromModel to extract features
model = SelectFromModel(lsvc, prefit=True)

# transform X to new feature space
X_new = model.transform(X)

# check shape
X_new.shape

(4794, 4)

In [127]:
model.get_support(True)

array([5038, 5039, 5040, 5041])

In [133]:
# look, it's only the features I added, none of the text data was worth while
reduced_df = X.iloc[:, (5038, 5039, 5040, 5041)]

## Compare Multinomial Naive Bayes, Bernoulli NB, Logistic Regression, and Random Forests

In [116]:
mnb = MultinomialNB()

In [117]:
cross_val_score(mnb, X_new, y, cv = 5)

array([ 0.77604167,  0.78125   ,  0.77685089,  0.78810021,  0.77638454])

In [118]:
bnb = BernoulliNB()

In [119]:
cross_val_score(bnb, X_new, y, cv = 5)

array([ 0.77604167,  0.78125   ,  0.77685089,  0.78810021,  0.77638454])

In [120]:
logit = LogisticRegression()

In [121]:
cross_val_score(logit, X_new, y, cv = 5)

array([ 0.77604167,  0.78125   ,  0.77685089,  0.78810021,  0.77638454])

In [101]:
# clf = SVC(C=1)
# gamma_range = 10.**np.arange(-3, 2)
# kernel_range = ['rbf', 'linear', 'poly']
# param_grid = dict(gamma=gamma_range)
# grid = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
# grid.fit(X, y)

In [103]:
rf = RandomForestClassifier()

In [104]:
cross_val_score(rf, X, y, cv = 5)

array([ 0.71354167,  0.6875    ,  0.72888425,  0.75156576,  0.70532915])

In [112]:
params = {'n_estimators':[3, 5, 10, 50],
          'criterion': ['gini', 'entropy'],
          'max_depth': [None, 3, 5],
          'min_samples_split': [2,5],
          'class_weight':[None, 'balanced']}


gsrf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                    params, n_jobs=-1)

In [113]:
gsrf.fit(X_new, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [3, 5, 10, 50], 'min_samples_split': [2, 5], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 3, 5], 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [114]:
gsrf.best_score_

0.77972465581977468

## Investigate mistakes the classifier made

In [134]:
# genertate predictions with cross_val_predict
logit = LogisticRegression()
predictions = cross_val_predict(logit, X_new, y, cv = 5)

In [135]:
# add predictions and true ratings to reduced_df
reduced_df['predictions'] = predictions
reduced_df['rating'] = df_3.rating
reduced_df['rec_name'] = df_3.rec_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [142]:
reduced_df[reduced_df.predictions != reduced_df.rating].groupby('rating').mean()

Unnamed: 0_level_0,num_ingred,solid_five,solid_four,solid_three,predictions
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,9.026846,0.0,0.0,0.0,4.0
4,8.375,1.0,0.0,0.0,5.0
5,9.214521,0.0,0.0,0.0,4.0


In [141]:
reduced_df.shape

(4794, 7)