In [1]:
import pandas as pd
import numpy as np

In [27]:
from matplotlib import pyplot as plt
% matplotlib inline

### Read in data, merge two dfs

In [2]:
df_1 = pd.read_csv('brownie_ingred_reduced.csv')

In [3]:
df_2 = pd.read_csv('recipe_info.csv')

In [4]:
df_3 = pd.merge(df_1, df_2, on='Unnamed: 0.1')

### Create new features based on sources that consistently rank a certain way

In [5]:
df_3.rec_source.value_counts()[:15]

AllRecipes                  180
MyRecipes                    92
Betty Crocker                57
Taste of Home                53
Bake or Break                52
Martha Stewart               50
Brown Eyed Baker             42
Recipe Girl                  36
Inside BruCrew Life          32
Better Homes and Gardens     29
Epicurious                   29
My Baking Addiction          29
Recipe for Perfection        29
Crazy For Crust              28
How Sweet It Is              26
Name: rec_source, dtype: int64

In [6]:
# performed several groupbys to identify recipe sources that had brownie recipes all of one ranking
# standard deviation would equal zero in these cases - chose the ones with the highest value counts
grouped = df_3.groupby('rec_source')
grouped['rating'].agg([np.mean, np.count_nonzero, np.std]).sort_values(by=['mean', 'count_nonzero'], 
                                                                       ascending=False)[:5]

Unnamed: 0_level_0,mean,count_nonzero,std
rec_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I Adore Food!,5.0,5,0.0
David Lebovitz,5.0,4,0.0
Our Best Bites,5.0,4,0.0
Closet Cooking,5.0,2,0.0
La Tartine Gourmande,5.0,2,0.0


In [7]:
solid_fives = ['My Baking Addiction', 'Recipe Girl', 'La Tartine Gourmande',
              'Closet Cooking', 'Our Best Bites', 'David Lebovitz',
              'I Adore Food!']
solid_threes = ['Tablespoon', 'Tasty Kitchen', 'Southern Food About.com',
               'Cooking Channel', 'Dinner Then Dessert', 'Diabetic Connect']
solid_fours = ['Brown Eyed Baker', 'Recipe for Perfection', 'Something Swanky',
              'Broma Bakery', 'a trEATs affair', 'The Domestic Rebel',
              'Dinners, Dishes and Desserts', 'Love and Olive Oil', 
              'Life Made Simple', "Roxana's Home Baking"]

In [8]:
# Create column of zeros and ones for rec_source membership in solid_fives list
fives_boolean = df_3.rec_source.isin(solid_fives)
fives_dummy = [1 if z==True else 0 for z in fives_boolean]
df_3['solid_fives'] = fives_dummy

# Do same for solid_fours and solid_threes
fours_boolean = df_3.rec_source.isin(solid_fours)
fours_dummy = [1 if z==True else 0 for z in fours_boolean]
df_3['solid_fours'] = fours_dummy

threes_boolean = df_3.rec_source.isin(solid_threes)
threes_dummy = [1 if z==True else 0 for z in threes_boolean]
df_3['solid_threes'] = threes_dummy

## Create new features based on whether "healthy" is in the name of the brownie

### First check if "best" is exclusive to a certain rank, not so much

In [9]:
df_2.set_index("Unnamed: 0.1", inplace=True)

In [10]:
contains_best = []
for i in df_2.index:
    if 'est' in i:
        contains_best.append(i)

In [11]:
contains_best_df = df_2.ix[contains_best, :]

In [12]:
contains_best_df.groupby('rating').count()

Unnamed: 0_level_0,Unnamed: 0,rec_name,tot_time_seconds,rec_source
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,11,11,11,11
4,97,97,97,97
5,10,10,10,10


In [13]:
contains_healthy = []
for i in df_2.index:
    if 'ealth' in i:
        contains_healthy.append(i)
contains_healthy_df = df_2.ix[contains_healthy, :]

In [14]:
contains_healthy_df.groupby('rating').count()

Unnamed: 0_level_0,Unnamed: 0,rec_name,tot_time_seconds,rec_source
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,6,6,6,6
4,49,49,49,49


In [15]:
# add to dataframe
healthy_boolean = df_3["Unnamed: 0.1"].isin(contains_healthy)
healthy_dummy = [1 if z==True else 0 for z in healthy_boolean]
df_3['healthy_in_name'] = healthy_dummy

In [16]:
healthy_boolean.value_counts()

False    4952
True       55
Name: Unnamed: 0.1, dtype: int64

In [22]:
df_3.head(1)

Unnamed: 0,Unnamed: 0_x,Unnamed: 0.1,1% low-fat milk,2% lowfat greek yogurt,2% reduced-fat milk,active dry yeast,adobo sauce,adzuki beans,agave nectar,all-purpose flour,...,zucchini,Unnamed: 0_y,rec_name,tot_time_seconds,rec_source,rating,solid_fives,solid_fours,solid_threes,healthy_in_name
0,0,2-Ingredient-Nutella-Brownies-1471916,0,0,0,0,0,0,0,0,...,0,416,2 Ingredient Nutella Brownies,2700.0,Baking Beauty,4,0,0,0,0


In [26]:
# Create column for number of ingredients
df_3['num_ingredients'] = df_3.iloc[:, 2:-9].sum(axis=1)

## Prelim Modelling

In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import BaggingClassifier

In [30]:
#quickly drop a few NaNs
df_3.dropna(axis=0, inplace=True)

In [31]:
#quickly drop zeros and twos, since there are so few
df_3 = df_3.loc[df_3.rating != 0]
df_3 = df_3.loc[df_3.rating != 2]

In [32]:
# check and see if they dropped properly
df_3.rating.value_counts()

4    3753
3     823
5     373
Name: rating, dtype: int64

### Simple kNN to get baseline

In [35]:
X = df_3.drop(['Unnamed: 0_x', 'Unnamed: 0.1', 'Unnamed: 0_y', 'rec_name', 
           'rating', 'rec_source', 'tot_time_seconds'], axis=1)
y = df_3.rating

In [84]:
cross_val_score(knn, X, y, cv=5)

array([ 0.75479314,  0.75782038,  0.75580222,  0.76315789,  0.76012146])

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [41]:
import re

In [46]:
from nltk import WordNetLemmatizer

In [53]:
convert_to_list = [z.split() for z in name_string]

lemmatized = []
for _list in convert_to_list:
    sub_list = []
    for word in _list:
        sub_list.append(WordNetLemmatizer().lemmatize(word))
    lemmatized.append(sub_list)
    
name_str = [' '.join(z) for z in lemmatized]

NameError: name 'name_str' is not defined

In [None]:
name

In [None]:
X = df_3['solid_fives', 'solid_fours', 'solid_threes', 'num_ingredients', 'tot_time_seconds',
        ]

### Simple logistic regression,

In [48]:
logit = LogisticRegression()

In [49]:
cross_val_score(logit, X, y, cv = 5)

array([ 0.77598385,  0.79515641,  0.7679112 ,  0.77834008,  0.76518219])

### Gridsearch logistic regression

In [109]:
params = {'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'penalty': ['l1', 'l2']}
gslr = GridSearchCV(logit, params, n_jobs=-1, cv=KFold(len(y), n_folds=3, shuffle=True))

In [110]:
gslr.fit(X, y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=4949, n_folds=3, shuffle=True, random_state=None),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [120]:
gslr.best_score_

0.78015760759749442

### Wrap logistic regression in a bagging classifier

In [116]:
bagging_params = {'n_estimators': [10, 20], 'max_samples': [0.7, 1.0], 'max_features': [0.7, 1.0],
                  'bootstrap_features': [True, False]}

In [117]:
gsbagginglr = GridSearchCV(BaggingClassifier(gslr.best_estimator_),
                           bagging_params, n_jobs=-1, cv=KFold(len(y), n_folds=3, shuffle=True))

In [118]:
gsbagginglr.fit(X, y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=4949, n_folds=3, shuffle=True, random_state=None),
       error_score='raise',
       estimator=BaggingClassifier(base_estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start...n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 20], 'max_samples': [0.7, 1.0], 'bootstrap_features': [True, False], 'max_features': [0.7, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [119]:
print gsbagginglr.best_score_

0.779955546575


## takeaways from prelim modelling:

very little improvement over guessing most common class every time (rating of four) --  i.e. 75%

### Prelim modelling no better than baseline - try feature selection (only marginal improvement)

In [91]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [121]:
X_new = SelectKBest(chi2, k=100).fit_transform(X, y).get

In [122]:
cross_val_score(logit, X_new, y, cv = 5)

array([ 0.78203835,  0.80221998,  0.77800202,  0.78947368,  0.77935223])

## Try random forest, SVC

In [124]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [100]:
rfclass = RandomForestClassifier()

In [102]:
cross_val_score(rfclass, X_new, y, cv=5)

array([ 0.76690212,  0.78708375,  0.76185671,  0.77226721,  0.75809717])

In [125]:
svm = SVC(kernel='linear')

In [131]:
cvscores = cross_val_score(svm, X_new, y, cv = 5, n_jobs=-1)
print "CV score: {:.3} +/- {:.3}".format(cvscores.mean(), cvscores.std())

descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)


CV score: 0.784 +/- 0.00857
