In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC, SVR
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Data

In [20]:
raw_data = pd.read_csv('epi_r.csv')
z = {0.0: 0, 1.25: 1, 1.875: 2, 2.5: 3, 3.125: 4, 3.75: 5, 4.375: 6, 5.0: 7}
z_inv = {v: k for k, v in z.items()}
raw_data["rating_cat"] = raw_data.rating.map(z)

In [4]:
# First lets look at the ratings.
raw_data.rating.value_counts()

4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64

In [5]:
raw_data["rating_cat"].value_counts()

6    8019
5    5169
7    2719
0    1836
4    1489
3     532
1     164
2     124
Name: rating_cat, dtype: int64

In [6]:
np.diff(np.array([0.0, 1.25, 1.875, 2.5, 3.125, 3.75, 4.375, 5.0]))

array([1.25 , 0.625, 0.625, 0.625, 0.625, 0.625, 0.625])

Since the ratings are actually discrete, a classifier makes more sense. Note that the rating levels are not uniformly distributed. 

First lets add a binary feature to indicate if the recipe lists calories that are in the top 75% of all ratings, as well as some group some highly correlated features.

# Split

In [12]:
X = raw_data.dropna().drop(['rating', 'title', 'rating_cat'], axis = 1)
X_test, X_train, y_test, y_train = train_test_split(X, raw_data.dropna().rating, test_size=0.70)

In [13]:
# lets see what features are extrmely correlated
cm = X_train.corr()
s = cm.unstack()
so = s[s!=1.0].sort_values(kind="quicksort", ascending=False)
so[so>=0.7]

calories             fat                    0.996224
fat                  calories               0.996224
sodium               calories               0.996084
calories             sodium                 0.996084
fat                  sodium                 0.985281
sodium               fat                    0.985281
peanut free          soy free               0.941935
soy free             peanut free            0.941935
kosher               pescatarian            0.884051
pescatarian          kosher                 0.884051
portland             oregon                 0.880571
oregon               portland               0.880571
drink                alcoholic              0.858857
alcoholic            drink                  0.858857
tree nut free        peanut free            0.825273
peanut free          tree nut free          0.825273
tree nut free        soy free               0.798740
soy free             tree nut free          0.798740
missouri             st. louis              0.

In [16]:
# combine calories, fat and sodium
X_train["source"] = X_train[['fat', 'calories', 'sodium']].sum(axis=1)
X_test["source"] = X_test[['fat', 'calories', 'sodium']].sum(axis=1)
# features to combine* (binary interaction)
features1 = ['calories', 'calories', 'sodium', 'peanut free', 
             'pescatarian', 'drink', 'peanut free', 'portland', 
             'soy free', 'sodium', 'vegetarian', 'snack week', 
             'pescatarian', 'kosher', 'peanut free', 'peanut free', 
             'soy free', 'calories', 'brunch', 'kentucky', 'denver', 
             'louisiana', 'new orleans', 'lasagna']

features2 = ['sodium', 'fat', 'fat', 'soy free', 'kosher', 'alcoholic', 
             'tree nut free', 'oregon', 'tree nut free', 'protein', 
             'pescatarian', 'snack', 'soy free', 'vegetarian', 
             'pescatarian', 'kosher', 'kosher', 'protein', 'breakfast', 
             'louisville', 'omelet', 'kitchen olympics', 'louisiana', 'epi loves the microwave']

for a,b in zip(features1, features2):
    X_train[a + "_" + b] = X_train[a] * X_train[b]
    X_test[a + "_" + b] = X_test[a] * X_test[b]
    
for b in ['fat', 'calories', 'sodium']:
    X_train['protein_' + b] = X_train['protein'] * X_train[b]
    X_test['protein_' + b] = X_test['protein'] * X_test[b]

# Feature Selection  

We will first predict how likely the recipe is a high (>=2.5 rating) or low (<2.5), and given this predicted probability we can feed the data to a further model (2 layer).

**High or Low rating?** (<= 2.5)

In [50]:
X_train_dummy = pd.get_dummies(X_train)
# convert all columns to int's and make source_pc binary
X_train_dummy = X_train_dummy.astype(int)

# Features & Target
y_low = y_train <= 2.5

#Pipeline Construction.
anova_low = SelectKBest(f_classif, k=30)
svc_low = SVC(kernel='linear')
anova_svc = make_pipeline(anova_low, svc_low)

In [51]:
#Fitting & Cross-Validation.
anova_svc.fit(X_train_dummy,y_low)
scores_low = cross_val_score(anova_svc, X_train_dummy, y_low, cv=5)

#De-Masking Selected Features.
features_low = anova_low.get_support(indices=True)
selected_features_low = list(X_train_dummy.columns[features_low])

#Printing Outcomes.
print('Cross-Validation Scores: {}'.format(scores_low))
print('Cross-Validation Score Averaged Across Folds: {:.2%}.\n'.format(scores_low.mean()))
print('Selected Features: {}\n'.format(selected_features_low))

Cross-Validation Scores: [0.8856371  0.88878883 0.88383611 0.88473661 0.8865376 ]
Cross-Validation Score Averaged Across Folds: 88.59%.

Selected Features: ['alcoholic', 'bake', 'bitters', 'bon appétit', 'brandy', 'chartreuse', 'chile pepper', 'cocktail', 'cocktail party', 'condiment', 'créme de cacao', 'drink', 'fall', 'fortified wine', 'gin', 'harpercollins', 'house & garden', 'liqueur', 'non-alcoholic', 'peanut free', 'rum', 'sauté', 'soy free', 'spirit', 'tree nut free', 'vegan', 'weelicious', 'winter', 'peanut free_soy free', 'drink_alcoholic']



In [52]:
svc_1ow = SVC()
X_low = X_train[selected_features_low]
svc_1ow.fit(X_low,y_low)

scores_low = cross_val_score(svc_1ow, X_train[selected_features_low], y_low, cv=5)
print('Cross-Validation Scores: {}'.format(scores_low))
print('Cross-Validation Score Averaged Across Folds: {:.2%}.\n'.format(scores_low.mean()))
y_pred_train = svc_1ow.predict(X_train[selected_features_low]).astype(int)

Cross-Validation Scores: [0.8865376  0.8865376  0.88518685 0.88743809 0.8865376 ]
Cross-Validation Score Averaged Across Folds: 88.64%.



This model simply predicts if the given recipe will be rated high or low. Let's see how it performs on the test set.  

In [71]:
#Clasification report
results_train = metrics.classification_report(y_true=(y_train<=2.5).astype(int), y_pred=y_pred_train)
print(results_train)
metrics.accuracy_score(y_true=(y_train<=2.5).astype(int), y_pred=y_pred_train)
#metrics.auc() HOW TO USE THIS?

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      9765
           1       0.83      0.08      0.14      1340

   micro avg       0.89      0.89      0.89     11105
   macro avg       0.86      0.54      0.54     11105
weighted avg       0.88      0.89      0.84     11105



0.8868977937865826

### Evaluation

In [72]:
y_pred_test = svc_1ow.predict(X_test[selected_features_low])
#Clasification report
results_test = metrics.classification_report(y_true=(y_test<=2.5).astype(int), y_pred=y_pred_test)
print(results_test)
metrics.accuracy_score(y_true=(y_test<=2.5).astype(int), y_pred=y_pred_test)


              precision    recall  f1-score   support

           0       0.89      1.00      0.94      4194
           1       0.73      0.06      0.12       565

   micro avg       0.89      0.89      0.89      4759
   macro avg       0.81      0.53      0.53      4759
weighted avg       0.87      0.89      0.84      4759



0.8861105274217272

This classification task seems quite limited (not practical), so let's open it up to predict each rating level.

Let's use LASSO regression to identify features.  

## LASSO Regression

In [67]:
from sklearn.linear_model import LogisticRegression
#solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
lassoregr = LogisticRegression(penalty ='l1', solver='saga', multi_class='multinomial')
lassoregr.fit(X_train, y_train.map(z))

coeffs = pd.DataFrame(lassoregr.coef_.transpose())
coeffs.index = X_train.columns
coeffs.head()

Unnamed: 0,0,1,2,3,4,5,6,7
calories,-1.532908e-21,-1.709296e-21,-1.716656e-21,-1.666317e-21,2.226187e-21,-8.858959e-22,2.4328120000000003e-22,5.041605e-21
protein,-6.070384000000001e-23,-6.625873000000001e-23,-6.644454000000001e-23,-6.440107e-23,4.354286e-24,-2.976108e-23,-9.415707e-24,2.926326e-22
fat,-7.394482000000001e-23,-8.210371000000001e-23,-8.248664e-23,-7.972559e-23,1.401701e-22,-3.525707e-23,3.9964770000000006e-23,1.7338420000000002e-22
sodium,-1.620845e-21,-1.92899e-21,-1.935301e-21,-1.86024e-21,2.109248e-21,-7.060088e-22,3.022891e-22,5.639849e-21
#cakeweek,0.0,0.0,0.0,0.0,0.0,7.08633e-28,0.0,0.0


It would be easier to visualize if we first converted the above matrix into rankings (along each column). Say we wanted to optimize the model to perform on those recipes with a 3.125 rating, then we take the top k features for the 5th column.

In [73]:
new_features = coeffs.iloc[:,4].sort_values().index[:29]
print(new_features)

svc_new = SVC(probability=True)
svc_new.fit(X_train[new_features], y_train.map(z))

scores_new = cross_val_score(svc_new, X_train[new_features], y_train.map(z), cv=5)
print('Cross-Validation Scores: {}'.format(scores_new))
print('Cross-Validation Score Averaged Across Folds: {:.2%}.\n'.format(scores_new.mean()))

p_pred_new = svc_new.predict_proba(X_test[new_features])

y_pred_new = p_pred_new.argmax(axis=1)
#Clasification report
results_new = metrics.classification_report(y_true=y_test.map(z), y_pred=y_pred_new)
print(results_new)

Index(['calories_protein', 'protein_calories', 'protein_sodium',
       'sodium_protein', 'bon appétit', 'peanut free', 'soy free',
       'peanut free_soy free', 'tree nut free', 'peanut free_tree nut free',
       'soy free_tree nut free', 'gourmet', 'vegetarian', 'pescatarian',
       'bake', 'kosher', 'peanut free_pescatarian', 'peanut free_kosher',
       'summer', 'wheat/gluten-free', 'pescatarian_soy free',
       'soy free_kosher', 'quick & easy', 'pescatarian_kosher',
       'kosher_vegetarian', 'vegetarian_pescatarian', 'dessert', 'fall',
       'winter'],
      dtype='object')
Cross-Validation Scores: [0.47616906 0.46558704 0.46375507 0.47408743 0.46077547]
Cross-Validation Score Averaged Across Folds: 46.81%.

              precision    recall  f1-score   support

           0       0.47      0.29      0.36       361
           1       1.00      0.15      0.26        40
           2       0.00      0.00      0.00        31
           3       0.80      0.12      0.21       1

Whoa this set of features appears to improve on all classes! Let's see if we can get better model performance from using a random forest to select the same number of features.

## Random Forest  

In [74]:
rfc = RandomForestClassifier()

X_train2 = pd.get_dummies(X_train.dropna())
rfc.fit(X_train2, y_train.map(z))
feats = pd.DataFrame(X_train2.columns, rfc.feature_importances_)
feats.reset_index(inplace=True)
features = list(feats.sort_values(by='index', ascending=False).iloc[:30, 1])
print(features)

['calories', 'source', 'calories_sodium', 'sodium_protein', 'calories_protein', 'sodium', 'calories_fat', 'protein_sodium', 'protein_calories', 'sodium_fat', 'fat', 'protein_fat', 'protein', 'bon appétit', 'summer', 'gourmet', 'quick & easy', 'winter', 'bake', 'onion', 'fall', 'vegetarian', 'spring', 'tomato', 'vegetable', 'wheat/gluten-free', 'herb', 'milk/cream', 'egg', 'soy free_tree nut free']


In [77]:
print(results_new)

              precision    recall  f1-score   support

           0       0.47      0.29      0.36       361
           1       1.00      0.15      0.26        40
           2       0.00      0.00      0.00        31
           3       0.80      0.12      0.21       133
           4       0.83      0.12      0.21       328
           5       0.60      0.12      0.20      1243
           6       0.46      0.96      0.62      1976
           7       0.69      0.14      0.24       647

   micro avg       0.48      0.48      0.48      4759
   macro avg       0.61      0.24      0.26      4759
weighted avg       0.57      0.48      0.39      4759



In [75]:
# Now use an SVC for classifying recipes, using only the above features
svc = SVC(probability=True)
svc.fit(X_train[features], y_train.map(z))
p_pred = svc.predict_proba(X_test[features])
y_pred_test = p_pred.argmax(axis=1)
#Clasification report
results = metrics.classification_report(y_true=y_test.map(z), y_pred=y_pred_test)
print(results)

              precision    recall  f1-score   support

           0       0.77      0.18      0.29       361
           1       1.00      0.17      0.30        40
           2       0.80      0.13      0.22        31
           3       0.86      0.14      0.23       133
           4       0.98      0.13      0.23       328
           5       0.91      0.11      0.20      1243
           6       0.45      0.99      0.62      1976
           7       0.80      0.16      0.27       647

   micro avg       0.49      0.49      0.49      4759
   macro avg       0.82      0.25      0.29      4759
weighted avg       0.70      0.49      0.39      4759



In [76]:
# Accuracy tables.
table_test = pd.crosstab(y_test.map(z), y_pred_test, margins=True)
table_test

col_0,0,1,2,3,4,5,6,7,All
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,64,0,0,1,1,3,278,14,361
1,1,7,0,0,0,0,31,1,40
2,2,0,4,0,0,0,25,0,31
3,2,0,0,18,0,1,111,1,133
4,0,0,0,0,42,1,283,2,328
5,7,0,0,1,0,137,1094,4,1243
6,3,0,1,1,0,8,1959,4,1976
7,4,0,0,0,0,1,536,106,647
All,83,7,5,21,43,151,4317,132,4759


# Conclusion  

The model built from features selected by LASSO regression made drastic improvements over the naive SVR model (using all features), and using a Random Forest to select the features resulted in the best performing model.