In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold

# Import classifier models 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# Evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

In [2]:
# Synthetic training dataset
synthetic_train_df = pd.read_csv('train-test datasets/synthetic_train_v2.csv')

# Yelp sample training dataset
yelp_train_df = pd.read_csv('train-test datasets/yelp_sample_train.csv')

In [3]:
yelp_train_df.isna().sum()

dish_name        0
description    243
cuisine          0
diet             0
dtype: int64

In [4]:
synthetic_train_df.isna().sum()

dish_name      0
description    7
cuisine        0
diet           0
dtype: int64

# Standard Baseline Models

We first test some standard machine learning classification models and evaluate their accuracy and F1-scores to see how good they are at labeling dishes as vegetarian or non-vegatarian.

We will perform a 5-fold cross validation on the datasets `synthetic_train_df` and `yelp_train_df` separately, train the models on the combined training sets of the synthetic and yelp data in each fold, then have each model predict on the test sets of the synthetic data and yelp data separately (in each fold). This will tell us if the models are preforming differently on the synthetic vs the yelp sample data.

In [32]:
models_dict_1 = {'Naive Bayes': MultinomialNB() ,
                'Logistic Regression': LogisticRegression(solver='liblinear', class_weight='balanced'),
                'XGBoost Classifier': XGBClassifier(n_estimators=100),
                'SVM': SVC(),
                'Decision Tree': DecisionTreeClassifier(class_weight='balanced'),
                'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, class_weight='balanced'),
                'KNN': KNeighborsClassifier()}

In [33]:
# Combine the text data
X_synthetic_text = synthetic_train_df['dish_name'] + ': ' + synthetic_train_df['cuisine']  + ': ' + synthetic_train_df['description'].fillna('')
X_synthetic_text = X_synthetic_text.apply(lambda x: re.sub(r'[^A-Za-z\s]', '', x.lower())) # Clean text

X_yelp_text = yelp_train_df['dish_name'] + ': ' + yelp_train_df['cuisine']  + ': ' + yelp_train_df['description'].fillna('') 
X_yelp_text = X_yelp_text.apply(lambda x: re.sub(r'[^A-Za-z\s]', '', x.lower())) # Clean text

# Target feature is 'diet' column 
y_synthetic_labels = synthetic_train_df['diet']
y_yelp_labels = yelp_train_df['diet']

## Using TF-IDF Vectorizer

TF-IDF gives weight importance to words across documents. Frequent words are given lower scores, while unique words are given higher ones. Is appropiate for text classifications that depend on keywords.

In [None]:
def mean_calculator(mylist):
    return 100*sum(mylist)/len(mylist)

# Initiate the text encoders 
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df = .57, stop_words='english')
le = LabelEncoder()  

Evaluate accuracy and F1-score of each baseline classifier on the cross validation folds.

In [34]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

score_dict = {}

for model_name, model in models_dict_1.items():

    synthetic_acc_scores = []
    synthetic_f1_scores = []
    yelp_acc_scores = []
    yelp_f1_scores = []

    for fold, ((train_synth_idx, test_synth_idx), (train_yelp_idx, test_yelp_idx)) in enumerate(zip(kf.split(X_synthetic_text), kf.split(X_yelp_text)), 1):

        # Create train-test splits for synthetic data
        X_synthetic_train, X_synthetic_test = X_synthetic_text[train_synth_idx], X_synthetic_text[test_synth_idx]
        y_synthetic_train, y_synthetic_test = y_synthetic_labels[train_synth_idx], y_synthetic_labels[test_synth_idx]

        # Create train-test splits for yelp data
        X_yelp_train, X_yelp_test = X_yelp_text[train_yelp_idx], X_yelp_text[test_yelp_idx]
        y_yelp_train, y_yelp_test = y_yelp_labels[train_yelp_idx], y_yelp_labels[test_yelp_idx]

        # Combine synthetic + yelp training data
        X_combined_train_split = pd.concat([X_synthetic_train, X_yelp_train], axis=0)
        y_combined_train_split = pd.concat([y_synthetic_train, y_yelp_train], axis=0)

        # Encode the text data with TF-IDF 
        X_train_tfidf_split = tfidf_vectorizer.fit_transform(X_combined_train_split) # For fitting

        X_test_tfidf_split_synthetic = tfidf_vectorizer.transform(X_synthetic_test)
        X_test_tfidf_split_yelp = tfidf_vectorizer.transform(X_yelp_test)
        
        # Encode the target labels (Non-Veg --> 0, Veg --> 1)
        y_train_encoded_split = le.fit_transform(y_combined_train_split) # For fitting

        y_test_encoded_synthetic = le.transform(y_synthetic_test)
        y_test_encoded_yelp = le.transform(y_yelp_test)

        # Train the model
        model.fit(X_train_tfidf_split, y_train_encoded_split)
        
        # Make predictions on synthetic data
        y_pred_split_synthetic = model.predict(X_test_tfidf_split_synthetic)
        synthetic_acc_scores.append(accuracy_score(y_pred_split_synthetic, y_test_encoded_synthetic))
        synthetic_f1_scores.append(f1_score(y_pred_split_synthetic, y_test_encoded_synthetic))

        # Make predictions on yelp data
        y_pred_split_yelp = model.predict(X_test_tfidf_split_yelp)
        yelp_acc_scores.append(accuracy_score(y_pred_split_yelp, y_test_encoded_yelp))
        yelp_f1_scores.append(f1_score(y_pred_split_yelp, y_test_encoded_yelp))

    score_dict[model_name] = {"Avg. Accuracy Synthetic": mean_calculator(synthetic_acc_scores),
                              "Avg. F1-Score Synthetic": mean_calculator(synthetic_f1_scores),
                              "Avg. Accuracy Yelp": mean_calculator(yelp_acc_scores),
                              "Avg. F1-Score Yelp": mean_calculator(yelp_f1_scores)}

In [35]:
pd.DataFrame(score_dict).T.sort_values(by=['Avg. F1-Score Synthetic','Avg. F1-Score Yelp'],ascending=False)

Unnamed: 0,Avg. Accuracy Synthetic,Avg. F1-Score Synthetic,Avg. Accuracy Yelp,Avg. F1-Score Yelp
XGBoost Classifier,94.882128,95.355453,89.446362,90.673702
Decision Tree,93.815887,94.272617,90.568703,91.452454
SVM,93.442553,94.002839,86.863348,88.520668
Logistic Regression,92.802553,93.283871,87.761597,89.117498
Naive Bayes,89.923262,90.76111,84.733538,86.256471
Random Forest,83.901135,84.933436,78.441404,82.553053
KNN,82.035035,83.063341,76.879041,78.720813


## Save Cross Validation Fold Predictions

In [None]:
# Change predictions back to string labels
int_to_label = {1:'Vegetarian', 0:'Non-Vegetarian'}

# Add fold dataframes to list
combined_fold_prediction_dfs = []

for model_name, model in models_dict_1.items():

        for fold, ((train_synth_idx, test_synth_idx), (train_yelp_idx, test_yelp_idx)) in enumerate(zip(kf.split(X_synthetic_text), kf.split(X_yelp_text)), 1):

                # print(f'==== Fold {fold} ====')

                # Create train-test splits for synthetic data
                X_synthetic_train, X_synthetic_test = X_synthetic_text[train_synth_idx], X_synthetic_text[test_synth_idx]
                y_synthetic_train, y_synthetic_test = y_synthetic_labels[train_synth_idx], y_synthetic_labels[test_synth_idx]

                # Create train-test splits for yelp data
                X_yelp_train, X_yelp_test = X_yelp_text[train_yelp_idx], X_yelp_text[test_yelp_idx]
                y_yelp_train, y_yelp_test = y_yelp_labels[train_yelp_idx], y_yelp_labels[test_yelp_idx]

                # Combine synthetic + yelp training data
                X_combined_train_split = pd.concat([X_synthetic_train, X_yelp_train], axis=0)
                y_combined_train_split = pd.concat([y_synthetic_train, y_yelp_train], axis=0)

                # Encode the text data with TF-IDF 
                X_train_tfidf_split = tfidf_vectorizer.fit_transform(X_combined_train_split) # For fitting
                X_test_tfidf_split_synthetic = tfidf_vectorizer.transform(X_synthetic_test)
                X_test_tfidf_split_yelp = tfidf_vectorizer.transform(X_yelp_test)
                
                # Encode the target labels (Non-Veg --> 0, Veg --> 1)
                y_train_encoded_split = le.fit_transform(y_combined_train_split) # For fitting
                y_test_encoded_synthetic = le.transform(y_synthetic_test)
                y_test_encoded_yelp = le.transform(y_yelp_test)

                # Train model
                model.fit(X_train_tfidf_split, y_train_encoded_split)

                # Predict (Synthetic)
                y_pred_split_synthetic = model.predict(X_test_tfidf_split_synthetic)

                synth_preds_df = pd.DataFrame({'original_index': test_synth_idx,
                                        'predicted_diet': y_pred_split_synthetic}).set_index('original_index') # Ensure matching index columns
                # Combine predictions 
                synth_fold_df = synthetic_train_df.loc[test_synth_idx].join(synth_preds_df['predicted_diet'])
                # Convert predicted values to labels
                synth_fold_df['predicted_diet'] = synth_fold_df['predicted_diet'].map(int_to_label)
                # Add source column
                synth_fold_df['source'] = 'Synthetic'

                # Predict (Yelp)
                y_pred_split_yelp = model.predict(X_test_tfidf_split_yelp)

                yelp_preds_df = pd.DataFrame({'original_index': test_yelp_idx,
                                        'predicted_diet': y_pred_split_yelp}).set_index('original_index')

                yelp_fold_df = yelp_train_df.loc[test_yelp_idx].join(yelp_preds_df['predicted_diet'])
                
                yelp_fold_df['predicted_diet'] = yelp_fold_df['predicted_diet'].map(int_to_label)

                yelp_fold_df['source'] = 'Yelp'

                # Combine Synthetic & Yelp fold dfs
                combined_fold_df = pd.concat([synth_fold_df, yelp_fold_df], axis=0).reset_index(drop=True)
                combined_fold_prediction_dfs.append(combined_fold_df)

                # Save each dataframe fold of model to a csv file
                combined_fold_df.to_csv(f"predictions_{model_name}_fold_{fold}.csv", index=False)

Save each model's mismatch labels across all folds into a single csv file.

In [None]:
for model_name in models_dict_1.keys():
    fold_dfs = []
    for fold in range(5):
        fold_df = pd.read_csv(f'{model_name} Folds/predictions_{model_name}_fold_{fold+1}.csv')
        fold_dfs.append(fold_df)

    combined_folds_df = pd.concat(fold_dfs, axis=0)
    combined_folds_df.drop('Unnamed: 0', axis=1, inplace=True)
    unequal_rows_df = combined_folds_df.loc[combined_folds_df['diet'] != combined_folds_df['predicted_diet']].reset_index(drop=True)
    unequal_rows_df.to_csv(f"mislabeled_{model_name}.csv") 

## Word Embedding

A **word embedding** is a NLP technique that represents a word as a numerical vector in a real vector space, which captures its sematic relationships where similar words have similar vectors.

In [41]:
models_dict_2 = {'Naive Bayes': Pipeline([('scaler', MinMaxScaler()), ('clf', MultinomialNB())]),
                'Logistic Regression': LogisticRegression(solver='liblinear', class_weight='balanced'),
                'XGBoost Classifier': XGBClassifier(n_estimators=80),
                'SVM': SVC(),
                'Decision Tree': DecisionTreeClassifier(class_weight='balanced'),
                'Random Forest': RandomForestClassifier(n_estimators=80, max_depth=5, class_weight='balanced'),
                'KNN': KNeighborsClassifier()}

In [42]:
# For word embedding
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab') # For NLTK version 3.8.2 or later, download 'punkt_tab' instead

# Create document embeddings by averaging word vectors
def get_document_embedding(sentence, word_vectors, vector_size):
    embeddings = [word_vectors[word] for word in sentence if word in word_vectors]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(vector_size)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/justinfong/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Evaluate accuracy and F1-score of each baseline classifier on the cross validation folds.

In [43]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

score_dict = {}

for model_name, model in models_dict_2.items():

    synthetic_acc_scores = []
    synthetic_f1_scores = []
    yelp_acc_scores = []
    yelp_f1_scores = []

    for fold, ((train_synth_idx, test_synth_idx), (train_yelp_idx, test_yelp_idx)) in enumerate(zip(kf.split(X_synthetic_text), kf.split(X_yelp_text)), 1):

        # Create train-test splits for synthetic data
        X_synthetic_train, X_synthetic_test = X_synthetic_text[train_synth_idx], X_synthetic_text[test_synth_idx]
        y_synthetic_train, y_synthetic_test = y_synthetic_labels[train_synth_idx], y_synthetic_labels[test_synth_idx]

        # Create train-test splits for yelp data
        X_yelp_train, X_yelp_test = X_yelp_text[train_yelp_idx], X_yelp_text[test_yelp_idx]
        y_yelp_train, y_yelp_test = y_yelp_labels[train_yelp_idx], y_yelp_labels[test_yelp_idx]

        # Combine synthetic + yelp training data
        X_combined_train_split = pd.concat([X_synthetic_train, X_yelp_train], axis=0)
        y_combined_train_split = pd.concat([y_synthetic_train, y_yelp_train], axis=0)

        # Encode the text data with average word embeddings
        token_combined_train = [word_tokenize(sentence.lower()) for sentence in X_combined_train_split] 
        word2vec_model = Word2Vec(sentences=token_combined_train, vector_size=100, window=5, min_count=1, workers=4)  
        X_train_embeddings = np.array([get_document_embedding(s, word2vec_model.wv, word2vec_model.wv.vector_size) for s in token_combined_train])
        
        # Word embedd synthetic test data
        token_synthetic = [word_tokenize(sentence.lower()) for sentence in X_synthetic_test]
        word2vec_synthetic_model = Word2Vec(sentences=token_synthetic, vector_size=100, window=5, min_count=1, workers=4) 
        X_test_synthetic = np.array([get_document_embedding(s, word2vec_synthetic_model.wv, word2vec_synthetic_model.wv.vector_size) for s in token_synthetic])

        # Word embedd yelp test data
        token_yelp = [word_tokenize(sentence.lower()) for sentence in X_yelp_test]
        word2vec_yelp_model = Word2Vec(sentences=token_yelp, vector_size=100, window=5, min_count=1, workers=4) 
        X_test_yelp = np.array([get_document_embedding(s, word2vec_yelp_model.wv, word2vec_yelp_model.wv.vector_size) for s in token_yelp])

        
        # Encode the target labels (Non-Veg --> 0, Veg --> 1)
        y_train_encoded_split = le.fit_transform(y_combined_train_split) # For fitting

        y_test_encoded_synthetic = le.transform(y_synthetic_test)
        y_test_encoded_yelp = le.transform(y_yelp_test)

        # Train the model
        model.fit(X_train_embeddings, y_train_encoded_split)
        
        # Make predictions on synthetic data
        y_pred_split_synthetic = model.predict(X_test_synthetic)
        synthetic_acc_scores.append(accuracy_score(y_pred_split_synthetic, y_test_encoded_synthetic))
        synthetic_f1_scores.append(f1_score(y_pred_split_synthetic, y_test_encoded_synthetic))

        # Make predictions on yelp data
        y_pred_split_yelp = model.predict(X_test_yelp)
        yelp_acc_scores.append(accuracy_score(y_pred_split_yelp, y_test_encoded_yelp))
        yelp_f1_scores.append(f1_score(y_pred_split_yelp, y_test_encoded_yelp))

    score_dict[model_name] = {"Avg. Accuracy Synthetic": mean_calculator(synthetic_acc_scores),
                              "Avg. F1-Score Synthetic": mean_calculator(synthetic_f1_scores),
                              "Avg. Accuracy Yelp": mean_calculator(yelp_acc_scores),
                              "Avg. F1-Score Yelp": mean_calculator(yelp_f1_scores)}

In [44]:
pd.DataFrame(score_dict).T.sort_values(by=['Avg. F1-Score Synthetic','Avg. F1-Score Yelp'],ascending=False)

Unnamed: 0,Avg. Accuracy Synthetic,Avg. F1-Score Synthetic,Avg. Accuracy Yelp,Avg. F1-Score Yelp
Naive Bayes,54.316596,70.349471,53.423514,69.618941
SVM,54.316596,70.349471,53.423514,69.618941
XGBoost Classifier,54.210355,70.197807,53.423514,69.618941
Random Forest,53.465106,68.782169,53.423514,69.618941
KNN,51.013759,63.931576,53.423514,69.618941
Decision Tree,53.197021,56.378203,50.277446,41.590436
Logistic Regression,51.763262,43.66172,53.423514,69.618941


The accuracy and F1-scores are lower using word embeddings. This may be due to training on a smaller dataset, and the fact that food descriptions are more keyword dependent than semantic meanings.

# Classification on Combined Datasets

We test how the baseline classifiers preform on the combined synthetic and yelp datasets.

In [39]:
df = pd.concat([synthetic_train_df, yelp_train_df], axis=0).sample(frac=1,random_state=42).reset_index(drop=True)
df

Unnamed: 0,dish_name,description,cuisine,diet
0,Tom yum (hot & sour) soup with prawns,"700ml chicken stock 1 lemongrassstalk, bruised...",French,Non-Vegetarian
1,Badshahi Chicken Recipe,Badshahi Chicken Recipe is a simple chicken re...,Indian,Non-Vegetarian
2,Plain Mixed Vegetable,with Steam Rice,Chinese,Vegetarian
3,Pork Lo Mein Tray,,Chinese,Non-Vegetarian
4,Vegetable Minimalist Tofu Spaghetti,A delicious and healthy vegan lunch option fea...,French,Vegetarian
...,...,...,...,...
2762,Chicken With Mushroom Sauce Recipe,is a classic combination that leaves you droo...,Continental,Non-Vegetarian
2763,Vegetable Spring Rolls,Crispy fried rolls filled with a medley of fin...,Chinese,Vegetarian
2764,Instant Granola In A Microwave Recipe,"Super crispy, crunchy, toasty granola, Instant...",Continental,Vegetarian
2765,French Toast,Slices of bread soaked in an egg and milk mixt...,American,Vegetarian


## Word Embedding

In [45]:
import re

X = df['dish_name'] + ' ' + df['cuisine'] + ' ' + df['description']
X = X.fillna('').apply(lambda x: re.sub(r'[^A-Za-z\s]', '', x.lower())) # Clean text data for preparation
y = df['diet']

# Tokenize the sentences (rows of X)
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in X]

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Calculate the average of the Word2Vec vectors of all words in a the tokenized document    
document_embeddings = np.array([get_document_embedding(s, word2vec_model.wv, word2vec_model.wv.vector_size) for s in tokenized_corpus])

# Encode the target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y) 

In [46]:
def cross_val_evaluations(model, cv=5):

    scores = cross_val_score(model, document_embeddings, y_encoded, cv=5, scoring="f1")
    accuracy_scores = cross_val_score(model, document_embeddings, y_encoded, cv=5, scoring="accuracy")

    d = {'average f1-score': scores.mean()*100,
        'average accuracy': accuracy_scores.mean()*100}
    
    return d

In [47]:
# Calculate average accuracy and f1-scores on each cv-fold

model_name_list = [model for model in models_dict_2.keys()]
f1_score_list = [cross_val_evaluations(model)['average f1-score'] for model in models_dict_2.values()]
accuracy_list = [cross_val_evaluations(model)['average accuracy'] for model in models_dict_2.values()]

cv_score_dict = {'Model': model_name_list,
              'Average F1-score': f1_score_list,
              'Average Accuracy': accuracy_list}

cv_score_df = pd.DataFrame(cv_score_dict).sort_values(by='Average F1-score', ascending=False).reset_index(drop=True)
cv_score_df

Unnamed: 0,Model,Average F1-score,Average Accuracy
0,XGBoost Classifier,72.543321,69.0632
1,SVM,70.76444,59.77569
2,Logistic Regression,70.376512,68.449155
3,Random Forest,65.196646,64.763384
4,Decision Tree,64.383218,60.64303
5,KNN,61.937932,60.426489
6,Naive Bayes,52.229974,53.415045


## TF-IDF

In [52]:
X_copy = X.copy()
y_copy = y.copy()

X_tfidf = tfidf_vectorizer.fit_transform(X_copy)

le = LabelEncoder()
y_encoded = le.fit_transform(y_copy) # Non-Veg = 0, Veg = 1


def cross_val_evaluations(model, cv=5):

    scores = cross_val_score(model, X_tfidf, y_encoded, cv=5, scoring="f1")
    accuracy_scores = cross_val_score(model, X_tfidf, y_encoded, cv=5, scoring="accuracy")

    d = {'average f1-score': scores.mean()*100,
        'average accuracy': accuracy_scores.mean()*100}
    return d

In [53]:
# Calculate average accuracy and f1-scores on each cv-fold

model_name_list = [model for model in models_dict_1.keys()]
f1_score_list = [cross_val_evaluations(model)['average f1-score'] for model in models_dict_1.values()]
accuracy_list = [cross_val_evaluations(model)['average accuracy'] for model in models_dict_1.values()]

cv_score_dict = {'Model': model_name_list,
              'Average F1-score': f1_score_list,
              'Average Accuracy': accuracy_list}

cv_score_df = pd.DataFrame(cv_score_dict).sort_values(by='Average F1-score', ascending=False).reset_index(drop=True)

cv_score_df

Unnamed: 0,Model,Average F1-score,Average Accuracy
0,XGBoost Classifier,92.464988,91.434969
1,Decision Tree,92.227729,91.000973
2,Logistic Regression,89.855971,88.434662
3,SVM,89.267113,87.494533
4,Naive Bayes,88.081064,86.447014
5,Random Forest,83.042052,80.483089
6,KNN,66.573319,54.824162


TF-IDF still preforms better than word embedding, hence classifying vegetarian and non-vegetarian dishes may depend more on keywords such as chicken, pork, beef, ect. 