In [1]:
import pandas as pd
import pickle
import re
import spacy
import numpy as np

In [2]:
with open('data/item_info.pkl', 'rb') as pickle_file:
    item_info = pickle.load(pickle_file)
len(item_info)

1897

In [3]:
def clean_text(s):
    return re.sub(r'[^a-zA-Z]', ' ', s).lower().split()

In [4]:
all_data = pd.DataFrame(columns=["ingredients", "is_vegan"])

for item_id in item_info:
    all_data = all_data.append({
        "ingredients": clean_text(item_info[item_id]['ingredients']),
        "is_vegan": item_info[item_id]["is_vegan"]
    }, ignore_index=True)

In [49]:
all_data['ingredients'] = all_data['ingredients'].apply(lambda l: " ".join(l))

In [50]:
X = all_data['ingredients']
y = all_data['is_vegan']

## TF-IDF Pipeline

In [51]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline

In [134]:
pipeline = Pipeline([
    ('countvec', CountVectorizer(
                    lowercase=False,
                    tokenizer=lambda x:x,
                    ngram_range=(2,5),
                    analyzer="word"
                )
    ),
    ('tf-idf', TfidfTransformer(
                    norm='l2',
                )
    )
]).fit(X)
pipeline

Pipeline(steps=[('countvec',
                 CountVectorizer(lowercase=False, ngram_range=(2, 5),
                                 tokenizer=<function <lambda> at 0x7fc6b3d2e310>)),
                ('tf-idf', TfidfTransformer())])

## Logistic Regression


In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [137]:
X_train_tfidf = pipeline.transform(X_train)
X_test_tfidf = pipeline.transform(X_test)

In [138]:
y_train = y_train.astype(np.int8)
y_test = y_test.astype(np.int8)

In [139]:
model = LogisticRegression().fit(X_train_tfidf, y_train)
model.score(X_test_tfidf, y_test)

0.881578947368421

# Tuning

In [295]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def generate_ngram_pairs(minimum, maximum):
    pairs = []
    for i in range(minimum, maximum+1):
        for j in range(minimum, maximum+1):
            if i <= j:  
                pairs.append((i,j))
            
    return pairs

def tune_lr(data, params):
    passed_params = {}
    for p in params:
        passed_params[p] = params[p]

    defaults = {
        "test_size": 0.2,
        "seed": 1,
        "ngram_range": (1,1),
        "analyzer": "word",
        "norm": "l2",
        "penalty": "l2",
        "C": 1,
        "class_weight": "balanced", # can be None or imbalanced
        "solver": "lbfgs",
        "l1_ratio": 0.5, # only for penalty elasticnet
    }
    
    for d in defaults:
        if d not in params:
            params[d] = defaults[d]
            
    
    X = all_data['ingredients']
    y = all_data['is_vegan']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=params['test_size'], random_state=params['seed'])
    
    pipeline = Pipeline([
        ('countvec', CountVectorizer(
                        lowercase=False,
                        tokenizer=lambda x:x,
                        ngram_range=params['ngram_range'],
                        analyzer=params['analyzer']
                    )
        ),
        ('tf-idf', TfidfTransformer(
                        norm=params['norm'],
                    )
        )
    ]).fit(X)
    
    X_train_tfidf = pipeline.transform(X_train)
    X_test_tfidf = pipeline.transform(X_test)
    y_train = y_train.astype(np.int8)
    y_test = y_test.astype(np.int8)        
    
    if params['penalty'] != 'elasticnet':
        params['l1_ratio'] = None
    
    model = LogisticRegression(
                C=params['C'],
                penalty=params['penalty'],
                class_weight=params['class_weight'],
                random_state=params['seed'],
                solver=params['solver'],
                l1_ratio=params['l1_ratio']
            ).fit(X_train_tfidf, y_train)
    
    score = model.score(X_test_tfidf, y_test)
    print(f"Accuracy: {score * 100:.2f}% with {passed_params}")
    return model, score

In [299]:
n_gram_min = 1
n_gram_max = 6
best_acc = 0
best_model = None
best_pair = None

for n_gram_pair in generate_ngram_pairs(n_gram_min,n_gram_max):
    model, acc = tune_lr(all_data,
        {
            "ngram_range": n_gram_pair,
        }
    )
    if acc > best_acc:
        print("New best")
        best_acc = acc
        best_model = model
        best_pair = n_gram_pair
print(f"Best pair: {best_pair} with accuracy of {best_acc*100:.2f}%")

Accuracy: 84.74% with {'ngram_range': (1, 1)}
New best
Accuracy: 90.00% with {'ngram_range': (1, 2)}
New best
Accuracy: 90.79% with {'ngram_range': (1, 3)}
New best
Accuracy: 90.53% with {'ngram_range': (1, 4)}
Accuracy: 91.32% with {'ngram_range': (1, 5)}
New best
Accuracy: 90.53% with {'ngram_range': (1, 6)}
Accuracy: 89.74% with {'ngram_range': (2, 2)}
Accuracy: 91.32% with {'ngram_range': (2, 3)}
Accuracy: 91.32% with {'ngram_range': (2, 4)}
Accuracy: 91.58% with {'ngram_range': (2, 5)}
New best
Accuracy: 91.05% with {'ngram_range': (2, 6)}
Accuracy: 91.58% with {'ngram_range': (3, 3)}
Accuracy: 90.53% with {'ngram_range': (3, 4)}
Accuracy: 89.47% with {'ngram_range': (3, 5)}
Accuracy: 89.47% with {'ngram_range': (3, 6)}
Accuracy: 89.21% with {'ngram_range': (4, 4)}
Accuracy: 88.68% with {'ngram_range': (4, 5)}
Accuracy: 88.42% with {'ngram_range': (4, 6)}
Accuracy: 87.89% with {'ngram_range': (5, 5)}
Accuracy: 88.16% with {'ngram_range': (5, 6)}
Accuracy: 86.84% with {'ngram_range

In [300]:
model_name = "tf-idf_lr_ngrams2-5"

In [303]:
from joblib import dump, load
with open(f"./models/{model_name}.joblib", "wb") as f:
    dump(best_model, f)
with open(f"./models/model.txt", "w") as f:
    f.write(model_name)