In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import logging
logging.basicConfig(level=logging.INFO)

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/mike/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [4]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to('cuda')
model.eval()

INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /home/mike/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO:pytorch_pretrained_bert.modeling:extracting archive file /home/mike/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpidtz7039
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/seriouseats/seriouseats.csv')
df = df[['title', 'short_description', 'description', 'directions', 'ingredients']]
df.head()

Unnamed: 0,title,short_description,description,directions,ingredients
0,Jammy Fruit Bars,"Simple but satisfying fruit bars, made with yo...","These fruity, oaty bars are inspired by Icelan...",Getting Ready: Adjust oven rack to lower-middl...,For the Dough:||||4 1/2 ounces old fashioned r...
1,Balsamic Glazed Baby Back Ribs,"Smoky, sweet, and tangy ribs.",Nothing can fulfill the defining barbecue trif...,"To make the rub: Mix together brown sugar, sal...",For the Rub:||||2 tablespoons dark brown sugar...
2,The Best Meatball Pizza,Does size matter? When it comes to meatballs o...,Meatball pizza consists of two Italian-America...,Set aside 1/2 cup meatball mixture. Form remai...,1 recipe Italian-American Meatballs in Red Sau...
3,Easy Pork Rillettes (Slow-Cooked Pork Spread),Your guests will think you worked hard on it. ...,Rillettes are an entertainer's godsend. They'r...,Adjust oven rack to lower position and preheat...,"2 pounds boneless, skinless pork shoulder, cut..."
4,Duck Pastrami,"Salty, peppery, and luscious.","Building on past pastrami success, I changed t...","For the Cure: in a small bowl, mix together sa...",For the Cure:||||1/4 cup Kosher salt||||2 teas...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8312 entries, 0 to 8311
Data columns (total 5 columns):
title                8312 non-null object
short_description    2114 non-null object
description          7787 non-null object
directions           8282 non-null object
ingredients          8282 non-null object
dtypes: object(5)
memory usage: 324.8+ KB


In [5]:
def delete_preamble(title):
    if ':' in title:
        title_split = title.split(':')
        return title_split[-1].strip()
    return title

In [6]:
df['title'] = df['title'].map(delete_preamble)

In [7]:
import re
def get_title_post_description(title):
    match = re.search(r".+\((.+)(\)|\.{3})", title)
    if match:
        return match.group(1)
    return title

In [8]:
df['title'] = df['title'].map(get_title_post_description)

In [9]:
def remove_the_best_prefix(title):
    match = re.search(r"The Best (.+)", title)
    if match:
        return match.group(1)
    return title

In [10]:
df['title'] = df['title'].map(remove_the_best_prefix)

In [11]:
df[df.apply(lambda x: 'The Best' in x['title'], axis=1)]

Unnamed: 0,title,short_description,description,directions,ingredients


In [12]:
import torch.nn as nn

class FeatureExtractor(nn.Module):
    def __init__(self, bert_model):
        super(FeatureExtractor, self).__init__()
        self.bert_model = bert_model
    
    def forward(self, input_ids):
        encoded_layers, _ = self.bert_model(input_ids, output_all_encoded_layers=True)
        second_last_layer = encoded_layers[-2]
        features = second_last_layer.max(1)
        return features

In [14]:
m = nn.AdaptiveMaxPool2d((768, 1))
input = torch.randn(2, 5, 768)
print(input.size())
output = m(input)
output.size()

torch.Size([2, 5, 768])


torch.Size([2, 768, 1])

In [17]:
from tqdm import tqdm_notebook

features = []
#extractor = FeatureExtractor(model)
for title in tqdm_notebook(df['title']):
    text = f"[CLS] {title}"
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    tokens_tensor = tokens_tensor.to('cuda')
    with torch.no_grad():
        f = extractor(tokens_tensor)
        features.append(f)
features = torch.cat(features)

HBox(children=(IntProgress(value=0, max=8312), HTML(value='')))




TypeError: "Jammy Fruit Bars" must be <class 'list'>, but received <class 'str'>

In [19]:
# features = bc.encode(df['title'].tolist())

In [23]:
features.shape

(8312, 768)

In [24]:
from annoy import AnnoyIndex
import os

annoy_index = AnnoyIndex(768)
if os.path.isfile("index_bert_pytorch.ann"):
    annoy_index.load("index_bert_pytorch.ann")
else:
    for i, f in tqdm_notebook(enumerate(features)):
        annoy_index.add_item(i, f)

    annoy_index.build(10000)
    annoy_index.save("index_bert_pytorch.ann")

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [27]:
from collections import defaultdict
pd.set_option("max_colwidth", 600)

df_sample = df.sample(15)[['title']]
number_nearest = 3
nearest_cols = defaultdict(list)

for i in df_sample.index.values:
    nearest = annoy_index.get_nns_by_item(df.index.get_loc(i), number_nearest)
    for ii, n in enumerate(nearest):
        nearest_cols[ii].append(df.iloc[n].title)
        
for key, col in nearest_cols.items():
    df_sample[f"n_{key}"] = col
    
df_sample

Unnamed: 0,title,n_0,n_1,n_2
3091,Vegetarian Grilled Zucchini Croque Mademoiselle,Vegetarian Grilled Zucchini Croque Mademoiselle,Roasted Ratatouille Lasagna Napoleons,Chanterelle's Grilled Seafood Sausage with Beurre Blanc Sauce
4792,German Double-Chocolate Cake,German Double-Chocolate Cake,Double-Strawberry Cake,Italian Almond Cake
787,French Colonial Pie,French Colonial Pie,French Peas,The Dutchess
5531,Bananas Foster French Toast,Bananas Foster French Toast,Bananas Foster Pie,Breakfast Garlic Toast
4711,Caramelized Vidalia Onion Mashed Potatoes,Caramelized Vidalia Onion Mashed Potatoes,Curried Egg Salad with Caramelized Onion,Sweet Potato Pancakes Made With Leftover Mashed Sweet Potatoes
3717,Pioneer Woman's Cranberry Sauce,Pioneer Woman's Cranberry Sauce,Pioneer Woman's Sweet Potatoes,Dreena's Traditional Cranberry Sauce
2025,Maple-Rosemary-Bourbon Pecans,Maple-Rosemary-Bourbon Pecans,Blueberry-Maple-Pecan Conserve,Apple-Pecan Bourbon-Caramel Pie
5913,"Grilled Polenta With Marinara, Parmesan, and Basil","Grilled Polenta With Marinara, Parmesan, and Basil","Grilled Zucchini With Capers, Basil and Lemon","Grilled Stuffed Flank Steak With Salami, Fontina, Parmesan, and Bread Crumbs"
1438,Everyman Afterall,Everyman Afterall,Weather Up's Suffering Bastard,Natto
5672,Classic Burgers and Homemade Hamburger Buns,Classic Burgers and Homemade Hamburger Buns,Juicy Turkey Burgers,Thanksgiving Turkey Burgers
