This notebook benchmarks ML topic-generation models (downloaded into the `./models` directory, or `./models_onnx` for the quantized ONNX variants) against several validation sets.

The multi-tab 'all_users' dataset is read from a private directory and may not be publicly available; the single-tab validation dataset is public.


In [1]:
import pandas as pd
from pandas import DataFrame
from functools import partial

In [2]:
# Switch the working directory to the repository root: the data and model paths
# used in later cells are relative to it.
# NOTE(review): this assumes the repo is checked out under ~/Documents/GitHub —
# adjust the path for other machines.
%pwd
%cd "~/Documents/GitHub/smart-tab-grouping"


/Users/Rrando/Documents/GitHub/smart-tab-grouping


In [3]:
from rouge_score import rouge_scorer

In [4]:
# Validation sets, read relative to the current working directory.
# The multi-tab set lives under private/ and may not be available to everyone.
multitab_tests = pd.read_csv("data/individual_tests/private/all_users2.csv")
single_tab_tests = pd.read_csv("data/individual_tests/single_tab_validation.csv")
# Benchmark the single-tab set with empty keywords. Item assignment is used
# instead of `single_tab_tests.keywords = ""` because attribute assignment only
# updates a column that already exists; when the column is missing it merely
# sets a plain Python attribute and leaves the frame without `keywords`.
single_tab_tests["keywords"] = ""

garbled_tests = pd.read_csv("data/individual_tests/garbled.csv")



In [5]:
# Set `keywords` to an empty string for every garbled-title row.
garbled_tests.loc[:, "keywords"] = ""

In [6]:
garbled_tests

Unnamed: 0,url,title,bad-output,keywords
0,https://www.neatorama.com/2018/12/04/The-Glamo...,"The Glamorous, Sexist History of the Women’s R...",Women'Sroom,
1,http://www.bunkhistory.org/resources/bloomberg...,"The Glamorous, Sexist History of the Women’s R...",Women'Sexist History,
2,https://uiwomenscenter.wordpress.com/2019/11/1...,The evolution of the women’s “rest”room,Women'rroom,
3,https://getpocket.com/explore/item/how-to-turn...,How to Turn Bad Anxiety Into Good Anxiety,AnXiety,
4,https://www.bbc.com/,"BBC Home - Breaking News, World News, US News,...",Breakving,
5,https://getpocket.com/explore/item/should-you-...,Should You Drink Cold or Warm Water? The Benef...,Drinkling,
6,https://getpocket.com/explore/item/how-to-make...,How To Make Cacio e Pepe: The Easiest Method f...,Cacion,
7,https://en.wikipedia.org/wiki/1982_British_Arm...,1982 British Army Gazelle friendly fire incident,Firearmous,
8,https://www.deviantart.com/szigi63/art/Fairies...,Fairies of the Autumn forest by Szigi63 on Dev...,Fairship,


In [8]:
from spellchecker import SpellChecker
# Shared spell checker; is_clean_string() uses it to flag unknown words.
spell = SpellChecker()
# Load extra company / search-engine names into the checker's word list.
spell.word_frequency.load_words(['microsoft', 'apple', 'google', 'bing', 'search', 'duckduckgo', 'yahoo'])


def _has_lower_to_upper_switch(word: str) -> bool:
    """Return True if a lowercase letter is directly followed by an uppercase one."""
    # isupper()/islower() are used rather than comparing against upper()/lower():
    # uncased letters equal both of their case mappings and would otherwise be
    # reported as a lowercase -> uppercase switch.
    return any(
        prev.isalpha() and cur.isalpha() and prev.islower() and cur.isupper()
        for prev, cur in zip(word, word[1:])
    )


def is_clean_string(s: str) -> bool:
    """Heuristically decide whether a generated string looks well-formed.

    The string is split on whitespace and every word is checked:

    * A word containing an apostrophe is rejected when it has more than one
      apostrophe, or when more than one character sits on *both* sides of it
      (e.g. "Women'Sroom"). Any other apostrophe word (e.g. "dog's") is
      accepted without the spelling / casing checks below.
    * A word the module-level `spell` checker reports as unknown is rejected.
    * A word in which a lowercase letter is immediately followed by an
      uppercase letter (e.g. "AnXiety") is rejected.

    Args:
        s: The text to check.

    Returns:
        True if no word fails a check (including when `s` has no words),
        otherwise False.
    """
    for word in s.split():
        if "'" in word:
            # Splitting a word that contains "'" always yields >= 2 segments.
            segments = word.split("'")
            if len(segments) > 2:
                return False
            if len(segments[0]) > 1 and len(segments[1]) > 1:
                return False
            continue  # don't check spelling with 's
        if spell.unknown([word]):
            return False
        if _has_lower_to_upper_switch(word):
            return False
    return True
            
        
    

In [9]:
# ROUGE scorer shared by compute_scores(): ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
import numpy as np

In [12]:
# Feature-extraction pipeline used by cos_sim(); with device=-1 the run reports "Device set to use cpu".
embedder = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2", device=-1)


Device set to use cpu


In [13]:
def cos_sim(s1, s2):
    """Cosine similarity between the averaged embeddings of two strings.

    Each string is passed through the module-level `embedder`; the first
    element of its output (presumably one vector per token — confirm against
    the pipeline's output format) is averaged over axis 0 into one vector.

    Returns:
        The similarity as a 0-d NumPy array (sklearn's 1x1 result, squeezed).
    """
    def pooled_row(text):
        # Average the vectors, then shape as a single-row matrix for sklearn.
        return np.mean(embedder(text)[0], axis=0).reshape(1, -1)

    first_vec = pooled_row(s1)
    second_vec = pooled_row(s2)
    return cosine_similarity(first_vec, second_vec).squeeze()

    

In [14]:
cos_sim("Dogs", "Apple")

array(0.31603999)

In [15]:
def compute_scores(row, pred_key=None):
    """Score one prediction against its reference label.

    Args:
        row: Mapping-like row holding the reference under 'label' and the
            prediction under `pred_key`.
        pred_key: Name of the prediction field. Callers must supply it; the
            default of None is looked up as-is.

    Returns:
        dict with the ROUGE-1/2/L F-measures, `len()` of the prediction and of
        the label, their embedding cosine similarity, and a 1/0 `clean` flag
        from is_clean_string().
    """
    reference = row['label']
    prediction = row[pred_key]
    rouge = scorer.score(reference, prediction)

    metrics = {name: rouge[name].fmeasure for name in ('rouge1', 'rouge2', 'rougeL')}
    metrics['pred_len'] = len(prediction)
    metrics['label_len'] = len(reference)
    metrics['cos_sim'] = cos_sim(reference, prediction)
    metrics['clean'] = int(is_clean_string(prediction))
    return metrics

def compute_scores_no_label(row, pred_key=None):
    """Score a prediction when the row has no reference label.

    Only the 1/0 `clean` flag from is_clean_string() is reported, computed on
    the value stored under `pred_key`.
    """
    prediction = row[pred_key]
    return {'clean': int(is_clean_string(prediction))}



In [16]:
def get_avg_scores(input_df: DataFrame, compare_column: str):
    """Average the per-row scores of one prediction column.

    Rows are scored with compute_scores() when the frame has a 'label' column
    and with compute_scores_no_label() otherwise.

    Args:
        input_df: Frame holding the predictions (and optionally 'label').
        compare_column: Name of the column containing the predictions.

    Returns:
        dict mapping each metric name to its mean over all rows.
    """
    if 'label' in input_df.columns:
        row_scorer = compute_scores
    else:
        row_scorer = compute_scores_no_label

    per_row_scores = input_df.apply(
        partial(row_scorer, pred_key=compare_column),
        axis=1,
        result_type='expand',
    )
    return per_row_scores.mean().to_dict()


In [17]:
import sys
# Make the repo's `src` directory importable. The path is resolved against the
# current working directory (the repo root after the `%cd` cell above) instead
# of a hardcoded /Users/... location, so it is not tied to one machine.
import os  # stdlib; imported here because only this cell needs it

SRC_DIR = os.path.abspath("src")
if SRC_DIR not in sys.path:  # re-running the cell must not append duplicates
    sys.path.append(SRC_DIR)
from util.tab_titles import T5TopicGenerator, OnnxT5TopicGenerator

In [18]:
def compute_topic_keywords(row, legacy=False, prob_limit=None):
    """Generate a topic for a multi-tab row with the current global `topic_gen`.

    `row["three_titles"]` is split on newlines into the documents and
    `row["keywords"]` on commas into the keywords; `legacy` and `prob_limit`
    are forwarded unchanged to `get_topic_with_keywords`.
    """
    request = {
        "documents": row["three_titles"].split('\n'),
        "keywords": row["keywords"].split(','),
    }
    return topic_gen.get_topic_with_keywords(request, legacy=legacy, prob_limit=prob_limit)

In [19]:
# Generator used by the cells that follow; the benchmark loop further down
# rebinds `topic_gen` for each model it evaluates.
topic_gen = T5TopicGenerator("./models/still-durian-309")

/Users/Rrando/Documents/GitHub/smart-tab-grouping


In [20]:
# Inspect one entry of the generation config's `bad_words_ids`, shown as tokens.
#topic_gen.tokenizer.decode(topic_gen.model.generation_config.bad_words_ids[88])

topic_gen.tokenizer.convert_ids_to_tokens(topic_gen.model.generation_config.bad_words_ids[600])

['▁as', 's', 'f', 'uk', 'ka']

In [21]:
def compute_topic_keywords_single(row, legacy=False, prob_limit=None):
    """Generate a topic for a single-tab row with the current global `topic_gen`.

    The row's `title` is the only document; `row["keywords"]` is split on
    commas. `legacy` and `prob_limit` are forwarded unchanged.
    """
    request = {
        "documents": [row["title"]],
        "keywords": row["keywords"].split(','),
    }
    return topic_gen.get_topic_with_keywords(request, legacy=legacy, prob_limit=prob_limit)

In [22]:
def compute_topic(row):
    """Generate a topic from a row's newline-separated `three_titles`, without keywords."""
    documents = row["three_titles"].split('\n')
    return topic_gen.get_topic({"documents": documents})

In [None]:

# Topic for every multi-tab row using the currently bound `topic_gen` and the
# default `legacy` / `prob_limit` arguments.
multitab_tests["recomputed_titles_keywords"] = multitab_tests.apply(compute_topic_keywords, axis=1)


In [23]:

# Models to benchmark, as {"name", "legacy_data_format"} records.
# `name` is the checkpoint directory name used by the benchmark loop.
torch_models = [
    {"name": model_name, "legacy_data_format": False}
    for model_name in (
        "cool-yogurt-98",
        "dainty-blaze-127",
        "dainty-river-189",
        "gallant-sunset-190",
        "upbeat-eon-195",
        "devoted-puddle-246",
        "genial-tree-283",
        "major-elevator-302",
        "olive-silence-303",
        "sandy-forest-305",
        "still-durian-309",
        "eager-plant-323",
        "dulcet-durian-136",
        "lively-planet-17",
    )
]

# Models listed for the ONNX run (selected when TEST_ONNX is set).
onnx_quantized_models = [
    {"name": model_name, "legacy_data_format": False}
    for model_name in (
        "cool-yogurt-98",
        "dainty-blaze-127",
        "devoted-puddle-246",
        "sandy-forest-305",
        "still-durian-309",
        "eager-plant-323",
    )
]




In [24]:
# Set to True to benchmark the ONNX models (loaded from ./models_onnx) instead
# of the PyTorch checkpoints (loaded from ./models).
TEST_ONNX = False

In [25]:
# Model list for this run, chosen by the TEST_ONNX switch.
models = onnx_quantized_models if TEST_ONNX else torch_models

In [26]:
# Blank out `keywords` for every single-tab row, then replace any remaining
# missing values in the frame with empty strings.
single_tab_tests["keywords"] = ""
single_tab_tests = single_tab_tests.fillna("")

In [27]:
# One averaged-score dict per model, for each validation set.
single_tab_score = []
multi_tab_score = []

for model_info in models:
    name = model_info["name"]
    legacy = model_info["legacy_data_format"]

    # compute_topic_keywords*() read the module-level `topic_gen`, so rebinding
    # it here switches the model under test.
    if TEST_ONNX:
        topic_gen = OnnxT5TopicGenerator(model_name=f"./models_onnx/{name}")
    else:
        topic_gen = T5TopicGenerator(model_name=f"./models/{name}")

    # Predictions are stored in a per-model column on both frames.
    col = f"recomputed_title_keywords_{name}"

    multitab_tests[col] = multitab_tests.apply(
        lambda row: compute_topic_keywords(row, legacy=legacy), axis=1
    )
    print(f"{name} - MultiTab Tests")
    score = {**get_avg_scores(multitab_tests, col), "model": name}
    multi_tab_score.append(score)

    single_tab_tests[col] = single_tab_tests.apply(
        lambda row: compute_topic_keywords_single(row, legacy=legacy), axis=1
    )
    print(f"{name} - Single Tab Tests")
    score = {**get_avg_scores(single_tab_tests, col), "model": name}
    single_tab_score.append(score)
    
    

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


/Users/Rrando/Documents/GitHub/smart-tab-grouping
cool-yogurt-98 - MultiTab Tests
cool-yogurt-98 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
dainty-blaze-127 - MultiTab Tests
dainty-blaze-127 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
dainty-river-189 - MultiTab Tests
dainty-river-189 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
gallant-sunset-190 - MultiTab Tests
gallant-sunset-190 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
upbeat-eon-195 - MultiTab Tests
upbeat-eon-195 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
devoted-puddle-246 - MultiTab Tests
devoted-puddle-246 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
genial-tree-283 - MultiTab Tests
genial-tree-283 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
major-elevator-302 - MultiTab Tests
major-elevator-302 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-t

In [28]:
garbled_tests.title.to_list()

['The Glamorous, Sexist History of the Women’s Restroom Lounge - Neatorama',
 'The Glamorous, Sexist History of the Women’s Restroom Lounge — Bunk History',
 'The evolution of the women’s “rest”room',
 'How to Turn Bad Anxiety Into Good Anxiety',
 'BBC Home - Breaking News, World News, US News, Sports, Business, Innovation, Climate, Culture, Travel, Video & Audio',
 'Should You Drink Cold or Warm Water? The Benefits of Drinking 8 Glasses of Water a Day',
 'How To Make Cacio e Pepe: The Easiest Method for Perfect Results Every Time',
 '1982 British Army Gazelle friendly fire incident',
 'Fairies of the Autumn forest by Szigi63 on Deviant Art']

In [30]:
# One row per benchmarked model: the averaged metrics plus the model name.
single_tab_df = pd.DataFrame(single_tab_score)
multi_tab_df = pd.DataFrame(multi_tab_score)
# NOTE(review): no `garbled_tests_score` is built in the cells shown here, so
# the line below stays disabled.
#garbled_tests_score = pd.DataFrame(garbled_tests_score)

In [32]:
multi_tab_df

Unnamed: 0,rouge1,rouge2,rougeL,pred_len,label_len,cos_sim,clean,model
0,0.328373,0.072222,0.328373,16.333333,11.375,0.481831,0.604167,cool-yogurt-98
1,0.275694,0.069444,0.275694,13.875,11.375,0.453263,0.708333,dainty-blaze-127
2,0.265972,0.052083,0.265972,12.8125,11.375,0.449383,0.854167,dainty-river-189
3,0.181944,0.0,0.181944,11.875,11.375,0.385708,0.875,gallant-sunset-190
4,0.270139,0.03125,0.270139,13.25,11.375,0.444296,0.75,upbeat-eon-195
5,0.328472,0.041667,0.328472,13.291667,11.375,0.503067,0.8125,devoted-puddle-246
6,0.351389,0.045139,0.351389,10.25,11.375,0.51689,0.833333,genial-tree-283
7,0.341667,0.083333,0.341667,12.895833,11.375,0.483792,0.8125,major-elevator-302
8,0.339583,0.041667,0.339583,12.854167,11.375,0.508723,0.791667,olive-silence-303
9,0.349306,0.0625,0.349306,11.125,11.375,0.519434,0.833333,sandy-forest-305


In [33]:
single_tab_df

Unnamed: 0,rouge1,rouge2,rougeL,pred_len,label_len,cos_sim,clean,model
0,0.315741,0.115741,0.315741,14.972222,8.527778,0.462089,0.805556,cool-yogurt-98
1,0.381019,0.111111,0.381019,13.638889,8.527778,0.521137,0.916667,dainty-blaze-127
2,0.372222,0.111111,0.372222,12.222222,8.527778,0.491993,0.888889,dainty-river-189
3,0.27963,0.083333,0.27963,10.75,8.527778,0.464003,0.944444,gallant-sunset-190
4,0.387963,0.157407,0.387963,13.027778,8.527778,0.522846,0.916667,upbeat-eon-195
5,0.410847,0.131481,0.410847,12.472222,8.527778,0.5551,0.916667,devoted-puddle-246
6,0.363095,0.122222,0.363095,9.388889,8.527778,0.506101,0.916667,genial-tree-283
7,0.337698,0.140741,0.337698,12.75,8.527778,0.497118,0.916667,major-elevator-302
8,0.364815,0.12963,0.364815,12.472222,8.527778,0.494205,0.861111,olive-silence-303
9,0.387963,0.12963,0.387963,13.055556,8.527778,0.500136,0.944444,sandy-forest-305


In [36]:
single_tab_tests.columns

Index(['title', 'label', 'keywords',
       'recomputed_title_keywords_cool-yogurt-98',
       'recomputed_title_keywords_dainty-blaze-127',
       'recomputed_title_keywords_dainty-river-189',
       'recomputed_title_keywords_gallant-sunset-190',
       'recomputed_title_keywords_upbeat-eon-195',
       'recomputed_title_keywords_devoted-puddle-246',
       'recomputed_title_keywords_genial-tree-283',
       'recomputed_title_keywords_major-elevator-302',
       'recomputed_title_keywords_olive-silence-303',
       'recomputed_title_keywords_sandy-forest-305',
       'recomputed_title_keywords_still-durian-309',
       'recomputed_title_keywords_eager-plant-323',
       'recomputed_title_keywords_dulcet-durian-136',
       'recomputed_title_keywords_lively-planet-17'],
      dtype='object')

In [38]:
single_tab_tests[["title", "label", "recomputed_title_keywords_lively-planet-17"]]

Unnamed: 0,title,label,recomputed_title_keywords_lively-planet-17
0,Amazon.com : cheese crackers,Groceries,Cheese Crackers
1,"Amazon.com : Hapi hot wasabi peas, 9,9 Ounce T...",Snacks,Food
2,Organic Mint Fields Tea Bags | The Republic of...,Tea,Tea Bags
3,Amazon.com: Swedish Rye Crispbreads Rounds by ...,Crackers,Kitchen
4,Nettle Meadow Kunik | Murray's Cheeses,Cheese,Cheeses
5,"Great Value Whole Vitamin D Milk, Gallon, Plas...",Groceries,Food
6,COSTCO WHOLESALE - Updated January 2025 - 52 P...,Costco review,News
7,Beaverland: How One Weird Rodent Made America:...,Book,Cars
8,Data Collection - MozillaWiki,Data policy,Data Collection
9,U+1F984 Unicorn Face Unicode Character,Unicorn character,Unicorn Characters


In [None]:
single_tab_tests[["title", "label", "recomputed_title_keywords_lively-planet-17"]]