In [1]:
import pandas as pd
from pandas import DataFrame
from functools import partial

In [2]:
# Switch the working directory to the repository checkout so the relative
# paths used by later cells (e.g. "data/..." and "./models/...") resolve.
%pwd
%cd "~/Documents/GitHub/smart-tab-grouping"


/Users/Rrando/Documents/GitHub/smart-tab-grouping


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
from rouge_score import rouge_scorer

In [4]:
# Load the multi-tab and single-tab evaluation sets (paths are relative to the
# working directory).
multitab_tests = pd.read_csv("data/individual_tests/private/all_users2.csv")
single_tab_tests = pd.read_csv("data/individual_tests/single_tab_validation.csv")
# Item assignment is required here: attribute assignment (`df.keywords = ""`)
# only overwrites a column that already exists and otherwise just sets a plain
# Python attribute, leaving the frame without the "keywords" column that
# compute_topic_keywords_single reads via row["keywords"].
single_tab_tests["keywords"] = ""

In [5]:
# Scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity


In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
import numpy as np

In [8]:
# Feature-extraction pipeline used by cos_sim; device=-1 selects the CPU.
embedder = pipeline(
    task="feature-extraction",
    model="sentence-transformers/all-MiniLM-L6-v2",
    device=-1,
)


Device set to use cpu


In [9]:
def cos_sim(s1, s2):
    """Return the cosine similarity between the embeddings of two strings.

    Each string is run through the module-level ``embedder``; the first item
    of its output is averaged along axis 0 (presumably the token axis - TODO
    confirm) to obtain one vector per string.

    Returns:
        A 0-d numpy array holding the similarity score.
    """
    vec_a, vec_b = (np.mean(embedder(text)[0], axis=0) for text in (s1, s2))
    # cosine_similarity expects 2-D inputs, so each vector becomes a single row.
    score_matrix = cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))
    return score_matrix.squeeze()



In [10]:
# Quick check of cos_sim on a pair of single-word inputs.
cos_sim("Dogs", "Apple")

array(0.31603999)

In [11]:
def compute_scores(row, pred_key=None):
    """Score one prediction against its label.

    Args:
        row: Mapping-like record with a 'label' entry and the prediction
            stored under ``pred_key``.
        pred_key: Key of the prediction to evaluate in ``row``.

    Returns:
        Dict with the ROUGE-1/2/L F-measures, the character lengths of the
        prediction and the label, and their embedding cosine similarity.
    """
    reference = row['label']
    prediction = row[pred_key]
    rouge = scorer.score(reference, prediction)

    metrics = {kind: rouge[kind].fmeasure for kind in ('rouge1', 'rouge2', 'rougeL')}
    metrics['pred_len'] = len(prediction)
    metrics['label_len'] = len(reference)
    metrics['cos_sim'] = cos_sim(reference, prediction)
    return metrics


In [12]:
def get_avg_scores(input_df: DataFrame, compare_column: str):
    """Average the per-row metrics of ``compare_column`` over ``input_df``.

    Args:
        input_df: Frame whose rows are passed to ``compute_scores``.
        compare_column: Name of the column holding the predictions.

    Returns:
        Dict mapping each metric name to its mean across all rows.
    """
    score_row = partial(compute_scores, pred_key=compare_column)
    # result_type='expand' turns each returned dict into one column per metric.
    per_row_metrics = input_df.apply(score_row, axis=1, result_type='expand')
    column_means = per_row_metrics.mean()
    return column_means.to_dict()


In [13]:
import os
import sys

# Make the project's src/ directory importable. The path is resolved against
# the current working directory (set to the repository root by the earlier
# %cd cell) instead of a hard-coded user-specific absolute path, so the
# notebook also runs from other checkouts.
sys.path.append(os.path.join(os.getcwd(), "src"))
from util.tab_titles import T5TopicGenerator

In [14]:
# Topic generator built with its default arguments; the compute_topic* helper
# functions read this module-level name when they are called.
topic_gen = T5TopicGenerator()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


/Users/Rrando/Documents/GitHub/smart-tab-grouping


In [15]:
def compute_topic_keywords(row, legacy=False):
    """Generate a topic for a multi-tab row from its titles and keywords.

    Args:
        row: Record whose "three_titles" entry is a newline-separated string
            and whose "keywords" entry is a comma-separated string.
        legacy: Forwarded unchanged to ``topic_gen.get_topic_with_keywords``.

    Returns:
        Whatever the module-level ``topic_gen`` returns for the request.
    """
    titles = row["three_titles"].split('\n')
    keyword_list = row["keywords"].split(',')
    request = {"documents": titles, "keywords": keyword_list}
    return topic_gen.get_topic_with_keywords(request, legacy=legacy)

In [16]:
def compute_topic_keywords_single(row, legacy=False):
    """Generate a topic for a single-tab row from its title and keywords.

    Args:
        row: Record with a "title" entry and a comma-separated "keywords"
            string.
        legacy: Forwarded unchanged to ``topic_gen.get_topic_with_keywords``.

    Returns:
        Whatever the module-level ``topic_gen`` returns for the request.
    """
    keyword_list = row["keywords"].split(',')
    # The single title is wrapped in a list to match the "documents" format.
    request = {"documents": [row["title"]], "keywords": keyword_list}
    return topic_gen.get_topic_with_keywords(request, legacy=legacy)

In [17]:
def compute_topic(row):
    """Generate a topic for a multi-tab row from its titles only.

    Args:
        row: Record whose "three_titles" entry is a newline-separated string.

    Returns:
        Whatever the module-level ``topic_gen.get_topic`` returns.
    """
    titles = row["three_titles"].split('\n')
    return topic_gen.get_topic({"documents": titles})

In [18]:

# Generate a keyword-aware topic for every multi-tab row with the current topic_gen.
multitab_tests["recomputed_titles_keywords"] = multitab_tests.apply(compute_topic_keywords, axis=1)
#multitab_tests["recomputed_title_no_keywords"] = multitab_tests.apply(lambda row: compute_topic(row), axis=1)
# Look at what OpenAI can do for generating topics from a set of tabs
#llm_topic_gen_no_keywords = OpenAITopicGenerator(support_keywords=False)
#multitab_tests["openai_keywords"] = multitab_tests.apply(lambda row: llm_topic_gen_keywords.get_topic({"documents": row.three_titles.split("\n"), "keywords": row.keywords.split(",")}), axis=1)
#print_rouge_scores(multitab_tests, "openai_keywords")
# Compare with fine tuned model
#topic_gen = T5TopicGenerator(model_name="./models/gentle-pyramid-114")
#multitab_tests["recomputed_title_keywords_pyramid"] = multitab_tests.apply(lambda row: compute_topic_keywords(row, legacy=False), axis=1)
#print_rouge_scores(multitab_tests, "recomputed_title_keywords_pyramid")


In [32]:
# Checkpoints to evaluate; the evaluation loop loads each one from
# ./models/<name>. None of them uses the legacy data format.
models = [
    {"name": checkpoint, "legacy_data_format": False}
    for checkpoint in (
        "cool-yogurt-98",
        "dainty-blaze-127",
        "major-cloud-188",
        "dainty-river-189",
        "gallant-sunset-190",
        "upbeat-eon-195",
    )
]

# pious-butterfly-170 is broken


In [33]:
# Replace "keywords" with an all-missing column (the empty Series aligns to
# no rows), then fill every missing value in the frame with "".
single_tab_tests = single_tab_tests.assign(keywords=pd.Series(dtype=str)).fillna("")

In [34]:
single_tab_score = []
multi_tab_score = []

# One entry per evaluation set: (frame, per-row topic function, label printed
# after generation, list collecting that set's averaged scores).
eval_sets = (
    (multitab_tests, compute_topic_keywords, "MultiTab Tests", multi_tab_score),
    (single_tab_tests, compute_topic_keywords_single, "Single Tab Tests", single_tab_score),
)

for model_info in models:
    name = model_info["name"]
    # Rebinds the module-level topic_gen that the compute_topic_keywords* helpers read.
    topic_gen = T5TopicGenerator(model_name=f"./models/{name}")
    col = f"recomputed_title_keywords_{name}"
    use_legacy = model_info["legacy_data_format"]

    for frame, topic_fn, label, bucket in eval_sets:
        # Store this model's predictions in a per-model column, then average its metrics.
        frame[col] = frame.apply(lambda row: topic_fn(row, legacy=use_legacy), axis=1)
        print(f"{name} - {label}")
        score = get_avg_scores(frame, col)
        score["model"] = name
        bucket.append(score)

    
    

/Users/Rrando/Documents/GitHub/smart-tab-grouping
cool-yogurt-98 - MultiTab Tests
cool-yogurt-98 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
dainty-blaze-127 - MultiTab Tests
dainty-blaze-127 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
major-cloud-188 - MultiTab Tests
major-cloud-188 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
dainty-river-189 - MultiTab Tests
dainty-river-189 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
gallant-sunset-190 - MultiTab Tests
gallant-sunset-190 - Single Tab Tests
/Users/Rrando/Documents/GitHub/smart-tab-grouping
upbeat-eon-195 - MultiTab Tests
upbeat-eon-195 - Single Tab Tests


In [35]:
# One row of averaged metrics per evaluated model, for each evaluation set.
single_tab_df = pd.DataFrame(data=single_tab_score)
multi_tab_df = pd.DataFrame(data=multi_tab_score)

In [36]:
# Averaged metrics per model on the multi-tab test set.
multi_tab_df

Unnamed: 0,rouge1,rouge2,rougeL,pred_len,label_len,cos_sim,model
0,0.328373,0.072222,0.328373,16.333333,11.375,0.481831,cool-yogurt-98
1,0.275694,0.069444,0.275694,13.875,11.375,0.453263,dainty-blaze-127
2,0.260119,0.052083,0.260119,13.104167,11.375,0.433384,major-cloud-188
3,0.265972,0.052083,0.265972,12.8125,11.375,0.449383,dainty-river-189
4,0.181944,0.0,0.181944,11.875,11.375,0.385708,gallant-sunset-190
5,0.270139,0.03125,0.270139,13.25,11.375,0.444296,upbeat-eon-195


In [37]:
# Averaged metrics per model on the single-tab test set.
single_tab_df

Unnamed: 0,rouge1,rouge2,rougeL,pred_len,label_len,cos_sim,model
0,0.315741,0.115741,0.315741,14.972222,8.527778,0.462089,cool-yogurt-98
1,0.381019,0.111111,0.381019,13.638889,8.527778,0.521137,dainty-blaze-127
2,0.421429,0.140741,0.421429,13.111111,8.527778,0.540664,major-cloud-188
3,0.372222,0.111111,0.372222,12.222222,8.527778,0.491993,dainty-river-189
4,0.27963,0.083333,0.27963,10.75,8.527778,0.464003,gallant-sunset-190
5,0.387963,0.157407,0.387963,13.027778,8.527778,0.522846,upbeat-eon-195


In [None]:
# List the columns currently present on the single-tab frame.
single_tab_tests.columns

In [None]:
from jobs.tune_base import keyword_prompt

In [None]:
# Inspect the prompt produced for a placeholder document name and an empty second argument.
keyword_prompt.generate_prompt("DOC NAME", "")

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline, T5Config


In [None]:
# Load one checkpoint, reading its config explicitly and passing it to the model.
name = "cool-yogurt-98"
checkpoint_dir = f"./models/{name}"
c = T5Config.from_pretrained(checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir, config=c)

In [None]:
# Number of blocks in the decoder stack.
len(model.decoder.block)

In [None]:
# Number of blocks in the encoder stack.
len(model.encoder.block)

In [None]:
# Layer count recorded on the model config.
model.config.num_layers

In [None]:
# Odd block indices below 8, i.e. every other block: [1, 3, 5, 7].
layers_to_remove = list(range(1, 8, 2))

In [None]:
# Display the block indices selected for removal.
layers_to_remove

In [None]:
# Drop the encoder blocks listed in layers_to_remove instead of repeating the
# indices by hand, so the selection and the deletion cannot drift apart.
# Deleting from the highest index down keeps the remaining indices valid while
# the block list shrinks.
for layer_idx in sorted(layers_to_remove, reverse=True):
    del model.encoder.block[layer_idx]


In [None]:
# Drop the decoder blocks listed in layers_to_remove instead of repeating the
# indices by hand, so the selection and the deletion cannot drift apart.
# Deleting from the highest index down keeps the remaining indices valid while
# the block list shrinks.
for layer_idx in sorted(layers_to_remove, reverse=True):
    del model.decoder.block[layer_idx]


In [None]:
# Number of blocks currently in the decoder stack.
len(model.decoder.block)

In [None]:
# Record the current number of encoder and decoder blocks on the config.
model.config.num_layers = len(model.encoder.block)
model.config.num_decoder_layers = len(model.decoder.block)

In [None]:
# NOTE(review): generate_response and the tokenizer it uses are defined in
# cells further down; on a fresh kernel those cells must run before this one.
generate_response("Topic from keywords: . titles: \n Dogs and Cats are awesome but buy the Tacos")

In [None]:
# Persist the model and the tokenizer side by side in one directory.
export_dir = "./models/test_remove/"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
# Load the checkpoint stored under ./models/test_remove with an explicitly loaded config.
reload_dir = "./models/test_remove"
c = T5Config.from_pretrained(reload_dir)
model = T5ForConditionalGeneration.from_pretrained(reload_dir, config=c)

In [None]:
#model = test_removd

In [None]:
# Run the sample prompt against whatever `model` is currently bound
# (the reloaded checkpoint when the cells are run top to bottom).
generate_response("Topic from keywords: . titles: \n Dogs and Cats are awesome but buy the Tacos")

In [None]:
# Number of blocks currently in the encoder stack.
len(model.encoder.block)

In [None]:
# Print the name of every model parameter. A dedicated loop variable is used
# because rebinding the notebook-level `name` here would change the checkpoint
# path that other cells build from f"./models/{name}".
for param_name, _param in model.named_parameters():
    print(param_name)

In [None]:
# Print each encoder block together with its position in the stack.
for layer_idx, encoder_block in enumerate(model.encoder.block):
    print(f"Layer {layer_idx}: {encoder_block}")


In [None]:
# Display the encoder's block list.
model.encoder.block

In [None]:
# Load the model and tokenizer for the checkpoint selected by `name`. The
# checkpoint's own config is loaded explicitly here so this cell does not
# depend on a `config` variable being defined elsewhere in the kernel.
checkpoint_dir = f"./models/{name}"
config = T5Config.from_pretrained(checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir, config=config)
tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)


In [None]:
def generate_response(prompt, max_tokens=14):
    """Generate one decoded sequence for ``prompt`` with the current model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Input text to tokenize and feed to the model.
        max_tokens: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(
        encoded.input_ids,
        max_length=max_tokens,
        num_return_sequences=1,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [None]:
# Try the generator on a prompt with empty keywords and a single short title.
generate_response("Topic from keywords: . titles: \n Groceries")