In [9]:
import json, os
from argparse import ArgumentParser
from tqdm import tqdm
from scipy.sparse import coo_matrix
from ortools.sat.python import cp_model


def create_doc_kp_matrix(phrase_occurrences):
    # first create phrase vocab
    phrase_vocab = list(sorted(phrase_occurrences.keys()))
    phrase2idx = {p:i for i, p in enumerate(phrase_vocab)}

    # then create docid2idx
    all_doc_ids = set([])
    for phrase_idx, phrase in tqdm(enumerate(phrase_vocab), desc = "Creating docid2idx"):
        all_doc_ids.update(phrase_occurrences[phrase])
    all_doc_ids = list(sorted(all_doc_ids))
    docid2idx = {docid:i for i, docid in enumerate(all_doc_ids)}

    row, col, data = [], [], []

    for phrase_idx, phrase in tqdm(enumerate(phrase_vocab), desc = "Creating sparse matrix"):
        doc_indices = [docid2idx[docid] for docid in phrase_occurrences[phrase]]
        row.extend(doc_indices)
        col.extend([phrase_idx] * len(doc_indices))
        data.extend([1] * len(doc_indices))

    n_rows = max(row) + 1
    n_cols = max(col) + 1

    sparse_mat = coo_matrix((data, (row, col)), shape=(n_rows, n_cols))

    return sparse_mat.tocsr(), phrase2idx


def optimization_problem(doc_kp_matrix, phrase2idx):
    model = cp_model.CpModel()

    num_j = len(phrase2idx)
    num_i = doc_kp_matrix.shape[0]
    y = [model.NewBoolVar(f'y_{j}') for j in range(num_j)]
    z = [model.NewBoolVar(f'z_{i}') for i in range(num_i)]

    for i in tqdm(range(num_i), desc = "Adding constraints"):
        
        # Create weighted sum expression: S_i = sum(x_ij * y_j)
        S_i = sum(doc_kp_matrix[i, j] * y[j] for j in range(num_j))
        
        # If z_i is True, enforce S_i >= 1
        model.Add(S_i >= 1).OnlyEnforceIf(z[i])
        
        # If z_i is False, enforce S_i == 0
        model.Add(S_i <= 0).OnlyEnforceIf(z[i].Not())

    # Optional: Add objective to maximize number of active z_i
    model.Maximize(sum(z))

    # Solve
    print("Start solving")
    solver = cp_model.CpSolver()
    status = solver.Solve(model)

    # Print results
    if status == cp_model.OPTIMAL:
        print('Solution found')
        res = []
        for j in range(num_j):
            res.append(solver.Value(y[j]))

        number_of_documents_covered = 0
        for i in range(num_i):
            number_of_documents_covered += solver.Value(z[i])
        print("Number of documents covered", number_of_documents_covered)

        return res
    else:
        print('No solution found.')

In [2]:
input_folder = "/scratch/lamdo/s2orc_phrase_vocab/"

input_files = os.listdir(input_folder)
input_files = [os.path.join(input_folder, file) for file in input_files]


phrase_occurrences = {}
for file in tqdm(input_files):
    with open(file) as f:
        temp = json.load(f)

        for k in temp:
            if k not in phrase_occurrences: phrase_occurrences[k] = []
            phrase_occurrences[k].extend(temp[k])

    break

phrase_occurrences = {k:v for k,v in phrase_occurrences.items() if len(v) >= 50}

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:07<?, ?it/s]


In [5]:
doc_kp_matrix, phrase2idx = create_doc_kp_matrix(phrase_occurrences)

Creating docid2idx: 9555it [00:00, 20929.67it/s]
Creating sparse matrix: 9555it [00:01, 6775.55it/s]


In [10]:
test = optimization_problem(doc_kp_matrix, phrase2idx)

Adding constraints:   0%|          | 617/1140871 [02:03<63:33:43,  4.98it/s]


KeyboardInterrupt: 

In [11]:
import json
with open("/home/lamdo/splade/create_concept_splade/s2orc/phrase_vocab_100k.json") as f:
    original_vocab = json.load(f)

with open("/home/lamdo/splade/create_concept_splade/vocab_create/phrase_vocab_s2orc_gitig_.json") as f:
    optimized_vocab = json.load(f)

In [17]:
test = [item for item in original_vocab[:30000] if item not in optimized_vocab]

In [18]:
len(test)

3528

In [19]:
test

['library schools',
 'land surveying',
 'living library',
 'australian business entities',
 'biographical entry',
 '//extension.oregonstate.edu/catalog',
 'dalton guide',
 'pcos',
 'finding articles',
 'cancer death',
 'finding books',
 'less latency time',
 'reduced graphene oxide',
 'tasmanian history',
 'papr reduction',
 'hpv infection',
 'thin film transistor',
 'renal replacement therapy',
 'zno thin film',
 'books collection',
 'multiple sclerosis patients',
 'oakland campus',
 'library research guide',
 'graphene layer',
 'prostate cancer cells',
 'rbf network',
 'cell apoptosis',
 'related articles',
 'adsorption process',
 'information processing method',
 'copd patients',
 'reference sources',
 'power factor correction',
 'fuzzy control system',
 'eeg signals',
 'cell cycle arrest',
 'digital library saves',
 'sliding mode controller',
 'pem fuel cell',
 'hbv infection',
 'serum uric acid',
 'left coronary artery',
 'research resources',
 'breast cancer diagnosis',
 'develop

In [1]:
from transformers import AutoTokenizer

In [2]:
tokenizer_bert = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
tokenizer_roberta = AutoTokenizer.from_pretrained("distilroberta-base")

In [6]:
bert_vocab = tokenizer_bert.vocab
roberta_vocab = tokenizer_roberta.vocab

In [7]:
len([term for term in bert_vocab if term in roberta_vocab])

4320

In [8]:
roberta_vocab

{'ayan': 10790,
 'Ġpitted': 30259,
 'Ġreprim': 30212,
 'Ġintrigued': 28622,
 'Ġsigning': 3442,
 'Regardless': 38861,
 'inical': 44839,
 'design': 23414,
 'Ì': 44025,
 'Ġaccommodate': 9824,
 'ĠMassachusetts': 5517,
 '43': 3897,
 'Ġcoer': 39705,
 'ĠGujarat': 13401,
 'ĠInstruct': 45769,
 'positive': 22173,
 'ĠKids': 9151,
 'Ġfile': 2870,
 'Ġclim': 23377,
 'Ġpalpable': 31966,
 'Ġcreators': 15520,
 'Ġorganic': 6523,
 'cing': 11162,
 'fox': 31587,
 'find': 26559,
 'Ġparam': 40206,
 'Ġviolation': 4565,
 'stro': 26764,
 'Ġcurrently': 855,
 'Ġinvestigation': 803,
 'Ġmarketed': 22472,
 'Library': 44855,
 'icho': 44419,
 'rity': 38039,
 'super': 16101,
 'ĠBannon': 10309,
 'ĠBok': 36335,
 'ĠEpisode': 16012,
 'ĠLebanon': 8398,
 'Ġmomentarily': 38289,
 'minute': 4530,
 'IED': 42382,
 'Malley': 31880,
 'Ġequity': 2355,
 'Years': 43937,
 'aughty': 42942,
 'phony': 29788,
 'ĠCasino': 14049,
 'ĠEspecially': 17570,
 'Ġtalents': 11268,
 'Ġvariability': 36049,
 '*.': 44460,
 'Ġsore': 12867,
 'ĠPistons': 18

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [3]:
tokenizer.tokenize("kershaw")

['ke', '##rs', '##haw']