In [26]:
import numpy as np
from timeit import timeit
from scipy.sparse import load_npz

# Load the sparse matrix stored in csc_codes.npz (presumably CSC format, going by the
# file name and the column slicing used below -- confirm against whatever wrote the file).
csc_codes = load_npz("csc_codes.npz")

def time(fn, number=100):
    """Call ``fn`` ``number`` times via ``timeit`` and print the total elapsed seconds."""
    elapsed = timeit(fn, number=number)
    print(elapsed)

def get_topk(arr, k=10):
    """Return the positions and values of the ``k`` largest entries of a 1-D array.

    Args:
        arr: A 1-D ``np.ndarray``, or anything ``np.array`` can convert into one.
        k: Positive ``int``; how many entries to return. When ``arr`` holds fewer
            than ``k`` entries, all of them are returned.

    Returns:
        ``(indices, values)`` as plain Python lists, ordered from the largest
        value to the smallest.

    Raises:
        AssertionError: If ``k`` is not a positive ``int`` or ``arr`` is not 1-D.
    """
    assert isinstance(k, int) and k > 0
    values = arr if isinstance(arr, np.ndarray) else np.array(arr)
    assert values.ndim == 1

    # Full ascending sort: the last k positions hold the largest entries,
    # so take that tail and flip it to get descending order.
    ascending = values.argsort()
    top = ascending[-k:][::-1]
    return top.tolist(), values[top].tolist()



def baseline():
    """Benchmark body: densify one random column of ``csc_codes`` and take its top 400.

    The ``get_topk`` result is discarded; this function exists only to be timed.
    """
    col = np.random.randint(0, csc_codes.shape[1])
    # Materialise the whole column as a flat dense vector before ranking it.
    dense_col = csc_codes[:, col].toarray().flatten()
    get_topk(arr=dense_col, k=400)
# time(lambda: csc_codes[:,1].todense().flatten(), number=1000)
# time(lambda: topk(x, k=100), number=1000)
# atom_result = atom_query(atom_idx, csc_codes, k=k, lowest_ratio=lowest_ratio, string=True)
# toks, weights = topk(csc_codes[:, atom_idx].toarray().flatten(), k=k)


# test_arr = np.random.rand(10000)

# # timeit
# print(timeit.timeit(lambda: topk(test_arr), number=10000))

def get_top_k_indices(arr, k):
    """Return the indices of the ``k`` largest entries of ``arr``, largest first.

    Uses ``np.argpartition`` to pick the top ``k`` candidates and then sorts
    only those candidates.

    Args:
        arr: 1-D ``np.ndarray`` of any orderable dtype (including unsigned
            integers and booleans).
        k: Number of entries wanted. If ``arr`` has ``k`` or fewer entries, all
            indices are returned. ``k <= 0`` yields an empty result.

    Returns:
        1-D integer ``np.ndarray`` of indices into ``arr``, ordered by
        descending value. Among equal values the larger index comes first.
    """
    n = arr.shape[0]
    if k <= 0 or n == 0:
        # ``arr[-0:]`` would select everything, so handle k == 0 explicitly.
        return np.empty(0, dtype=np.intp)

    if n <= k:
        candidates = np.arange(n)
    else:
        candidates = np.argpartition(arr, -k)[-k:]

    # Sort ascending and reverse instead of sorting on ``-arr``: negating an
    # unsigned array wraps around (-0 stays 0 and would rank first, ahead of
    # every positive value), and negating a boolean array raises a TypeError.
    order = np.argsort(arr[candidates], kind="stable")[::-1]
    return candidates[order]

def topk_optimized(arr, k=10):
    """Return the positions and values of the ``k`` largest entries of ``arr``.

    Delegates the index selection to ``get_top_k_indices``.

    Args:
        arr: ``np.ndarray`` to rank; other types are rejected.
        k: Number of entries wanted.

    Returns:
        ``(indices, values)`` as plain Python lists, in the order produced by
        ``get_top_k_indices``.

    Raises:
        ValueError: If ``arr`` is not a ``np.ndarray``.
    """
    if isinstance(arr, np.ndarray):
        best = get_top_k_indices(arr, k)
        return best.tolist(), arr[best].tolist()
    raise ValueError("Expected input to be a numpy array")

# # timeit    
# print(timeit.timeit(lambda: topk_optimized(test_arr), number=10000))

# topk(csc_codes[:,1].data, k=10)


# import torch
def experimental():
    """Benchmark body: top 400 stored entries of a random ``csc_codes`` column.

    Ranks only the explicitly stored values of the column (no densifying) and
    maps the winning positions back through the column's ``.indices`` array.

    Returns:
        ``(tok_ids, vals)``: the ``.indices`` entries of the selected values
        (presumably token ids, going by the name -- confirm) and the values
        themselves as a list, largest first.
    """
    col = np.random.randint(0, csc_codes.shape[1])
    stored_rows = csc_codes[:, col].indices
    # Positions returned here index into the stored-values array, not the matrix.
    picks, vals = get_topk(csc_codes[:, col].data, k=400)
    return stored_rows[picks], vals

def experimental_two():
    """Benchmark body: same as ``experimental`` but ranked with ``topk_optimized``.

    Returns:
        ``(tok_ids, vals)``: the ``.indices`` entries of the selected stored
        values of a random ``csc_codes`` column, and those values as a list.
    """
    col = np.random.randint(0, csc_codes.shape[1])
    stored_rows = csc_codes[:, col].indices
    # Rank only the stored values, then translate positions back via .indices.
    picks, vals = topk_optimized(csc_codes[:, col].data, k=400)
    return stored_rows[picks], vals


# Time 2000 calls of each variant; `time` prints the total elapsed seconds per variant.
time(experimental, number=2000)
time(baseline, number=2000)
time(experimental_two, number=2000)

0.2669292710015725
1.2488546700005827
0.28119098099887196


In [80]:
import numpy as np
from timeit import timeit
from scipy.sparse import load_npz

# Load both saved sparse matrices (presumably the same data in CSC and CSR layout,
# going by the file names -- confirm against whatever wrote them).
csc_codes = load_npz("csc_codes.npz")
csr_codes = load_npz("csr_codes.npz")

def time(fn, number=100):
    """Call ``fn`` ``number`` times via ``timeit`` and print the total elapsed seconds."""
    total_seconds = timeit(fn, number=number)
    print(total_seconds)

def get_topk(arr, k=10):
    """Find the ``k`` largest entries of a 1-D array.

    Args:
        arr: 1-D ``np.ndarray`` or any sequence ``np.array`` can convert.
        k: Positive ``int`` count of entries to return; capped implicitly at
            the length of ``arr``.

    Returns:
        A pair of Python lists ``(indices, values)``, largest value first.

    Raises:
        AssertionError: If ``k`` is not a positive ``int`` or ``arr`` is not 1-D.
    """
    assert isinstance(k, int) and k > 0
    data = arr if isinstance(arr, np.ndarray) else np.array(arr)
    assert data.ndim == 1

    # Tail of the ascending order == the k largest; reverse it for descending.
    largest_first = data.argsort()[-k:][::-1]
    return largest_first.tolist(), data[largest_first].tolist()



# def baseline():
#     idx = np.random.randint(0, csc_codes.shape[1])
#     get_topk(arr=csc_codes[idx].toarray().flatten(), k=400)
# # time(lambda: csc_codes[:,1].todense().flatten(), number=1000)
# # time(lambda: topk(x, k=100), number=1000)
# # atom_result = atom_query(atom_idx, csc_codes, k=k, lowest_ratio=lowest_ratio, string=True)
# # toks, weights = topk(csc_codes[:, atom_idx].toarray().flatten(), k=k)



def experimental():
    """Benchmark body: top 400 stored entries of a random ``csr_codes`` row.

    Ranks only the explicitly stored values of the row (no densifying) and
    maps the winning positions back through the row's ``.indices`` array.

    Returns:
        ``(tok_ids, vals)``: the ``.indices`` entries of the selected values
        (presumably token ids, going by the name -- confirm) and the values
        themselves as a list, largest first.
    """
    row = np.random.randint(0, csr_codes.shape[0])
    stored_cols = csr_codes[row].indices
    # Positions returned here index into the stored-values array, not the matrix.
    picks, vals = get_topk(csr_codes[row].data, k=400)
    return stored_cols[picks], vals

# def experimental_two():
#     idx = np.random.randint(0, csr_codes.shape[0])
#     indices = csc_codes[idx].indices
#     _idx, vals = get_topk(csc_codes[:,idx].data, k=400)
#     tok_ids = indices[_idx]
#     return tok_ids, vals

def baseline():
    """Benchmark body: densify one random row of ``csr_codes`` and take its top 400.

    The ``get_topk`` result is discarded; this function exists only to be timed.
    """
    row = np.random.randint(0, csr_codes.shape[0])
    # Materialise the whole row as a flat dense vector before ranking it.
    dense_row = csr_codes[row].toarray().flatten()
    get_topk(arr=dense_row, k=400)

    


# def experimental_two():
#     idx = np.random.randint(0, csc_codes.shape[1])
#     indices = csc_codes[:,idx].indices
#     _idx, vals = topk_optimized(csc_codes[:,idx].data, k=400)
#     tok_ids = indices[_idx]
#     return tok_ids, vals


# Time 20000 calls of the sparse-row variant and the dense baseline; `time` prints total seconds.
time(experimental, number=20000)
time(baseline, number=20000)
# time(experimental_two, number=2000)

2.331315841000105
1.7935730469998816


In [94]:
import json

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained('EleutherAI/pythia-70m-v0')

# Decode every id on its own so that tokens[i] is the text of token id i.
# The id range comes from the tokenizer itself instead of a hard-coded 50277,
# so the dump stays complete if a different tokenizer is loaded above.
tokens = [tokenizer.decode([tok_id]) for tok_id in range(tokenizer.get_vocab_size())]

# Persist the id -> text table; later cells read it back from tokens.json.
with open("tokens.json", "w") as f:
    json.dump(tokens, f)

In [86]:
# Number of entries in the tokenizer's vocabulary.
tokenizer.get_vocab_size()

50277

In [93]:
# Decode id 50276, the last id covered by range(50277) when the token table was built.
tokenizer.decode([50276])



'  '

In [58]:
# baseline
import torch
# Densify row 123 of csr_codes and take its 100 largest entries with torch's topk,
# as a reference for what the sparse top-k should return for that row.
torch.tensor(csr_codes[123].toarray().flatten()).topk(k=100)

torch.return_types.topk(
values=tensor([109,  45,   5,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0], dtype=torch.uint8),
indices=tensor([ 176,  244,    2,   40, 1111, 1113, 1110, 1109, 1108, 1107, 1106, 1105,
        1104, 1103, 1102, 1101, 1100, 1099, 1098, 1097, 1123, 1151, 1152, 1153,
        1154, 1155, 1156, 1149, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144,
        1145, 1146, 1147, 1205, 1112, 1136, 1122, 1121, 1120, 1119, 1118, 1117,
        1116, 1115, 1114, 1159, 1192, 1181, 1182, 1183,

In [55]:
# Row of csr_codes inspected by the print calls at the end of this cell.
idx = 123

def experimental():
    """Return the top 400 stored entries of a random ``csr_codes`` row.

    Only the explicitly stored values of the row are ranked (the row is never
    densified); the winning positions are then mapped back through the row's
    ``.indices`` array.

    Returns:
        ``(tok_ids, vals)``: ``tok_ids`` is a NumPy array holding the
        ``.indices`` entries of the selected values (presumably token ids,
        going by the name -- confirm); ``vals`` is the list of those values,
        largest first.
    """
    idx = np.random.randint(0, csr_codes.shape[0])
    # Take indices and data from the SAME row slice. Reading the data from
    # csr_codes[:, idx] (column idx) while the indices come from row idx
    # pairs unrelated arrays and can index out of bounds.
    row = csr_codes[idx]
    _idx, vals = get_topk(row.data, k=400)
    tok_ids = row.indices[_idx]
    return tok_ids, vals

# Show the raw sparse storage of row `idx`: the stored positions, then the values at those positions.
print(csr_codes[idx].indices)
print(csr_codes[idx].data)



[  2  40 176 244]
[  5   5 109  45]


In [427]:
import thefuzz
from thefuzz import process, fuzz

import json

# Read the token strings back from tokens.json; they are wrapped in a NumPy array
# so later code can select several tokens at once with an index array.
with open("tokens.json", "r") as f:
    tokens = np.array(json.load(f))

def update_scores(query, candidates, scores):
    """Re-score candidates with fuzzy similarity to ``query`` and filter weak ones.

    Each candidate's new score is ``score + 0.6 * fuzz.ratio(query, candidate)``.
    A candidate is kept only if its new score is at least 100, its length is
    within 5 characters of the query's, and it is longer than 3 characters.

    Args:
        query: The string being matched.
        candidates: Candidate strings.
        scores: Prior scores, parallel to ``candidates``.

    Returns:
        ``(new_candidates, new_scores)``: two parallel lists, sorted by new
        score descending (ties broken by candidate string, descending).
        ``new_scores[i]`` is the score of ``new_candidates[i]``; scores of
        filtered-out candidates are not included, so the two lists can be
        zipped safely.
    """
    rescored = sorted(
        ((score + 0.6 * fuzz.ratio(query, c), c) for c, score in zip(candidates, scores)),
        reverse=True,
    )
    # Filter the (score, candidate) pairs together so both outputs stay aligned.
    kept = [
        (new_score, c)
        for new_score, c in rescored
        if new_score >= 100 and abs(len(query) - len(c)) < 6 and len(c) > 3
    ]
    new_candidates = [c for _, c in kept]
    new_scores = [new_score for new_score, _ in kept]
    return new_candidates, new_scores

def process_candidates(query, candidates_and_scores):
    """Filter and re-rank ``(candidate, score)`` pairs by fuzzy similarity to ``query``.

    A pair is kept when ``score + 0.6 * fuzz.ratio(query, candidate) >= 60``,
    the candidate's length is within 5 characters of the query's, and the
    candidate is longer than 3 characters. Kept pairs are returned with the
    score ``score + 0.4 * fuzz.ratio(query, candidate)``.

    NOTE(review): the filter weights similarity by 0.6 while the returned
    score weights it by 0.4 -- confirm that the difference is intentional.

    Args:
        query: The string being matched.
        candidates_and_scores: Iterable of ``(candidate, score)`` pairs.

    Returns:
        ``(new_candidates, new_scores)``: parallel lists sorted by the new
        score, highest first. Both are empty when nothing passes the filter.
    """
    rescored = []
    for c, score in candidates_and_scores:
        # Cheap length checks first so the fuzzy ratio is only computed when needed.
        if abs(len(query) - len(c)) >= 6 or len(c) <= 3:
            continue
        similarity = fuzz.ratio(query, c)  # computed once, used for filter and score
        if score + 0.6 * similarity >= 60:
            rescored.append((c, score + 0.4 * similarity))

    if not rescored:
        # zip(*[]) yields nothing to unpack, so return the empty result directly.
        return [], []

    rescored.sort(key=lambda pair: pair[1], reverse=True)
    new_candidates, new_scores = zip(*rescored)
    return list(new_candidates), list(new_scores)


# Whitespace-stripped, lower-cased copy of every token, parallel to `tokens`;
# used for prefix matching that ignores spacing and capitalization.
flattened_tokens = [tok.strip().lower() for tok in tokens]

import torch



def get_matches(query, k=10):
    """Return up to ``k`` vocabulary tokens that prefix-match ``query``, best first.

    Matching happens in two stages:

    1. Every entry of ``flattened_tokens`` (stripped, lower-cased tokens) gets
       a prefix score against the stripped, lower-cased query: tokens that
       extend the query or are a prefix of it score positively; empty and
       very short tokens are penalised. The ``k`` best-scoring tokens are kept.
    2. Tokens whose prefix score is not positive are dropped, and the rest are
       re-ordered by ``prefix score + fuzz.ratio(query, token) / 100`` so that
       capitalization and spacing act as a tie-breaker.

    Args:
        query: Raw query string (leading/trailing whitespace and case are
            ignored for stage 1 but used for stage 2).
        k: Maximum number of tokens to consider; values larger than the
            vocabulary are clamped.

    Returns:
        List of original (unflattened) token strings from ``tokens``, best
        match first. May be shorter than ``k`` or empty.
    """
    flattened_query = query.strip().lower()
    flattened_query_len = len(flattened_query)

    # Get topk using something like prefix matching on strings without spaces or capitalization
    prefix_scores = np.array([
        int(tok.startswith(flattened_query)) * (flattened_query_len - 0.2 * len(tok))
        + int(flattened_query.startswith(tok)) * (len(tok) - 0.2 * flattened_query_len)
        - 20 * (tok == '')
        - (len(tok) < 3)
        for tok in flattened_tokens
    ])

    # argpartition raises when k exceeds the array length, so clamp it first.
    k = min(k, len(prefix_scores))
    if k <= 0:
        return []
    topk_indices = prefix_scores.argpartition(-k)[-k:]
    candidates = tokens[topk_indices].tolist()
    scores = prefix_scores[topk_indices].tolist()

    # Drop candidates with a non-positive score. Filter the (candidate, score)
    # pairs together: filtering only the candidates would leave the scores
    # list longer and pair each survivor with the wrong score below.
    kept = [(c, score) for c, score in zip(candidates, scores) if score > 0]

    # Reorder candidates to take into account capitalization/spacing
    ranked = sorted(
        ((score + fuzz.ratio(query, c) / 100, c) for c, score in kept),
        reverse=True,
    )
    return [c for _, c in ranked]




In [500]:
from timeit import timeit


from thefuzz import fuzz
import numpy as np
import json


def get_suggestions(query, k=5):
    """Suggest up to ``k`` vocabulary tokens that prefix-match ``query``, best first.

    Each entry of ``flattened_tokens`` is scored against the stripped,
    lower-cased query: the query length if the token extends the query, plus
    the token length if the token is a prefix of the query, minus 1000 for
    each of "token is empty" and "token is shorter than 3 characters". The
    ``k`` highest-scoring tokens are kept, those without a positive score are
    dropped, and the rest are ordered by
    ``prefix score + fuzz.ratio(query, token) / 100``.

    The intermediate scores and candidates are printed for inspection.

    Args:
        query: Raw query string.
        k: Number of top-scoring tokens to consider.

    Returns:
        List of original token strings from ``tokens``, best match first.
    """
    needle = query.strip().lower()
    needle_len = len(needle)

    # Prefix score per flattened token (spacing/capitalization ignored).
    raw_scores = []
    for tok in flattened_tokens:
        extends_query = int(tok.startswith(needle)) * needle_len
        prefix_of_query = int(needle.startswith(tok)) * len(tok)
        penalty = 1000 * ((tok == '') + (len(tok) < 3))
        raw_scores.append(extends_query + prefix_of_query - penalty)
    prefix_scores = np.array(raw_scores)

    top = prefix_scores.argpartition(-k)[-k:]
    picked = zip(tokens[top].tolist(), prefix_scores[top].tolist())

    # Keep only positively scored tokens, filtering both lists in lockstep.
    kept = [(c, score) for c, score in picked if score > 0]
    candidates = [c for c, _ in kept]
    scores = [score for _, score in kept]

    # Fuzzy ratio against the raw query breaks ties on capitalization/spacing.
    ratios = [(fuzz.ratio(query, c) / 100) for c in candidates]
    print(scores)
    print(ratios)
    fuzz_scores = [score + ratio for score, ratio in zip(scores, ratios)]
    print(fuzz_scores)
    print(candidates)

    ranked = sorted(zip(fuzz_scores, candidates), reverse=True)
    return [c for _, c in ranked]

# Example query; the leading space is part of the string passed to fuzz.ratio.
get_suggestions(' brilliant')

[3, 6, 3, 18]
[0.57, 0.82, 0.43, 1.0]
[3.57, 6.82, 3.43, 19.0]
[' bri', ' brilli', ' Bri', ' brilliant']


[' brilliant', ' brilli', ' bri', ' Bri']