# Making data

In [21]:
import numpy as np
import pandas as pd
from string import ascii_letters
from functools import reduce
# Fixed seed so that every random draw in this notebook is repeatable after
# Restart Kernel -> Run All. Outputs recorded from an earlier, unseeded run will
# differ from what a fresh run prints.
SEED = 42
RNG = np.random.default_rng(SEED)

In [44]:
# Sizes used to build the lookup values and the generated frames
N_TARGET = 10_000       # number of distinct values to look up
N_FRAMES = 10           # how many DataFrames to generate
N_ROWS = 10_000         # rows per DataFrame
N_FLOAT_COLS = 4        # random float columns per DataFrame
TARGET_UPPER = 100_000  # 'target' values are drawn from [0, TARGET_UPPER)

# Lookup values: the integers 0..N_TARGET-1 in random order, held as an array,
# a set and a Series.
target = RNG.permutation(np.arange(N_TARGET))
target_set = set(target)
target_series = pd.Series(target)

# N_FRAMES DataFrames of N_ROWS rows each: N_FLOAT_COLS random float columns
# (col0, col1, ...) plus an integer 'target' column drawn from [0, TARGET_UPPER).
dfs = []
dicts = []  # the plain column dict behind each frame, in the same order as `dfs`
for _ in range(N_FRAMES):
    cols = {f'col{i}': RNG.random(size=N_ROWS) for i in range(N_FLOAT_COLS)}
    cols['target'] = RNG.integers(TARGET_UPPER, size=N_ROWS)
    dfs.append(pd.DataFrame(cols))
    dicts.append(cols)

In [9]:
%%timeit

# Count 'target' hits across all frames, testing membership against the array
total = sum(frame['target'].isin(target).sum() for frame in dfs)
total

2.82 ms ± 58 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%timeit

# Count 'target' hits across all frames, testing membership against the set
total = sum(frame['target'].isin(target_set).sum() for frame in dfs)
total


13.1 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
%%timeit

# Count 'target' hits across all frames, testing membership against the Series
total = sum(frame['target'].isin(target_series).sum() for frame in dfs)
total

2.99 ms ± 239 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [67]:
# Lookup table: position i of `converter` holds a random string of ASCII letters,
# so an integer 'target' value can be swapped for a string key.
N_CODES = 100_000   # one string per possible integer target value
CODE_LENGTH = 25    # letters per generated string

letter_array = np.array(list(ascii_letters))
random_indices = RNG.choice(len(letter_array), (N_CODES, CODE_LENGTH), replace=True)
converter = np.empty(N_CODES, dtype='object')
# str.join assembles each string in a single pass instead of chaining `a + b`
for i, letters in enumerate(letter_array[random_indices]):
    converter[i] = ''.join(letters)

# String versions of the lookup values. Indexing with an integer array already
# yields a new array, so no explicit .copy() is required.
target_str = converter[target]
target_str_set = set(target_str)
target_str_sorted = np.sort(target_str)

In [68]:
# String-keyed twins of the frames in `dfs`: same columns, but each integer
# 'target' value is replaced by the string stored at that position in `converter`.
dfs_str = [
    frame.assign(target=converter[frame['target'].to_numpy()])
    for frame in dfs
]

In [39]:
%%timeit

# Count string 'target' hits across all frames against the unsorted object array
total = sum(frame['target'].isin(target_str).sum() for frame in dfs_str)

13 ms ± 2.27 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [43]:
%%timeit

# Count string 'target' hits across all frames against the sorted object array
total = sum(frame['target'].isin(target_str_sorted).sum() for frame in dfs_str)

11.1 ms ± 366 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [40]:
%%timeit

# Count string 'target' hits across all frames against the Python set
total = sum(frame['target'].isin(target_str_set).sum() for frame in dfs_str)

21 ms ± 3.42 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [81]:
%%timeit

# Row-by-row baseline: walk every frame with iterrows() and test each 'target'
# string against the set one value at a time.
total = 0
for frame in dfs_str:
    total += sum(row['target'] in target_str_set for _, row in frame.iterrows())

3.3 s ± 534 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
# Hit count for the first string frame, then the row at position 1 as it appears
# in the string frame and in the original integer frame.
first_frame_hits = dfs_str[0]['target'].isin(target_str_sorted).sum()
print(first_frame_hits, dfs_str[0].iloc[1], dfs[0].iloc[1], sep='\n')

1031
col0                       0.547856
col1                       0.675737
col2                       0.364239
col3                       0.127782
target    LHyxuILHfssXfncqgsfLqIkNK
Name: 1, dtype: object
col0        0.547856
col1        0.675737
col2        0.364239
col3        0.127782
target    979.000000
Name: 1, dtype: float64


In [79]:
# Spot check one value: is its string form in the string set, and is the integer
# itself in the integer set?
probe = 979
print(converter[probe] in target_str_set, probe in target_set, sep='\n')

True
True


In [76]:
# First five string targets, the integers they came from, and the converter
# applied to those integers again.
for preview in (target_str[:5], target[:5], converter[target[:5]]):
    print(preview)

['wdjcOWUjicsMjWOGKOHFutxYG' 'slvtcKHfIyzhNBDqOlLJuTVZc'
 'yMfhgcqbfyUzLKaKmSkLdofaI' 'pmqPblEreSfAQHMXQdiPqWaUM'
 'XxRDxLTJHFmlNcHirruVLHBsz']
[ 344 3434 9784 5573 9480]
['wdjcOWUjicsMjWOGKOHFutxYG' 'slvtcKHfIyzhNBDqOlLJuTVZc'
 'yMfhgcqbfyUzLKaKmSkLdofaI' 'pmqPblEreSfAQHMXQdiPqWaUM'
 'XxRDxLTJHFmlNcHirruVLHBsz']
