In [None]:
!pip install git+https://github.com/oskar-j/thresher.git

## Import package

In [1]:
import numpy as np
import pandas as pd
import thresher
import time

# Thresher built with no arguments (library defaults); the next cell uses it
# only to list the supported algorithms.
t = thresher.Thresher()

In [2]:
# Header line followed by whatever the instance reports as supported algorithms.
print('Currently supported algorithms:', t.get_supported_algorithms(), sep='\n')

Currently supported algorithms:
['auto', 'ls', 'sgd', 'gen', 'grid', 'sgrid']


## Read test data

In [None]:
# to load the data, unpack the milion_samples.7z file first

In [3]:
# Load the benchmark samples from the working directory; the cells below read
# the `score` and `actual_label` columns of this frame.
data = pd.read_csv('milion_samples.csv')

In [4]:
# Sanity check: report how many rows were loaded (first axis of the frame).
f'Read {data.shape[0]} rows of data'

'Read 1000000 rows of data'

In [5]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,actual_label,score
0,1,0.974296
1,1,0.983075
2,0,0.759452
3,0,0.089109
4,1,0.984881


## Evaluate algorithms

### Algorithm: LS

In [6]:
# Thresher using the 'ls' algorithm, labels (0, 1), progress bar on.
# n_jobs=10 is forwarded via algorithm_params (presumably the number of
# parallel workers; confirm against the thresher documentation).
t_ls = thresher.Thresher(
    algorithm='ls',
    progress_bar=True,
    labels=(0, 1),
    algorithm_params={'n_jobs': 10},
)

In [None]:
# The LS algorithm is too slow for 10^6 rows of data, so the run below is disabled.
# There is also some room to tune its parallelization.

In [None]:
# s_time = time.process_time()
# result = t_ls.optimize_threshold(data.score.values, data.actual_label.values)
# elapsed_time = time.process_time() - s_time

### Algorithm: SGD

In [13]:
# Thresher using the 'sgd' algorithm, labels (0, 1), progress bar on.
t_sgd = thresher.Thresher(
    algorithm='sgd',
    progress_bar=True,
    labels=(0, 1),
)

In [14]:
# Start with an empty list; the loop below appends one
# (elapsed_time, result) tuple per SGD run.
results = []

In [15]:
# Ten timed SGD runs on the full score/label arrays. Timing uses
# time.process_time(), i.e. CPU time of this process, not wall-clock time.
for _ in range(10):
    cpu_start = time.process_time()
    result = t_sgd.optimize_threshold(
        data['score'].values,
        data['actual_label'].values,
    )
    cpu_end = time.process_time()
    elapsed_time = cpu_end - cpu_start
    print(f'Process took {elapsed_time} seconds and gave result of {result}')
    results.append((elapsed_time, result))

Process took 0.7443240000000002 seconds and gave result of 0.5162577102954293
Process took 0.7188130000000008 seconds and gave result of 0.5162577102954293
Process took 0.7002949999999997 seconds and gave result of 0.5162577102954293
Process took 0.9045120000000004 seconds and gave result of 0.5172416417344768
Process took 0.6866709999999996 seconds and gave result of 0.5162577102954293
Process took 0.8574999999999999 seconds and gave result of 0.5168159060263999
Process took 0.8237519999999998 seconds and gave result of 0.5162577102954293
Process took 0.836879999999999 seconds and gave result of 0.517066410903332
Process took 1.2367670000000004 seconds and gave result of 0.517527811922288
Process took 0.779520999999999 seconds and gave result of 0.5162577102954293


In [18]:
# Average both fields of the recorded (elapsed_time, result) pairs.
f'Mean cpu time: {np.mean([elapsed for elapsed, _ in results])} mean result: {np.mean([outcome for _, outcome in results])}'

'Mean cpu time: 0.8289034999999998 mean result: 0.5166198032359073'

### Algorithm: GEN

In [19]:
# Thresher using the 'gen' algorithm, labels (0, 1), progress bar on.
t_gen = thresher.Thresher(
    algorithm='gen',
    progress_bar=True,
    labels=(0, 1),
)

In [20]:
# Rebind to a fresh empty list so earlier measurements are not mixed in;
# the loop below appends one (elapsed_time, result) tuple per GEN run.
results = []

In [21]:
# Ten timed GEN runs on the full score/label arrays. Timing uses
# time.process_time(), i.e. CPU time of this process, not wall-clock time.
for _ in range(10):
    cpu_start = time.process_time()
    result = t_gen.optimize_threshold(
        data['score'].values,
        data['actual_label'].values,
    )
    cpu_end = time.process_time()
    elapsed_time = cpu_end - cpu_start
    print(f'Process took {elapsed_time} seconds and gave result of {result}')
    results.append((elapsed_time, result))

 |####################################################################################################| 100.0% 
Process took 165.537085 seconds and gave result of 0.3342839526941856
 |####################################################################################################| 100.0% 
Process took 167.15820599999998 seconds and gave result of 0.4591834735375162
 |####################################################################################################| 100.0% 
Process took 172.54984000000002 seconds and gave result of 0.2398139297250794
 |####################################################################################################| 100.0% 
Process took 176.73918600000002 seconds and gave result of 0.4484500148264838
 |####################################################################################################| 100.0% 
Process took 161.09605299999998 seconds and gave result of 0.40848835242990983
 |#######################################################

In [22]:
# Average both fields of the recorded (elapsed_time, result) pairs.
f'Mean cpu time: {np.mean([elapsed for elapsed, _ in results])} mean result: {np.mean([outcome for _, outcome in results])}'

'Mean cpu time: 165.4681959 mean result: 0.41493580695180327'

### Algorithm: Grid search

In [23]:
# Thresher using the 'grid' algorithm, labels (0, 1), progress bar on.
t_grid = thresher.Thresher(
    algorithm='grid',
    progress_bar=True,
    labels=(0, 1),
)

In [24]:
# Rebind to a fresh empty list so earlier measurements are not mixed in;
# the loop below appends one (elapsed_time, result) tuple per grid-search run.
results = []

In [25]:
# Ten timed grid-search runs on the full score/label arrays. Timing uses
# time.process_time(), i.e. CPU time of this process, not wall-clock time.
for _ in range(10):
    cpu_start = time.process_time()
    result = t_grid.optimize_threshold(
        data['score'].values,
        data['actual_label'].values,
    )
    cpu_end = time.process_time()
    elapsed_time = cpu_end - cpu_start
    print(f'Process took {elapsed_time} seconds and gave result of {result}')
    results.append((elapsed_time, result))

 |####################################################################################################| 100.0% 
 |####################################################################################################| 100.0% 
Process took 11.979750999999851 seconds and gave result of 0.48
 |####################################################################################################| 100.0% 
 |####################################################################################################| 100.0% 
Process took 12.093507000000045 seconds and gave result of 0.48
 |####################################################################################################| 100.0% 
 |####################################################################################################| 100.0% 
Process took 12.435658999999987 seconds and gave result of 0.48
 |####################################################################################################| 100.0% 
 |######################

In [26]:
# Average both fields of the recorded (elapsed_time, result) pairs.
f'Mean cpu time: {np.mean([elapsed for elapsed, _ in results])} mean result: {np.mean([outcome for _, outcome in results])}'

'Mean cpu time: 13.144794099999967 mean result: 0.4800000000000001'

### Algorithm: Stochastic Grid search

In [27]:
# Thresher using the 'sgrid' algorithm with no algorithm_params override,
# labels (0, 1), progress bar on.
t_sgrid = thresher.Thresher(
    algorithm='sgrid',
    progress_bar=True,
    labels=(0, 1),
)

In [28]:
# Rebind to a fresh empty list so earlier measurements are not mixed in;
# the loop below appends one (elapsed_time, result) tuple per sgrid run.
results = []

In [29]:
# Ten timed stochastic-grid runs on the full score/label arrays. Timing uses
# time.process_time(), i.e. CPU time of this process, not wall-clock time.
for _ in range(10):
    cpu_start = time.process_time()
    result = t_sgrid.optimize_threshold(
        data['score'].values,
        data['actual_label'].values,
    )
    cpu_end = time.process_time()
    elapsed_time = cpu_end - cpu_start
    print(f'Process took {elapsed_time} seconds and gave result of {result}')
    results.append((elapsed_time, result))

 |####################################################################################################| 100.0% 
 |####################################################################################################| 100.0% 
Process took 2.308320999999978 seconds and gave result of 0.51
 |####################################################################################################| 100.0% 
 |####################################################################################################| 100.0% 
Process took 2.2762410000000273 seconds and gave result of 0.5
 |####################################################################################################| 100.0% 
 |####################################################################################################| 100.0% 
Process took 2.3001150000000052 seconds and gave result of 0.51
 |####################################################################################################| 100.0% 
 |########################

In [30]:
# Average both fields of the recorded (elapsed_time, result) pairs.
f'Mean cpu time: {np.mean([elapsed for elapsed, _ in results])} mean result: {np.mean([outcome for _, outcome in results])}'

'Mean cpu time: 2.3367062999999915 mean result: 0.483'

### Algorithm: Stochastic Grid search

#### different params

In [44]:
# Rebinds t_sgrid to a second 'sgrid' configuration: progress bar off and
# explicit algorithm_params (2 decimal places, stoch_ratio 0.04, no reshuffle).
t_sgrid = thresher.Thresher(
    algorithm='sgrid',
    progress_bar=False,
    labels=(0, 1),
    algorithm_params={
        'no_of_decimal_places': 2,
        'stoch_ratio': 0.04,
        'reshuffle': False,
    },
)

In [45]:
# Rebind to a fresh empty list so earlier measurements are not mixed in;
# the loop below appends one (elapsed_time, result) tuple per run of the
# re-parameterised sgrid optimizer.
results = []

In [46]:
# Ten timed runs of the re-parameterised stochastic grid search on the full
# score/label arrays. Timing uses time.process_time(), i.e. CPU time of this
# process, not wall-clock time.
for _ in range(10):
    cpu_start = time.process_time()
    result = t_sgrid.optimize_threshold(
        data['score'].values,
        data['actual_label'].values,
    )
    cpu_end = time.process_time()
    elapsed_time = cpu_end - cpu_start
    print(f'Process took {elapsed_time} seconds and gave result of {result}')
    results.append((elapsed_time, result))

Process took 1.7316349999998693 seconds and gave result of 0.51
Process took 1.6873630000000048 seconds and gave result of 0.51
Process took 1.74340200000006 seconds and gave result of 0.47000000000000003
Process took 1.785188000000062 seconds and gave result of 0.49
Process took 1.7099889999999505 seconds and gave result of 0.47000000000000003
Process took 1.6689440000000104 seconds and gave result of 0.48
Process took 1.6211889999999585 seconds and gave result of 0.48
Process took 1.6211840000000848 seconds and gave result of 0.53
Process took 1.7044220000000223 seconds and gave result of 0.51
Process took 1.7553729999999632 seconds and gave result of 0.46


In [47]:
# Average both fields of the recorded (elapsed_time, result) pairs.
f'Mean cpu time: {np.mean([elapsed for elapsed, _ in results])} mean result: {np.mean([outcome for _, outcome in results])}'

'Mean cpu time: 1.7028688999999986 mean result: 0.491'