In [1]:
import csv
import pickle
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from boonamber import AmberClient

In [2]:
# Read the processed stats CSV into `usernames` (first column of each row) and
# `stats` (one row per user, built from every third column starting at index 5).
with open('../data/processed/stats.csv', 'r') as f:
    reader = csv.reader(f)
    fields = next(reader)  # first row is kept separately from the data rows
    usernames, stats_list = [], []
    for row in tqdm(reader):
        # Parse the level columns before recording the username so a malformed
        # row leaves both lists the same length.
        levels = list(map(int, row[5::3]))
        usernames.append(row[0])
        stats_list.append(levels)
stats = np.array(stats_list)
del stats_list
# -1 entries are replaced by 1 (presumably -1 marks a missing level and 1 is
# the lowest valid one; confirm against the data source).
stats[stats == -1] = 1

0it [00:00, ?it/s]

In [10]:
def get_weights(split):
    """Return the feature weighting to use for a data split.

    For ``'all'`` the result is a 23-element list: the first 7 entries are
    ``2 * (16 / 23)`` and the remaining 16 entries are ``7 / 23``. Any other
    value of ``split`` yields the scalar ``1``.
    """
    if split != 'all':
        return 1

    cb_weight = 2 * (16 / 23)
    noncb_weight = 7 / 23
    return [cb_weight] * 7 + [noncb_weight] * 16

def get_dataset(split):
    """Select the columns of the module-level ``stats`` array for a data split.

    ``'all'`` returns ``stats`` itself, ``'cb'`` returns its first 7 columns
    and ``'noncb'`` returns the columns from index 7 onward. Any other value
    returns ``None``.
    """
    if split == 'cb':
        return stats[:, :7]
    if split == 'noncb':
        return stats[:, 7:]
    if split == 'all':
        return stats
    return None

In [7]:
# Ask the Amber client for a sensor and store the returned id on disk.
# The file is opened in 'w' mode, so re-running this cell overwrites any
# previously saved id.
amber = AmberClient(license_id='luke-dev')
sensor_id = amber.create_sensor()
with open('../reference/model_id.txt', 'w') as id_file:
    id_file.write(sensor_id)

In [9]:
# Rebuild the Amber client and read back the sensor id saved in the reference
# file. The file content is used verbatim (no whitespace stripping).
amber = AmberClient(license_id='luke-dev')
with open('../reference/model_id.txt', 'r') as id_file:
    sensor_id = id_file.read()

In [None]:
def cluster_dataset(dataset, mins, maxes, max_clusters=None):
    """Placeholder for clustering a dataset; not implemented yet.

    The notebook only declared this signature (without a body), which made the
    whole cell a syntax error. It is kept as an explicit stub so the cell
    parses and any accidental call fails loudly.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("cluster_dataset is not implemented yet")

def get_num_clusters(dataset, pv, max_clusters=None):
    """Run `dataset` through the nano in batches and return the largest cluster ID.

    Args:
        dataset: Sliceable collection of samples; ``len()`` and ``[a:b]``
            slicing are the only operations used on it.
        pv: Accepted for the caller's convenience but not used in the body.
            NOTE(review): the nano is never configured here, so it presumably
            must be configured with this percent variation beforehand; confirm.
        max_clusters: Optional cap. When truthy and the largest ID exceeds it,
            ``None`` is returned instead of the ID.

    Returns:
        The maximum value found in ``response['ID']`` over all processed
        batches (0 if no batch was processed), or ``None`` when the cap is
        exceeded.

    Raises:
        ValueError: If opening the nano, loading a batch or running the nano
            reports failure; the nano's response is used as the message.
    """
    # NOTE(review): `nano` is a module-level object that is not defined in any
    # cell visible here; it has to exist before this function is called.
    success, response = nano.open_nano('0')
    if not success:
        raise ValueError(response)

    batch_size = 10000
    max_id = 0
    # Only whole batches are sent; a trailing partial batch is skipped.
    # NOTE(review): confirm whether the remainder should be processed as well.
    for i in range(len(dataset) // batch_size):
        batch_start = i * batch_size
        batch_end = batch_start + batch_size
        batch = dataset[batch_start:batch_end]

        success, response = nano.load_data(batch)
        if not success:
            raise ValueError(response)
        success, response = nano.run_nano(results='ID')
        if not success:
            raise ValueError(response)

        max_id = max(max_id, max(response['ID']))

    if max_clusters and max_id > max_clusters:
        return None

    return max_id

In [None]:
def find_optimal_pv(max_clusters=1000):
    """Placeholder for the percent-variation search; not implemented yet.

    The notebook only declared this signature (without a body), which made the
    cell a syntax error. It is kept as an explicit stub so the cell parses and
    any accidental call fails loudly.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("find_optimal_pv is not implemented yet")

In [None]:
# Draft of the percent-variation sweep: for each data split, draw a random
# subsample, open the nano, and plot cluster counts against the pv grid.
# NOTE(review): this cell looks unfinished. `nano`, `inference_dataset`, `pv`
# and `num_clusters` are never assigned in this cell, and `nano` /
# `inference_dataset` are not defined in any cell visible here, so it cannot
# run on a fresh kernel.
np.random.seed(0)  # seed the global NumPy RNG so the subsample is repeatable
for split in ['all', 'cb', 'noncb']:
    
    print("sweeping pv parameter for data split '{}'...".format(split))
    weights = get_weights(split)  # assigned but not used below
    dataset = get_dataset(split)

    num_samples = 100000
    # Row indices are drawn against `stats`; every split returned by
    # get_dataset is `stats` or a column slice of it, so the row count matches.
    subsample = np.random.choice(len(stats), size=num_samples, replace=False)
    autotune_data = dataset[subsample]  # assigned but not used below

    success, response = nano.open_nano('0')
    if not success:
        raise ValueError(response)

    pvs = np.linspace(0.05, 0.20, 16)
    # NOTE(review): the full `dataset` (not `autotune_data`) is passed here and
    # the return value is discarded; confirm what this call was meant to produce.
    inference_dataset(nano, dataset, pv, max_clusters=None)
    
    plt.figure(figsize=(12, 8))
    plt.stem(num_clusters)
    # Tick labels list the pv grid in reverse order.
    plt.xticks(ticks=np.arange(len(num_clusters)),
               labels=[f'{pv:.2f}' for pv in pvs[::-1]])
    plt.title("Data split '{}'".format(split))
    plt.xlabel("Percent variation")
    plt.ylabel("Number of clusters")
    plt.show()

In [None]:
# Percent-variation sweep: for each data split, cluster a random subsample at
# every candidate pv and plot how many clusters the nano produced.
# NOTE(review): `nano` is not defined in any cell visible here; it has to be
# created before this cell can run.
np.random.seed(0)  # keep the subsample reproducible when this cell runs on its own
pvs = np.linspace(0.05, 0.20, 16)  # candidate percent-variation values
num_samples = 100000
batch_size = 10000
cluster_limit = 1000  # stop sweeping a split once a run exceeds this many clusters

for split in ['all', 'cb', 'noncb']:

    print("sweeping percent variation for data split '{}'...".format(split))
    weights = get_weights(split)
    dataset = get_dataset(split)
    feature_count = dataset.shape[1]  # one feature per column of the split

    subsample = np.random.choice(len(dataset), size=num_samples, replace=False)
    autotune_data = dataset[subsample]

    success, response = nano.open_nano('0')
    if not success:
        raise ValueError(response)

    # num_clusters[i] holds the cluster count obtained with pvs[i]; entries
    # stay 0 for any pv skipped by the early break below.
    num_clusters = np.zeros(len(pvs), dtype='int')
    for i, pv in enumerate(tqdm(pvs)):
        nano.configure_nano(feature_count=feature_count, weight=weights,
                            min_val=1, max_val=99, percent_variation=pv)

        cluster_ids = np.zeros(len(subsample), dtype='int')
        for j in range(num_samples // batch_size):
            batch_start = j * batch_size
            batch_end = (j + 1) * batch_size
            batch = autotune_data[batch_start:batch_end]

            success, response = nano.load_data(batch)
            if not success:
                raise ValueError(response)
            success, response = nano.run_nano(results='ID')
            if not success:
                raise ValueError(response)

            cluster_ids[batch_start:batch_end] = response['ID']

        cluster_count = np.max(cluster_ids)
        num_clusters[i] = cluster_count
        print("pv = {:0.3}, num_clusters = {}".format(pv, cluster_count))

        # NOTE(review): pvs is swept in ascending order; if cluster counts fall
        # as pv grows, this break fires on the first values. Confirm whether a
        # descending sweep was intended.
        if cluster_count > cluster_limit:
            break

    plt.figure(figsize=(12, 8))
    plt.stem(num_clusters)
    # Labels follow the same order in which num_clusters was filled.
    plt.xticks(ticks=np.arange(len(num_clusters)),
               labels=[f'{pv:.2f}' for pv in pvs])
    plt.title("Data split '{}'".format(split))
    plt.xlabel("Percent variation")
    plt.ylabel("Number of clusters")
    plt.show()

In [None]:
# Per-split settings keyed by split name: percent variation ('pv'), feature
# weighting ('weight') and number of features ('feature_count').
# NOTE(review): the 'all' weights here (16*23 and 7*23) are not the values
# get_weights('all') returns; confirm which weighting is intended.
params = {
    'all': dict(pv=0.127, weight=[16*23] * 7 + [7*23] * 16, feature_count=23),
    'cb': dict(pv=0.060, weight=1, feature_count=7),
    'noncb': dict(pv=0.144, weight=1, feature_count=16),
}