In [1]:
!pip install datasets >> /dev/null

In [3]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. Presumably required so that the dataset
# streamed in a later cell can be accessed — TODO confirm.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [35]:
from itertools import islice

from datasets import load_dataset

# Number of source files taken from the head of the stream. The previous loop
# only stopped after index 1001 had already been appended, i.e. it read 1002
# files; the same amount is kept so the corpus size stays comparable.
NUM_SAMPLES = 1002

# Stream one language subset of The Stack (here: Python) instead of
# downloading the whole dataset.
ds = load_dataset(
    "bigcode/the-stack",
    data_dir="data/python",
    split="train",
    streaming=True,
)

# Concatenate the file contents in one pass; str.join avoids re-copying an
# ever-growing string on every iteration.
text = "".join(sample["content"] for sample in islice(ds, NUM_SAMPLES))



In [36]:
# Number of characters in the concatenated corpus.
len(text)

10182785

In [14]:
import pandas as pd
# Baseline typing measurements. The df.head() preview further down lists the
# columns Key, Average Time (s), Distance and Modifier.
df = pd.read_csv("https://raw.githubusercontent.com/muellerzr/keyboard-mastery/master/baseline/with_modifier.csv")

The modifier is based on the key *position*, not the key itself. 

As a result, we can number every key position on the keyboard using the following scheme:
![Numbered keyboard layout](https://github.com/muellerzr/keyboard-mastery/blob/master/keyboard-layout.jpg?raw=1)

We then just need to map each key to its position:

In [167]:
# Keys listed in position order; the index of a key inside this string is the
# position number it is mapped to.
_position_order = "qwertyuiop[]asdfghjkl;'zxcvbnm,."
key2pos = dict(zip(_position_order, range(len(_position_order))))

In [168]:
# Translate every key character into its numeric position. Series.map performs
# a plain dictionary lookup (no reliance on the dtype downcasting that
# Series.replace does), and a key missing from key2pos shows up as NaN instead
# of silently keeping its character in a column of positions.
df["Location"] = df["Key"].map(key2pos)

In [170]:
# Append three hand-entered rows for the keys "[", "]" and "'".
# Each row is written positionally: a leading 1, the key, three numeric values
# (presumably average time, distance and modifier — confirm against the
# column order of df) and finally the key's position.
_extra_rows = (
    ("[", .635, 1, 1.45),
    ("]", .72, 2, 2.165),
    ("'", .62, 1, 1.25),
)
for _key, _value_a, _value_b, _value_c in _extra_rows:
    df.loc[len(df.index)] = [1, _key, _value_a, _value_b, _value_c, key2pos[_key]]

In [172]:
# Preview the first rows, including the Location column.
df.head()

Unnamed: 0.1,Unnamed: 0,Key,Average Time (s),Distance,Modifier,Location
0,0,",",0.721701,1,1.443402,30
1,1,.,0.635377,1,1.270754,31
2,2,;,0.583544,0,0.583544,21
3,3,a,0.623853,0,0.623853,12
4,4,b,0.64447,2,1.933409,27


In [173]:
# Lookup table: key position -> modifier value. If a position occurs more than
# once, the last row wins.
key2mod = {
    location: modifier
    for location, modifier in zip(df["Location"], df["Modifier"])
}

In [174]:
# Distinct keys in order of first appearance in the table.
keys = df["Key"].drop_duplicates().tolist()

In [175]:
import numpy as np
import random

In [176]:
# Genetic-algorithm bookkeeping values. None of the cells shown in this
# notebook reads them (the run further down passes a population size of 10
# directly) — possibly leftovers; confirm before removing.
num_weights = len(keys)
solutions_per_population = 8
num_solutions_mating = 4
def init_population(population_size):
    """Create `population_size` random layouts.

    Every layout is an independently shuffled copy of the module-level `keys`
    list; `keys` itself is left untouched.
    """
    def _shuffled_keys():
        layout = list(keys)
        random.shuffle(layout)
        return layout

    return [_shuffled_keys() for _ in range(population_size)]

In [177]:
def mate(board1, board2):
    """Breed a child layout from two parent layouts.

    A contiguous run of positions (random start, random length, wrapping
    around the end of the board) is copied from `board1`. The remaining
    positions are then filled, in order, with the keys of `board2` that the
    child does not contain yet. Finally, with a 10% chance, two random
    positions of the child are swapped.

    The board size is taken from the parents, and the run wraps at the real
    end of the board, so every position can be inherited from `board1`.

    Args:
        board1: Parent layout (sequence of keys) donating the contiguous run.
        board2: Parent layout with the same keys as `board1`, in any order.

    Returns:
        A new list with the child layout; the parents are not modified.

    Raises:
        ValueError: If the parents differ in length, or `board2` cannot
            supply the keys that are still missing from the child.
    """
    size = len(board1)
    if len(board2) != size:
        raise ValueError("both parent layouts must have the same number of keys")
    if size == 0:
        return []

    idx = random.randint(0, size - 1)
    length = random.randint(0, size - 1)

    # A unique placeholder cannot collide with any real key.
    empty = object()
    child = [empty] * size

    # Add keys from keyboard 1, wrapping around at the end of the board.
    for _ in range(length):
        child[idx] = board1[idx]
        idx = (idx + 1) % size

    # Add remaining keys from keyboard 2. `idx` now points at the first free
    # slot after the copied run; since length < size the free slots form one
    # contiguous (wrapping) stretch starting there.
    child_idx = idx
    missing = size - length
    for offset in range(size):
        if missing == 0:
            break
        char = board2[(idx + offset) % size]
        if char in child:
            continue
        child[child_idx] = char
        child_idx = (child_idx + 1) % size
        missing -= 1
    if missing:
        # Previously this situation looped forever.
        raise ValueError("board2 does not contain the keys missing from the child")

    # 10% chance of random mutation: swap two positions.
    if random.random() >= 0.9:
        p1 = random.randint(0, size - 1)
        p2 = random.randint(0, size - 1)
        child[p1], child[p2] = child[p2], child[p1]
    return child

In [178]:
def new_generation(population, sorted_evals, p_size):
    """Build the next generation from an evaluated population.

    Args:
        population: List of layouts of the current generation.
        sorted_evals: Indices into `population`, best layout first.
        p_size: Number of layouts the new generation should contain.

    Returns:
        A list of `p_size` layouts: first the best 10% of the current
        generation (the very same list objects, not copies), followed by
        children bred with `mate` from two different layouts of the top 50%.

    Raises:
        ValueError: If children are needed but `sorted_evals` is empty.
    """
    ranked = [population[idx] for idx in sorted_evals]

    # Best 10% of layouts in this generation survive unchanged.
    new_gen = ranked[:int(p_size * 0.1)]

    # Parent pool: top 50%, but at least two layouts so that two distinct
    # parents can be drawn. Computed once, it does not change in the loop.
    parents = ranked[:max(2, int(p_size * 0.5))]

    # Fill up to exactly p_size so the population cannot shrink when the
    # 10% / 90% split does not yield whole numbers.
    while len(new_gen) < p_size:
        if not parents:
            raise ValueError("cannot breed a new generation from an empty population")
        if len(parents) >= 2:
            # sample() draws without replacement: never the same parent twice.
            p1, p2 = random.sample(parents, 2)
        else:
            p1 = p2 = parents[0]
        new_gen.append(mate(p1, p2))
    return new_gen

In [179]:
def calculate_time(s, layout, key2mod):
    """Score a layout on a text: sum of the position modifiers of all characters.

    Each distinct character of `s` is counted once and its modifier is
    weighted by that count, so the cost no longer grows with
    len(s) * len(layout) as it did when `layout.index` was searched for every
    single character. For float modifiers the total can differ from a
    character-by-character sum only by floating-point rounding.

    Args:
        s: Text to score; every character must occur in `layout`.
        layout: Sequence of keys; a key's index is its position.
        key2mod: Mapping from position to modifier value.

    Returns:
        The accumulated modifier total (0 for empty `s`).

    Raises:
        ValueError: If `s` contains a character that is not in `layout`.
        KeyError: If a position that is actually used is missing in `key2mod`.
    """
    from collections import Counter

    # First occurrence wins, matching list.index semantics.
    position = {}
    for pos, key in enumerate(layout):
        position.setdefault(key, pos)

    total_distance = 0
    for char, count in Counter(s).items():
        if char not in position:
            raise ValueError(f"{char!r} is not in layout")
        total_distance += count * key2mod[position[char]]
    return total_distance

In [180]:
# Rebuilds the position -> modifier lookup with the same expression as the
# earlier key2mod cell.
key2mod = dict(zip(df["Location"], df["Modifier"]))

In [193]:
def clean_text(text, keys):
    """Reduce `text` to the characters that exist as keys on the layout.

    Six shifted symbols are first folded onto the character of the same
    physical key ("}" -> "]", "{" -> "[", ":" -> ";", '"' -> "'", ">" -> ".",
    "<" -> ","); afterwards every character that is not in `keys` is dropped.

    The folding is done in a single `str.translate` pass and membership is
    tested against a set, instead of six full-string replacements and a list
    scan per character.

    NOTE(review): letters are not lower-cased, so upper-case letters are
    dropped unless `keys` contains them — confirm this is intended.

    Args:
        text: Raw input string.
        keys: Iterable of the single-character keys to keep.

    Returns:
        The filtered string.
    """
    shifted_to_base = str.maketrans('}{:"><', "][;'.,")
    allowed = set(keys)
    return "".join([ch for ch in text.translate(shifted_to_base) if ch in allowed])

In [194]:
# Corpus restricted to the characters that are keys of the layout.
cleaned = clean_text(text, keys)

In [195]:
# Character counts after and before cleaning.
len(cleaned), len(text)

(6100872, 10182785)

In [196]:
# Genetic-algorithm driver: evaluate every layout, rank them, breed the next
# generation, repeat.
population_size = 10
num_generations = 20

populations = init_population(population_size)
for generation in range(num_generations):
    # Cost of every candidate layout on the cleaned corpus (lower is better).
    distances = [
        calculate_time(cleaned, layout, key2mod)
        for layout in populations
    ]

    # Indices of ALL layouts, cheapest first. The ranking previously iterated
    # over range(len(distances) - 1) and therefore never considered the last
    # layout of a generation.
    sorted_evals = sorted(range(len(distances)), key=distances.__getitem__)
    print(f"Generation {generation} best:\n\t{min(distances)}")

    if generation < num_generations - 1:
        populations = new_generation(populations, sorted_evals, population_size)
    else:
        # Final generation: keep the evaluated layouts (do not replace them by
        # unevaluated children) and order them best-first, so that
        # populations[0] is the best layout found.
        populations = [populations[idx] for idx in sorted_evals]

Generation 0 best:
	6593216.366996985
Generation 1 best:
	6593216.366996985
Generation 2 best:
	6593216.366996985
Generation 3 best:
	6593216.366996985
Generation 4 best:
	6593216.366996985
Generation 5 best:
	6508190.362488356
Generation 6 best:
	6481162.118895515
Generation 7 best:
	6438968.025074063
Generation 8 best:
	6412945.805753794
Generation 9 best:
	6168074.985319027
Generation 10 best:
	6168074.985319027
Generation 11 best:
	6168074.985319027
Generation 12 best:
	6168074.985319027
Generation 13 best:
	6168074.985319027
Generation 14 best:
	6168074.985319027
Generation 15 best:
	6168074.985319027
Generation 16 best:
	6168074.985319027
Generation 17 best:
	6168074.985319027
Generation 18 best:
	6168074.985319027
Generation 19 best:
	6154863.16022332


In [None]:
# Keys per keyboard row (top, middle, bottom): 12, 11, 9

In [202]:
# Print the first layout of the population as three keyboard rows
# (12 keys, 11 keys, then the rest).
layout = populations[0]
for _row_start, _row_stop in ((None, 12), (12, 12 + 11), (12 + 11, None)):
    print(layout[_row_start:_row_stop])

['j', 'i', 'x', 'o', 'a', 'n', 'q', 'y', 'u', ',', 'c', 'k']
['r', 'p', 'm', 'e', 'v', '.', 't', 'd', ';', 'l', 'w']
['h', 'z', 's', "'", '[', 'b', 'f', ']', 'g']
