In [1]:
!pip install datasets >> /dev/null

In [1]:
from  datasets  import  load_dataset

# specific language (e.g. Dockerfiles)
ds = load_dataset(
    "bigcode/the-stack", 
    data_dir="data/python", 
    split="train", 
    streaming=True
)

text = ''
for i, sample in enumerate(ds):
    text += sample["content"]
    if i > 1000: 
        break

Using custom data configuration bigcode--the-stack-e4a54f956a2755d4


In [2]:
len(text)

10182785

In [3]:
import pandas as pd
df = pd.read_csv("baseline/with_modifier.csv")

The modifier is based on the key *position*, not the key itself. 

As a result we can number a keyboard based on the following format:
![](https://github.com/muellerzr/keyboard-mastery/blob/master/keyboard-layout.jpg?raw=1)

We just then need to map the key to the position:

In [4]:
key2pos = {k:i for i,k in enumerate("qwertyuiop[]asdfghjkl;'zxcvbnm,./")}

In [5]:
df["Location"] = df["Key"].replace(key2pos)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,Key,Average Time (s),Distance,Modifier,Location
0,0,',0.708456,1,0.491064,22
1,1,",",0.75254,1,1.348371,30
2,2,.,0.824321,1,1.476985,31
3,3,/,0.799975,1,1.433363,32
4,4,;,0.854735,0,0.0,21


In [7]:
df[df["Key"] == "b"]

Unnamed: 0.1,Unnamed: 0,Key,Average Time (s),Distance,Modifier,Location
8,8,b,0.699311,2,1.536542,27


In [8]:
import numpy as np
import random

In [29]:
solutions_per_population = 8
num_solutions_mating = 4
key2mod = dict(zip(df["Location"], df["Modifier"]))
keys = list(df["Key"].unique())
num_weights = len(keys)
def init_population(population_size, letters):
    population = []
    for _ in range(population_size):
        genome = list(letters[:])
        random.shuffle(genome)
        population.append(genome)
    return population

In [30]:
def new_generation(population, sorted_evals, p_size):
    new_gen = []
    sorted_pop = []
    for idx in sorted_evals:
        sorted_pop.append(population[idx])
    
    # Best 10% of layouts in this generation
    for i in range(int(p_size*0.1)):
        new_gen.append(sorted_pop[i])
    
    # Combine two from top 50% and create a new generation
    for _ in range(int(p_size * 0.9)):
        pop = sorted_pop[:int(p_size*.5)]
        p1, p2 = random.choices(pop, k=2)
        child = mate(p1,p2)
        new_gen.append(child)
    return new_gen

In [31]:
def calculate_time(s, layout, mod):
    total_distance = 0
    for char in s:
        total_distance += mod[layout.index(char)]
    return total_distance

In [32]:
def clean_text(text, keys):
    for initial, replacement in {"}":"]", "{":"[", ":":";", '"':"'", ">":".", "<":","}.items():
        text = text.replace(initial, replacement)
    return ''.join([t for t in text if t in keys])

In [39]:
len(keys)

33

In [40]:
def mate(board1, board2):
    idx = random.randint(0, 32)
    length = random.randint(0,32)
    child = ["_" for i in range(33)]
    # Add keys from keyboard 1
    for _ in range(length):
        if idx > 28:
            idx = 0
        child[idx] = board1[idx]
        idx += 1
    
    # Add remaining keys from keyboard 2
    child_idx = idx
    while "_" in child:
        if idx > 32:
            idx = 0
        if child_idx > 32:
            child_idx = 0
        char = board2[idx]
        if char in child:
            idx += 1
            continue
        child[child_idx] = board2[idx]
        child_idx += 1
        idx += 1
    
    # 10% chance of random mutation
    if random.random() >= 0.9:
        p1 = random.randint(0,32)
        p2 = random.randint(0,32)
        allele1 = child[p1]
        allele2 = child[p2]
        child[p1] = allele2
        child[p2] = allele1
    return child

In [15]:
cleaned = clean_text(text, keys)

In [25]:
len(cleaned), len(text)

(6114239, 10182785)

In [None]:
populations = init_population(10, keys)
num_generations = 1000
for generation in range(num_generations):
    distances = []
    for population in populations:
        distances.append(
            calculate_time(
                cleaned, 
                population,
                key2mod
            )
        )
    sorted_evals = [x for _,x in sorted(zip(distances, range(len(distances) - 1)))]
    if generation == num_generations - 1 or generation % 50 == 0:
        print(f"Generation {generation} best:\n\t{list(sorted(list(distances)))[0]}")
    if generation < num_generations - 1:
        populations = new_generation(populations, sorted_evals, 10)

Generation 0 best:
	4407863.396626647
Generation 50 best:
	3293578.9244242073
Generation 100 best:
	2938604.0072293095
Generation 150 best:
	2624725.1940415935
Generation 200 best:
	2543517.3303936347
Generation 250 best:
	2526412.4243099485
Generation 300 best:
	2523277.875122347
Generation 350 best:
	2523277.875122347


In [22]:
distances

[5528428.507625384,
 5798765.715982061,
 6233825.4043046385,
 5528428.507625384,
 5528428.507625384,
 5573013.953519357]

In [None]:
# 12, 11, 9

In [202]:
layout = populations[0]
print(layout[:12])
print(layout[12:(12+11)])
print(layout[12+11:])

['j', 'i', 'x', 'o', 'a', 'n', 'q', 'y', 'u', ',', 'c', 'k']
['r', 'p', 'm', 'e', 'v', '.', 't', 'd', ';', 'l', 'w']
['h', 'z', 's', "'", '[', 'b', 'f', ']', 'g']
