In [1]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

In [2]:
# Encode every symbol as an integer index, then as a fixed-width bit vector.
# The alphabet is the 26 lowercase letters plus "0", which the corpus below
# appends to the end of each word.
alphabets = 'abcdefghijklmnopqrstuvwxyz0'

# symbol <-> integer index (the index is the symbol's position in `alphabets`)
alphabet2int = {symbol: index for index, symbol in enumerate(alphabets)}
int2alphabet = dict(enumerate(alphabets))

# Bits needed to represent the largest index: 5 bits for 27 symbols (2**5 = 32).
N_BITS = max(1, (len(alphabets) - 1).bit_length())

# integer index -> N_BITS-wide binary code, most significant bit first.
# np.unpackbits expands each uint8 into 8 bits; only the low N_BITS columns
# carry information, so the leading columns are dropped.
binary = np.unpackbits(
    np.arange(len(alphabets), dtype=np.uint8).reshape(-1, 1), axis=1
)[:, -N_BITS:]
int2binary = dict(enumerate(binary))

int2binary, alphabet2int, int2alphabet

({0: array([0, 0, 0, 0, 0], dtype=uint8),
  1: array([0, 0, 0, 0, 1], dtype=uint8),
  2: array([0, 0, 0, 1, 0], dtype=uint8),
  3: array([0, 0, 0, 1, 1], dtype=uint8),
  4: array([0, 0, 1, 0, 0], dtype=uint8),
  5: array([0, 0, 1, 0, 1], dtype=uint8),
  6: array([0, 0, 1, 1, 0], dtype=uint8),
  7: array([0, 0, 1, 1, 1], dtype=uint8),
  8: array([0, 1, 0, 0, 0], dtype=uint8),
  9: array([0, 1, 0, 0, 1], dtype=uint8),
  10: array([0, 1, 0, 1, 0], dtype=uint8),
  11: array([0, 1, 0, 1, 1], dtype=uint8),
  12: array([0, 1, 1, 0, 0], dtype=uint8),
  13: array([0, 1, 1, 0, 1], dtype=uint8),
  14: array([0, 1, 1, 1, 0], dtype=uint8),
  15: array([0, 1, 1, 1, 1], dtype=uint8),
  16: array([1, 0, 0, 0, 0], dtype=uint8),
  17: array([1, 0, 0, 0, 1], dtype=uint8),
  18: array([1, 0, 0, 1, 0], dtype=uint8),
  19: array([1, 0, 0, 1, 1], dtype=uint8),
  20: array([1, 0, 1, 0, 0], dtype=uint8),
  21: array([1, 0, 1, 0, 1], dtype=uint8),
  22: array([1, 0, 1, 1, 0], dtype=uint8),
  23: array([1, 0, 1,

In [3]:
# Training corpus: the model predicts the next character from the previous
# N_CHARS_IN characters.
#
# With N_CHARS_IN = 4 the model takes "xgbo" and predicts "o",
# so a training sample is (x, y) = ("xgbo", "o"), encoded as numbers.

# training data

WORD = "xgboost0"               # the only word in the corpus; "0" marks its end
MAXLEN_PER_WORD = len(WORD)
N_SAMPLES = 1000                # how many times WORD is repeated in the corpus
N_CHARS_IN = 4                  # number of context characters fed to the model
N_CHARS_OUT = 1                 # NOTE(review): not referenced by the visible cells
N_LABELS = len(int2binary[0])   # bits per encoded character

# String repetition builds the corpus in one step instead of N_SAMPLES
# successive concatenations.
corpus = WORD * N_SAMPLES

In [4]:
# Slide a window over the corpus: N_CHARS_IN characters of context are the
# input, and the character right after the window is the target.
x_train, y_train = [], []

# The range stops MAXLEN_PER_WORD characters before the end of the corpus, so
# every window and its target character stay inside the string.
for start in range(len(corpus) - MAXLEN_PER_WORD):
    stop = start + N_CHARS_IN
    x_text, y_text = corpus[start:stop], corpus[stop]

    # characters -> integer indices
    x_text_int = [alphabet2int[symbol] for symbol in x_text]
    y_text_int = alphabet2int[y_text]

    # integer indices -> bit vectors
    x_train.append([int2binary[index] for index in x_text_int])
    y_train.append(int2binary[y_text_int])

# Show the last generated sample at each stage of the encoding.
print(f"Text: {x_text} -> {y_text}")
print(f"Int: {x_text_int} -> {y_text_int}")
print(f"Binary: {x_train[-1]} -> {y_train[-1]}")

Text: 0xgb -> o
Int: [26, 23, 6, 1] -> 14
Binary: [array([1, 1, 0, 1, 0], dtype=uint8), array([1, 0, 1, 1, 1], dtype=uint8), array([0, 0, 1, 1, 0], dtype=uint8), array([0, 0, 0, 0, 1], dtype=uint8)] -> [0 1 1 1 0]


In [5]:
# Stack the samples into arrays, then flatten each sample's per-character bit
# vectors into a single feature row:
# (n_samples, N_CHARS_IN, N_LABELS) -> (n_samples, N_CHARS_IN * N_LABELS)
y_train = np.array(y_train)
x_train = np.array(x_train)
x_train = x_train.reshape(len(x_train), -1)
print(f"X shape: {x_train.shape}")

X shape: (7992, 20)


In [6]:
# Build and fit the model. Each target row is a vector of bits, so the
# classifier is trained with a per-bit logistic objective and the
# "multi_output_tree" strategy.
# NOTE(review): 10000 boosting rounds is a lot for a corpus that repeats one
# 8-character word; a much smaller value probably suffices. Confirm before changing.
xgb_params = dict(
    tree_method="hist",
    multi_strategy="multi_output_tree",
    objective="binary:logistic",
    n_estimators=10000,
)
model = xgb.XGBClassifier(**xgb_params)
model.fit(x_train, y_train)


In [12]:
# make predictions

# Example context string. predict_next_char below takes its own `input_text`
# parameter and does not read this module-level value.
input_text = "gboo"

def predict_next_char(input_text, model):
    """Predict the character that follows ``input_text``.

    Parameters
    ----------
    input_text : str
        Context characters. Each one must be a key of ``alphabet2int``.
        Presumably it must have the same length as the contexts the model
        was trained on, since the feature count is fixed at fit time.
    model
        Fitted estimator whose ``predict`` accepts one flattened row of bits
        and returns the predicted bits of the next character for that row.

    Returns
    -------
    str
        The decoded character. Bit patterns beyond the last alphabet index
        are clamped to the last symbol of ``int2alphabet``.

    Raises
    ------
    KeyError
        If ``input_text`` contains a character missing from ``alphabet2int``.
    """
    # characters -> integer indices -> one flattened row of bits
    input_text_int = [alphabet2int[c] for c in input_text]
    input_text_binary = np.array([int2binary[c] for c in input_text_int]).reshape(1, -1)

    # One sample goes in, so one row of bits comes out; flatten it to a 1-D
    # vector of 0/1 integers.
    pred = np.asarray(model.predict(input_text_binary)).reshape(-1)
    pred_bits = (pred != 0).astype(np.int64)

    # Bits (most significant first) -> integer, via place values. This works
    # for any code width and avoids calling int() on a 1-element array, which
    # NumPy has deprecated.
    place_values = 1 << np.arange(pred_bits.size - 1, -1, -1)
    pred_int = int(pred_bits @ place_values)

    # The code space can be larger than the alphabet; map unused codes to the
    # last symbol.
    pred_int = min(pred_int, len(int2alphabet) - 1)

    # integer index -> character
    return int2alphabet[pred_int]

# Preview: predict the next character for the first 10 context windows of the
# corpus (fewer if the corpus has fewer windows).
n_windows = len(corpus) - MAXLEN_PER_WORD
input_samples = [
    corpus[start:start + N_CHARS_IN] for start in range(min(10, n_windows))
]

for input_text in input_samples:
    next_char = predict_next_char(input_text, model)
    print(f"Input: {input_text} -> {next_char}")

Input: xgbo -> o
Input: gboo -> s
Input: boos -> t
Input: oost -> 0
Input: ost0 -> x
Input: st0x -> g
Input: t0xg -> b
Input: 0xgb -> o
Input: xgbo -> o
Input: gboo -> s
