 Categorical encoding 

In [1]:
# Use labelled training data. Each pair in `train` is [wuc_as_coded, wuc_manual]

# A Word2Vec model should learn that A is closer to B than to C. 
# That is, when A is miscoded, it is more likely to be miscoded as B than as C. 

# Also, the embedding should learn that B and C are very close, and are closer 
# than A and B. 

# This embedding can be passed as a feature to the WUC classifier. 


from gensim.models import Word2Vec

# Labelled training pairs, each [wuc_as_coded, wuc_manual]. Every pair is
# passed to Word2Vec as a two-token "sentence", so the two codes of a pair
# are each other's only context (window=1).
train = [
    ['A', 'B'],
    ['B', 'C'],
    ['B', 'C'],
    ['B', 'C'],
    ['C', 'B'],
    ['B', 'C'],
    ['B', 'C'],
]

# Skip-gram (sg=1) with a one-dimensional embedding, so each code maps to a
# single scalar. min_count=1 keeps 'A' in the vocabulary even though it
# occurs only once in `train`.
# The seed is pinned and training is limited to one worker thread: gensim's
# documentation states that a reproducible run needs both (it additionally
# mentions PYTHONHASHSEED for reproducibility across interpreter launches).
model = Word2Vec(
    sentences=train,
    sg=1,
    vector_size=1,
    min_count=1,
    window=1,
    seed=1,
    workers=1,
)

print(f'indexes: {model.wv.key_to_index}')

# Map each code to its learned embedding vector.
embeddings = {word: model.wv.get_vector(word) for word in ('A', 'B', 'C')}

print(f'embeddings: {embeddings}')

# vector_size=1, so the distance between two codes is the absolute
# difference of the single component of their vectors.
abs_A_B = abs(embeddings['A'][0] - embeddings['B'][0])
abs_A_C = abs(embeddings['A'][0] - embeddings['C'][0])
abs_B_C = abs(embeddings['B'][0] - embeddings['C'][0])


print(f"distance between A and B: {abs_A_B}")
print(f'distance between A and C: {abs_A_C}')
print(f'distance between B and C: {abs_B_C}')


# Report, without aborting the cell, each expected ordering that the learned
# embedding violates. Plain comparisons are used instead of `assert` so the
# checks still run when Python is started with -O (which strips asserts).

# Expectation 1: A is closer to B than to C.
if not (abs_A_B < abs_A_C):
    print('>> Error in assert abs_A_B < abs_A_C')

# Expectation 2: B and C are closer to each other than A and B are.
if not (abs_B_C < abs_A_B):
    print('>> Error in assert abs_B_C < abs_A_B')
    

indexes: {'B': 0, 'C': 1, 'A': 2}
embeddings: {'A': array([0.51033497], dtype=float32), 'B': array([-0.05362272], dtype=float32), 'C': array([0.02364302], dtype=float32)}
distance between A and B: 0.563957691192627
distance between A and C: 0.486691951751709
distance between B and C: 0.07726573944091797
>> Error in assert abs_A_B < abs_A_C
