# Step 3

Converting sequences of words into their phoneme counterparts, and also including some noise.

In [1]:
import os
import numpy as np

In [2]:
input_file = os.getcwd() + "/data/gutenberg_np_60k.txt"
version = "s3.1"

## Part 1 - SentencePiece embeddings

In [3]:
import sentencepiece as spm

In [4]:
max_sentence_length = 9000
vocab_size = 19099
model_type = "unigram"
SP_MODEL_NAME = f"models/{model_type}_vs{vocab_size}_{version}"

In [None]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train(
    f"--input={input_file} " \
    f"--model_type={model_type} " \
    f"--model_prefix={SP_MODEL_NAME} " \
    f"--vocab_size={vocab_size} " \
    f"--max_sentence_length={max_sentence_length} " \
    f"--train_extremely_large_corpus"
)

In [5]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load(f"{SP_MODEL_NAME}.model")

# encode: text => id
print(sp.EncodeAsPieces('apple'))
print(sp.encode_as_ids('boyhood'))
print(sp.encode_as_ids('boy'))
print(sp.encode_as_ids('man'))

['▁', 'ap', 'ple']
[222, 1668, 473, 1816, 103, 238]
[222, 1668, 473]
[222, 8585]


## Step 2 - Word2Vec encodings

We use the SentencePiece encoder on our phoneme dataset to create a new dataset with it's tokens.

In [7]:
# TODO : Replace with entire dataset, if possible
with open(input_file) as corpus_file:
    corpus = corpus_file.readlines()

sentences = [sp.EncodeAsPieces(sentence) for sentence in corpus]

In [None]:
select_from = 920

sentences[0][select_from:select_from + 10]

Now, training Word2Vec

In [6]:
from gensim.models.word2vec import Word2Vec

In [7]:
window = 5
vector_size = 100

In [8]:
W2V_MODEL_PATH = f"models/w2v_vs{vector_size}_w{window}_{version}.model"

In [22]:
model = Word2Vec(sentences, size=vector_size, window=window, min_count=0, workers=4)

In [23]:
model.save(W2V_MODEL_PATH)

## Step 3 - Testing

In [9]:
model = Word2Vec.load(W2V_MODEL_PATH)

In [11]:
model.wv.most_similar("PɐL", topn=10)

[('PʌL', 0.978600025177002),
 ('Pʌl', 0.9757851362228394),
 ('Pɐl', 0.9756471514701843),
 ('pɐL', 0.9684648513793945),
 ('pʌl', 0.967623233795166),
 ('pɐl', 0.9671772122383118),
 ('pʌL', 0.953802764415741),
 ('pɐ', 0.4324934482574463),
 ('Pɐ', 0.42574286460876465),
 ('Dɨf', 0.4195268154144287)]

## Step 4 - Clustering

In [12]:
from tqdm import tqdm

In [13]:
words = list(model.wv.vocab.keys())

word_to_cluster = dict()  # Stores map from word to cluster
cluster_to_words = dict()  # Stores map from cluster to words
cluster_idx = 0  # Counter

for word in tqdm(words):
    # Check if word has already been clustered
    if word not in word_to_cluster.keys():
        # Create new cluster
        cluster_idx += 1
        cluster_key = chr(0x0020 + cluster_idx)

        # Add new word to cluster
        cluster_to_words[cluster_key] = [word]
        word_to_cluster[word] = cluster_key
        
        # Add all similar words
        for similar_word, score in model.wv.most_similar(word, topn=200):
            if score > 0.88:
                cluster_to_words[cluster_key].append(similar_word)
                word_to_cluster[similar_word] = cluster_key

100%|██████████| 19078/19078 [00:02<00:00, 6556.95it/s] 


In [13]:
len(cluster_to_words)

2539

In [16]:
list(cluster_to_words.keys())[-10:]

['ਂ', 'ਃ', '\u0a04', 'ਅ', 'ਆ', 'ਇ', 'ਈ', 'ਉ', 'ਊ', '\u0a0b']

In [17]:
with open("logs/s3.1_clusters.txt", "w+", encoding="utf-8") as cluster_fp:
    for cluster in cluster_to_words.keys():
        cluster_fp.write(cluster +"\t" +"\t".join(sorted(cluster_to_words[cluster])) +"\n")

In [18]:
cluster_to_words[list(cluster_to_words.keys())[201]]

['post',
 'pOst',
 'poSt',
 'POSt',
 'POST',
 'PosT',
 'Post',
 'POsT',
 'PoST',
 'posT',
 'PoSt',
 'poST',
 'pOsT',
 'pOSt',
 'POst',
 'pOST']

In [14]:
clustered_output_filepath = "data/gutenberg_np_clustered.txt"

In [20]:
with open(clustered_output_filepath, "w+", encoding="utf-8") as clustered_output_file:
    for sentence in sentences:
        clustered_output_file.write("".join([word_to_cluster[word] for word in sentence]) +"\n")

# clustered_sentences = [
#     [word_to_cluster[word] for word in sentence]
#     for sentence in sentences
# ]

## Step 5 - Splitting clustered data into units

In [15]:
cluster_input_file = clustered_output_filepath
clustered_version = "s3.2"

In [16]:
max_sentence_length = 8000
vocab_size = 19099
model_type = "unigram"
SP_MODEL_NAME_CLUSTERS = f"models/{model_type}_vs{vocab_size}_{clustered_version}"

In [None]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train(
    f"--input={cluster_input_file} " \
    f"--model_type={model_type} " \
    f"--model_prefix={SP_MODEL_NAME_CLUSTERS} " \
    f"--vocab_size={vocab_size} " \
    f"--max_sentence_length={max_sentence_length} " \
    f"--train_extremely_large_corpus"
)

In [17]:
# makes segmenter instance and loads the model file (m.model)
spc = spm.SentencePieceProcessor()
spc.load(f"{SP_MODEL_NAME_CLUSTERS}.model")

True

In [30]:
with open(cluster_input_file) as corpus_file:
    corpus = corpus_file.readlines()

clustered_sentences = [spc.EncodeAsPieces(sentence) for sentence in corpus]

## Step 6 - Word2Vec from clustered data

In [31]:
window_c = 5
vector_size_c = 100

In [32]:
W2V_MODEL_PATH_CLUSTERS = f"models/w2v_vs{vector_size_c}_w{window_c}_{clustered_version}.model"

In [34]:
model_c = Word2Vec(clustered_sentences, size=vector_size_c, window=window_c, min_count=0, workers=4, iter=15)

In [35]:
model_c.save(W2V_MODEL_PATH_CLUSTERS)

## Step 7 - Testing

In [36]:
model_c = Word2Vec.load(W2V_MODEL_PATH_CLUSTERS)

In [37]:
def find_most_similar_clusters(test_word, topn=5):
    most_similar = list()
    for cluster, score in model_c.wv.most_similar(word_to_cluster[test_word], topn=topn):
        most_similar.append(([cluster_to_words[char][:3] for char in cluster], score))
    
    return most_similar

In [38]:
test_word = "HÄNDZ"
# sp.EncodeAsPieces(test_word)

find_most_similar_clusters(test_word, 15)

[([['hänD', 'hÄND', 'HÄnɖ'], ['S', 's'], ['Änɖ', 'äNɖ', 'ÄNɖ']],
  0.8297650218009949),
 ([['śOlD', 'ʃoLD', 'śoLD'], ['ɜz', 'ɝz', 'ɝZ']], 0.8046799302101135),
 ([['RÆT', 'ʁÆt', 'RæT'], ['āRm', 'ARm', 'ārm']], 0.7949904799461365),
 ([['LɨPs', 'LipS', 'lɨPS']], 0.7779038548469543),
 ([['hänD', 'hÄND', 'HÄnɖ']], 0.7738319039344788),
 ([['hänD', 'hÄND', 'HÄnɖ'], ['S', 's']], 0.7736732959747314),
 ([['Mąd', 'mĄð', 'Mąð']], 0.7693848609924316),
 ([['āRm', 'ARm', 'ārm'], ['Zʌnɖ', 'zɐnD', 'zɐND']], 0.7676824331283569),
 ([['pɨ', 'Pɨ', 'pi'], ['LO', 'Lo', 'lo']], 0.7655820846557617),
 ([['RÆT', 'ʁÆt', 'RæT'], ['hänD', 'hÄND', 'HÄnɖ']], 0.7582882642745972),
 ([['śOlD', 'ʃoLD', 'śoLD'], ['ɜz', 'ɝz', 'ɝZ'], ['ʌNɖ', 'ɐND', 'ɐNɖ']],
  0.7423087954521179),
 ([['hänD', 'hÄND', 'HÄnɖ'], ['ʌNɖ', 'ɐND', 'ɐNɖ']], 0.731292724609375),
 ([['śOlD', 'ʃoLD', 'śoLD'], ['ɝ', 'ɜ', 'ɚ']], 0.7303489446640015),
 ([['tı', 'Tı', 'tI'], ['ʈ']], 0.7164862155914307),
 ([['čI', 'Čı', 'ĆI'], ['kS', 'KS', 'Ks']], 0.716256916

In [39]:
test_cluster = word_to_cluster[test_word]
print(test_cluster)

к


In [28]:
most_similar = model_c.wv.most_similar(test_cluster)
print(most_similar)

[('ј', 0.6868019104003906), ('ÈT', 0.569015622138977), ('Rɒ', 0.5664949417114258), ('Ԫ', 0.5378398895263672), ('ɢt', 0.5141922831535339), ('ɕ', 0.5025638341903687), ('/±', 0.49946022033691406), ('iɣ', 0.49369749426841736), ('7', 0.48990678787231445), ('ȧA', 0.46953094005584717)]


In [29]:
cluster_to_words[most_similar[0][0][0]]

['ńg', 'ńɠ', 'ŃG', 'Ńɠ', 'Ńg', 'ńG']