# **This code is for training custom word embedding. Below is the original code (tutorial) I adapted from.**

# The codes can be used in the jupyter notebook as well (with minimal adaptation)

# Source:

https://www.kaggle.com/chewzy/tutorial-how-to-train-your-custom-word-embedding



In [0]:
import numpy as np
import pandas as pd
import os
import re
import time

from gensim.models import Word2Vec
from tqdm import tqdm

tqdm.pandas()

# **Importing dataset**

# After downloading the dataset (from /data folder), change path in order to load the file (for own use)

# For training custom word embedding, chat.csv dataset (which consists of all conversations) is used. 


In [2]:
# may need to change path for own use

data = "/content/drive/My Drive/Colab Notebooks/Colab Datasets/chat.csv"
data = pd.read_csv(data,encoding= 'unicode_escape')
data.head()

Unnamed: 0,channel,body
0,2343-chat_chatroom_bot_seeker-119-1,Hello!
1,2343-chat_chatroom_bot_seeker-119-1,Goode
2,2346-chat_chatroom_bot_seeker-120-1,hello
3,2349-chat_chatroom_giver_seeker-152-1,Good
4,2349-chat_chatroom_giver_seeker-152-1,Hello


# **Spliting data into train (80%) and test (20%) datasets**

In [0]:
data['split'] = np.random.randn(data.shape[0], 1)

msk = np.random.rand(len(data)) <= 0.8

df_train = data[msk]
df_test = data[~msk]

# **Preprocessing**

In [0]:
def preprocessing(titles_array):
    
    """
    Take in an array of titles, and return the processed titles.
    
    (e.g. input: 'i am a boy', output - 'am boy')  -> since I remove those words with length 1

    """
    
    processed_array = []
    
    for title in tqdm(titles_array):
        
        # remove other non-alphabets symbols with space (i.e. keep only alphabets and whitespaces).
        processed = re.sub('[^a-zA-Z ]', '', title)
        
        words = processed.split()
        
        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_array.append(' '.join([word for word in words if len(word) > 1]))
    
    return processed_array

In [5]:
df_train['processed'] = preprocessing(df_train['body'])
df_test['processed'] = preprocessing(df_test['body'])

sentences = pd.concat([df_train['processed'], df_test['processed']],axis=0)
train_sentences = list(sentences.progress_apply(str.split).values)

100%|██████████| 37531/37531 [00:00<00:00, 150961.03it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
100%|██████████| 9308/9308 [00:00<00:00, 157730.16it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|██████████| 46839/46839 [00:00<00:00, 310434.63it/s]


In [25]:
df_test.head()


Unnamed: 0,channel,body,split,processed
0,2343-chat_chatroom_bot_seeker-119-1,Hello!,0.477785,Hello
8,2432-chat_chatroom_giver_seeker-153-1,is it because of school load?,-1.189789,is it because of school load
21,2432-chat_chatroom_giver_seeker-153-1,I'm doing well. How are you?,2.026907,Im doing well How are you
26,2432-chat_chatroom_giver_seeker-153-1,I have one and that helps but she is still kin...,-0.080865,have one and that helps but she is still kinda...
28,2432-chat_chatroom_giver_seeker-153-1,I have been like that my entire life though,-0.690493,have been like that my entire life though


# **Customizing own embedding**

# In my model, the following features/parameters are used.

1) word2vec - skip-gram

2) window = 5 (maximum distance between the current and predicted word within a sentence) 

3) min_count = 5 (ignore words with frequency lower than 5)

4) size = 100 (dimensionality)


In [14]:
# Parameters reference : https://radimrehurek.com/gensim/models/word2vec.html
# https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network-e9cde4a81296
# https://code.google.com/archive/p/word2vec/
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

# Feel free to customise your own embedding

start_time = time.time()

model = Word2Vec(sentences=train_sentences, 
                 sg=1, # 1 is skipgram, otherwise CBOW,
                 window = 5, #maximum distance between the current and predicted word within a sentence
                 min_count = 5, #ignore all words with total frequency lower than this
                 size=100,  # size is dimensionality of the word vectors
                 workers=8) # faster training with multicore machines (worker threads)

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')

Time taken : 0.18 mins


In [15]:
# Total number of vocab in our custom word embedding

len(model.wv.vocab.keys())

3410

# Checking out the custom word embedding

In [17]:
# Check out the dimension of each word (we set it to 100 in the above training step)

model.wv.vector_size

100

In [18]:
# Check out how 'help' is represented (an array of 100 numbers)

# model.wv.get_vector()

model.wv.get_vector('help')


array([ 0.21495429, -0.50048214, -0.24143033, -0.34450617, -0.25399566,
       -0.25645986, -0.07022206, -0.13176784,  0.00922605,  0.38232967,
        0.27231622, -0.06061785, -0.19137514, -0.05118341, -0.68798375,
       -0.44578826,  0.04183128,  0.15660809,  0.32332975,  0.04915569,
        0.26782903,  0.85323566,  0.55630547, -0.01376082, -0.17712848,
        0.17594993, -0.24976543, -0.10387248, -0.10015015,  0.12089061,
        0.1234631 , -0.04847742,  0.4745364 , -0.07295658, -0.2096475 ,
       -0.09335148,  0.43573454, -0.02736659,  0.22523093, -0.05802866,
        0.45304695,  0.42329055,  0.05600791, -0.11216345, -0.39543858,
       -0.32024956,  0.14385705, -0.8615967 , -0.13376516,  0.46861103,
       -0.3354829 , -0.20952511,  0.52477056, -0.47828615, -0.33059335,
       -0.13526139, -0.4701433 ,  0.4411752 ,  0.27758184, -0.137238  ,
        0.76059896, -0.08159399, -0.55866456, -0.0097414 , -0.40151542,
        0.39222232,  0.14563218,  0.0568664 ,  0.00285046, -0.64

In [19]:
word_vectors = model.wv
result = word_vectors.similar_by_word("not")
print(result[:3])

[('Not', 0.5344575643539429), ('Fair', 0.5196408033370972), ('doesnt', 0.4999895989894867)]


  if np.issubdtype(vec.dtype, np.int):


In [21]:
result = word_vectors.most_similar(positive = ["charity"], negative = ['person'])
print("Most similar to 'charity' but dissimilar to 'person':\n", result[:3])

Most similar to 'charity' but dissimilar to 'person':
 [('organization', 0.4008829593658447), ('foundation', 0.37110769748687744), ('Charity', 0.31258493661880493)]


  if np.issubdtype(vec.dtype, np.int):


In [22]:
result = word_vectors.most_similar(positive=['children', 'charity'], negative=['money'], topn=1)
print(result)

[('organization', 0.5570844411849976)]


  if np.issubdtype(vec.dtype, np.int):


# **Save the model to a specific path**

The path & filename can be changed 

In [0]:
# may need to change path for own use

model.wv.save_word2vec_format('/content/drive/My Drive/Colab Notebooks/Colab Datasets/custom_glove_100d.txt')

# How to load:
# w2v = KeyedVectors.load_word2vec_format('custom_glove_100d_1.txt')

# How to get vector using loaded model
# w2v.get_vector('iphone')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
