In [1]:
# copyright @ Ziming Li
# version 1.0
# This code is created alone by Ziming Li, a Ph.D student from Tsinghua University, China.
# This code is for the final project of the my summer internship in Yunzhixin'an Technology Co., LTD, Zhengzhou, China.
# If you have any questions, please contact me by email: lzm22@mails.tsinghua.edu.cn

In [23]:
import csv
from gensim.models import Word2Vec

In [24]:
# hyperparameters
vector_size = 50

### this file: word2vec and establish the data set for training

In [25]:
file_sentences = [] # sentences from the file to establish the word2vec model
raw_strings = [] # raw strings from the file to be classified
data_set_sentences = [] # sentences from the data set to be classified, preseve the word order
class_labels = [] # class labels for the data set
split_chars = [' ', '-', '+', '(', ')', '/', '*', ',', '.', ':', '@', '#', '_', '\n']

In [26]:
# load csv file, path = './raw_training_data.csv'
# read each column, append to sentences

with open('./raw_training_data.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # get the header
    header = next(reader)
    column_number = len(header)

    for i in range(column_number):
        file_sentences.append([])
    for row in reader:

        for i in range(column_number):
            
            raw_strings.append(row[i])
            tmp_sentence = row[i].lower()

            # split sentence into words
            for char in split_chars:
                tmp_sentence = tmp_sentence.replace(char, ',')
            words = tmp_sentence.split(',')
            words = [word for word in words if word != '']

            # add to sentences recorder
            data_set_sentences.append(words)
            class_labels.append(i)
            for word in words:
                file_sentences[i].append(word)

# randomize the order of the sentences and the labels, but keep the matching between them
import random
random.seed(0)
random.shuffle(raw_strings)
random.seed(0)
random.shuffle(data_set_sentences)
random.seed(0)
random.shuffle(class_labels)

# print('Data set size:', len(data_set_sentences))
# print('Class labels size:', len(class_labels))
# print(data_set_sentences[:10])
# print(class_labels[:10])



### refer to word2vec model: https://radimrehurek.com/gensim/models/word2vec.html
### establish the word2vec model

In [27]:
lang_model = Word2Vec(
    sentences = file_sentences,
    vector_size = vector_size,
    min_count = 1,
    window = 5,
)

In [28]:
print("Vector for kelly:")
print(lang_model.wv['kelly'])
print('successfully encode words into vectors')

Vector for kelly:
[ 0.86695564 -0.28123128  0.73866993  0.7363451  -0.863093    0.2048312
  0.0258419  -0.66088855 -0.17692803 -0.7492832   0.34000912  0.06237699
  0.31536603 -0.11916703  0.22222662 -0.05943888  1.1584444   0.20450635
 -0.7332263  -1.0642958  -0.07563085  0.50791556  0.5548607   0.21765712
 -0.0282851   0.6083949  -0.4134535   1.0337824  -0.05319791  0.2527455
 -0.6261593  -0.4706853   0.60876626 -0.76131266 -0.7357941  -0.59791815
 -0.26468563  0.05810263 -0.7626965  -0.21389204  0.4078369  -0.08348862
  0.19382037 -0.24699345  0.7604668  -0.45386875 -0.10818419 -0.6490807
  0.7058431   0.4393891 ]
successfully encode words into vectors


In [29]:
sentence_length = 0
for sentence in data_set_sentences:
    sentence_length = max(sentence_length, len(sentence))

print('sentence_length:', sentence_length)

# pad the sentences to the same length

data_set_vectors = [[] for _ in range(len(data_set_sentences))] # turn data_set_sentences into vectors
for i in range(len(data_set_sentences)):
    for word in data_set_sentences[i]:
        data_set_vectors[i].append(lang_model.wv[word])
    while len(data_set_vectors[i]) < sentence_length:
        data_set_vectors[i].append([0 for _ in range(vector_size)])


sentence_length: 11


In [30]:
# divide data set into training set, validation set and test set
training_set_size = int(len(data_set_vectors) * 0.8)
validation_set_size = int(len(data_set_vectors) * 0.1)
test_set_size = len(data_set_vectors) - training_set_size - validation_set_size

training_set = data_set_vectors[:training_set_size]
training_labels = class_labels[:training_set_size]

validation_set = data_set_vectors[training_set_size:training_set_size + validation_set_size]
validation_labels = class_labels[training_set_size:training_set_size + validation_set_size]

raw_test_strings = raw_strings[training_set_size + validation_set_size:]
test_set = data_set_vectors[training_set_size + validation_set_size:]
test_labels = class_labels[training_set_size + validation_set_size:]

print('Training set size:', len(training_set))
print('Validation set size:', len(validation_set))
print('Test set size:', len(test_set))

Training set size: 176000
Validation set size: 22000
Test set size: 22000


In [31]:
# save the data set as python list type

import pickle
with open('./data/training_set.pkl', 'wb') as f:
    pickle.dump(training_set, f)
with open('./data/training_labels.pkl', 'wb') as f:
    pickle.dump(training_labels, f)

with open('./data/validation_set.pkl', 'wb') as f:
    pickle.dump(validation_set, f)
with open('./data/validation_labels.pkl', 'wb') as f:
    pickle.dump(validation_labels, f)

with open('./data/raw_test_strings.pkl', 'wb') as f:
    pickle.dump(raw_test_strings, f)
with open('./data/test_set.pkl', 'wb') as f:
    pickle.dump(test_set, f)
with open('./data/test_labels.pkl', 'wb') as f:
    pickle.dump(test_labels, f)

with open('./data/mapper_between_index_and_label.pkl', 'wb') as f:
    pickle.dump(header, f)