Prerequisites: Access data in Google Drive

In [0]:
from google.colab import drive

# Directory under which Google Drive is mounted in the Colab runtime.
MOUNT_POINT = "/content/drive/"
# Interactive step: the recorded output shows it asking for an authorization code.
drive.mount(MOUNT_POINT)
# Root folder of the dataset; the data paths used in this file are built from it.
DATA_DIR = MOUNT_POINT + "My Drive/Colab Notebooks/Thesis-Data/bg"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


Imports

In [0]:
import random
import os
import numpy
import pickle
from collections import defaultdict

Set random seed

In [0]:
def seed(seed=666):
  """Seed the random number generators used in this file.

  Seeds the standard-library `random` module and numpy's global generator,
  and stores the seed in the PYTHONHASHSEED environment variable.

  Args:
    seed: value handed to both generators and written (as text) to the
      environment.
  """
  random.seed(seed)
  # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
  # assignment does not change str hashing (and therefore set iteration order)
  # in the kernel that is already running.
  os.environ.update(PYTHONHASHSEED=str(seed))
  numpy.random.seed(seed)

# Apply the default seed as soon as the cell runs.
seed()

Read positive examples

In [0]:
def read_data(file):
  """Read a one-word-per-line text file into a set of normalized words.

  Each line is stripped of surrounding whitespace and lower-cased. Duplicates
  collapse because the result is a set. A blank line contributes the empty
  string as an element.

  Args:
    file: path of the text file to read.

  Returns:
    set of str with the normalized lines.
  """
  # Decode explicitly as UTF-8 instead of relying on the platform's locale
  # default, so the same file yields the same words on every machine.
  with open(file, encoding="utf-8") as f:
    # Iterating the handle streams the lines; no intermediate list is built.
    return { line.strip().lower() for line in f }

positive_examples = read_data(DATA_DIR + "/words.dat")

print("Positive examples size {0:,}".format(len(positive_examples)))

# default=0 keeps the cell from crashing when the word list is empty.
MAX_WORD_LENGTH = max((len(w) for w in positive_examples), default=0)
print (f'Maximum word length {MAX_WORD_LENGTH}')

# Distinct characters of all positive words, in sorted order so that the
# index assigned to each symbol is the same on every run.
alphabet_symbols = sorted({ c for w in positive_examples for c in w })

print (f'Alphabet size {len(alphabet_symbols)}')

# Symbol -> index mapping. Indices start at 1; 0 is never assigned here
# (the encoding code elsewhere in this file pads rows with 0).
alphabet = { c : i for (i, c) in enumerate(alphabet_symbols, start=1) }

with open(DATA_DIR + "/model/large/alphabet.dict", 'wb') as f:
  pickle.dump(alphabet, f)

# Write the same mapping as "<symbol>\t<index>" lines. The encoding is fixed to
# UTF-8 so symbols outside ASCII are written identically on every platform.
with open(DATA_DIR + '/model/large/alphabet.tsv', "w", encoding="utf-8") as f:
  for c, i in alphabet.items():
    f.write(f"{c}\t{i}\n")

Positive examples size 721,823
Maximum word length 25
Alphabet size 30


Generate negative words

In [0]:
def delete_random_char(s):
  """Return `s` with one randomly chosen character removed.

  Strings shorter than two characters are returned unchanged, so the result
  is never emptied by this operation.
  """
  if len(s) < 2:
    return s
  drop_at = random.randint(0, len(s) - 1)
  return s[:drop_at] + s[drop_at + 1:]

def insert_random_char(s, alphabet):
  """Return `s` with one random symbol from `alphabet` inserted.

  The insertion point is drawn uniformly from every gap in `s`, including
  before the first and after the last character.

  Args:
    s: the string to modify.
    alphabet: sequence of symbols accepted by `random.choice`.
  """
  at = random.randint(0, len(s))
  head, tail = s[:at], s[at:]
  # The position is drawn before the symbol, matching the order of random draws.
  return head + random.choice(alphabet) + tail

def replace_random_char(s, alphabet):
  """Return `s` with one randomly chosen character replaced by a different symbol.

  The replacement is drawn from the symbols of `alphabet` that differ from the
  character being replaced, so the result differs from `s` whenever such a
  symbol exists.

  Args:
    s: the string to modify.
    alphabet: sequence of candidate symbols.

  Returns:
    The modified string, or `s` unchanged when it is empty or when `alphabet`
    offers no symbol other than the selected character.
  """
  if len(s) >= 1:
    pos = random.randint(0, len(s) - 1)
    # Filtering (rather than list.remove) also works when s[pos] is not part
    # of the alphabet, and never turns the candidate list into None.
    choices = [c for c in alphabet if c != s[pos]]
    if choices:
      return s[:pos] + random.choice(choices) + s[pos + 1:]
  return s

def random_transposition(s):
  """Return `s` with one randomly chosen pair of adjacent characters swapped.

  Strings shorter than two characters are returned unchanged. When the two
  selected characters are equal the result equals `s`.
  """
  if len(s) < 2:
    return s
  left = random.randint(0, len(s) - 2)
  right = left + 1
  return s[:left] + s[right] + s[left] + s[right + 1:]

def generate_negative_examples(positive_examples, alphabet):
  """Generate as many negative words as there are positive ones.

  Every candidate is produced by applying one random edit (replace, insert,
  delete or transpose a character) to a positive word. It is kept only if it
  is not itself a positive word. The positive words are scanned repeatedly
  until enough distinct candidates have been collected.

  Args:
    positive_examples: collection of valid words (str).
    alphabet: sequence of symbols used for insertions and replacements.

  Returns:
    set of str with len(positive_examples) elements, disjoint from
    positive_examples.

  Note:
    The loop only ends once the target size is reached, so it does not
    terminate if too few distinct non-positive candidates can be produced.
  """
  target_size = len(positive_examples)
  # Set copy gives O(1) membership tests even when a list is passed in.
  known_words = set(positive_examples)
  # A set of str iterates in hash order, which differs between interpreter
  # runs. Sorting once fixes the scan order so a seeded run is repeatable.
  ordered_words = sorted(positive_examples)

  negative_examples = set()

  while len(negative_examples) < target_size:

    for w in ordered_words:

      if len(negative_examples) >= target_size:
        break

      if len(w) == 0:
        # Only an insertion can change the empty word. Previously this case
        # left w1 unbound (or stale from the previous word).
        w1 = insert_random_char(w, alphabet)
      elif len(w) == 1:
        # Deleting would empty the word and a transposition needs two
        # characters, so only replace/insert apply.
        if bool(random.getrandbits(1)):
          w1 = replace_random_char(w, alphabet)
        else:
          w1 = insert_random_char(w, alphabet)
      else:
        op = random.randint(1, 4)
        if op == 1:
          w1 = replace_random_char(w, alphabet)
        elif op == 2:
          w1 = insert_random_char(w, alphabet)
        elif op == 3:
          w1 = delete_random_char(w)
        else:
          w1 = random_transposition(w)

      # The set ignores duplicates, so only positives need an explicit check.
      if w1 not in known_words:
        negative_examples.add(w1)

  return negative_examples

# negative_examples = generate_negative_examples(positive_examples, list(alphabet.keys()))
# print (f'Generated negative examples {len(negative_examples)}')

# with open(DATA_DIR + "/numeral-50-NEG.dat", "w") as f:
#   for w in negative_examples:
#     f.write(w)
#     f.write("\n")


Generated negative examples 50


Read negative examples

In [0]:
# Load the negative words from disk, normalized by read_data.
negative_examples_path = DATA_DIR + "/negative-2.0.dat"
negative_examples = read_data(negative_examples_path)
print("Negative examples size {0:,}".format(len(negative_examples)))

Negative examples size 721,823


In [0]:
def shuffle_and_split_data(data, train_size=0.8, dev_size=0.1, test_size=0.1):
  """Shuffle `data` and cut it into train / dev / test slices.

  Args:
    data: list of items; it is shuffled IN PLACE with `random.shuffle`.
    train_size: fraction of the items that goes to the train slice.
    dev_size: fraction of the items that goes to the dev slice.
    test_size: not used by the body; the test slice is whatever remains
      after the train and dev slices.

  Returns:
    Tuple (train, dev, test) of list slices of the shuffled data.
  """
  random.shuffle(data)

  total = len(data)
  train_end = round(total * train_size)
  dev_end = round(total * (train_size + dev_size))

  return data[:train_end], data[train_end:dev_end], data[dev_end:]

# Both collections are sets of str, whose iteration order depends on the
# per-process string hash. Sorting gives the seeded shuffle below a fixed
# starting order, so the same seed yields the same split on every run.
all_data = sorted(positive_examples) + sorted(negative_examples)
train, dev, test = shuffle_and_split_data(all_data)

print("Train size {0:,}".format(len(train)))
print("Dev size {0:,}".format(len(dev)))
print("Test size {0:,}".format(len(test)))

# The maximum is taken over train and dev only; the test split is not inspected.
# default=0 avoids a crash when both splits are empty.
MAX_WORD_LENGTH = max((len(w) for w in train + dev), default=0)
print (f'Maximum word length {MAX_WORD_LENGTH}')


Train size 80
Dev size 10
Test size 10
Maximum word length 16


Serialize data as numpy arrays

In [0]:
def save_data_as_np_arrays(dataset, dataset_name):
  """Encode a list of words and save it as three .npy files.

  Reads the module-level `alphabet` (symbol -> index), `positive_examples`,
  `MAX_WORD_LENGTH` and `DATA_DIR`. Under DATA_DIR/model/experiment1/ it writes:
    <dataset_name>.data.npy   - one row of MAX_WORD_LENGTH symbol indices per
                                word, right-padded with 0
    <dataset_name>.length.npy - number of real (non-padding) symbols per row
    <dataset_name>.labels.npy - 1 if the word is in positive_examples, else 0

  Args:
    dataset: iterable of words (str) whose characters are all keys of `alphabet`.
    dataset_name: file-name stem, e.g. "train".
  """
  x, length, y = [], [], []

  for w in dataset:
    # Words longer than MAX_WORD_LENGTH are cut to fit. The previous code
    # evaluated the slice without assigning it, which left rows ragged.
    w_x = [alphabet[c] for c in w][:MAX_WORD_LENGTH]
    # Record the length of what is actually stored so it never exceeds the row.
    length.append(len(w_x))
    # The label is decided on the full, untruncated word.
    y.append(1 if w in positive_examples else 0)
    # Right-pad with 0 (a negative repeat count yields no padding).
    w_x.extend([0] * (MAX_WORD_LENGTH - len(w_x)))
    x.append(w_x)

  base_path = DATA_DIR + "/model/experiment1/" + dataset_name
  numpy.save(base_path + ".data.npy", numpy.array(x))
  numpy.save(base_path + ".length.npy", numpy.array(length))
  numpy.save(base_path + ".labels.npy", numpy.array(y))

# Serialize each split under its own file-name stem.
for split_name, split_words in (("train", train), ("dev", dev), ("test", test)):
  save_data_as_np_arrays(split_words, split_name)

Serialize data as text

In [0]:
def save_data_as_tsv(filename, data):
  """Write words with their labels as "<label>\\t<word>" lines.

  The label is "1" when the word is in the module-level `positive_examples`
  and "0" otherwise.

  Args:
    filename: path relative to the module-level `DATA_DIR` (concatenated as is).
    data: iterable of words (str).
  """
  POS, NEG = str(1), str(0)

  # The encoding is fixed to UTF-8 so words outside ASCII are written
  # identically on every platform instead of depending on the locale default.
  with open(DATA_DIR + filename, "w", encoding="utf-8") as f:
    for w in data:
      label = POS if w in positive_examples else NEG
      f.write(label + "\t" + w + "\n")

# Write each split to its own TSV file (paths are relative to DATA_DIR).
for tsv_path, split_words in (
  ("/model/experiment1/train.tsv", train),
  ("/model/experiment1/dev.tsv", dev),
  ("/model/experiment1/test.tsv", test),
):
  save_data_as_tsv(tsv_path, split_words)

Serialize the extended dataset

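For each word we add all of its non-empty prefixes (epsilon is excluded), including the word itself. Each distinct prefix is stored once, weighted by the number of times it occurs.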


In [0]:
# TODO remove reading data
def extend_dataset(data, positive_examples, alphabet, base_path):
  """Build and save the prefix-extended version of an encoded dataset.

  Every encoded word contributes all of its non-empty prefixes, the word
  itself included. Each distinct prefix becomes one example whose weight is
  the number of times it occurred. A prefix is labelled 1 when it is in
  `positive_examples`, else 0.

  Files written (numpy format):
    <base_path>.extended.data.npy    - symbol indices, right-padded with 0
    <base_path>.extended.length.npy  - prefix lengths
    <base_path>.extended.labels.npy  - 0/1 labels
    <base_path>.extended.weights.npy - occurrence counts

  Args:
    data: tuple (x, length, y). x holds rows of symbol indices and length[i]
      is the number of real symbols in x[i]. y is not used.
    positive_examples: collection of valid words (str).
    alphabet: dict mapping symbol -> index.
    base_path: path prefix of the output files.
  """
  inv_alphabet = { v : k for k, v in alphabet.items() }
  x, length, _ = data

  # Count every non-empty prefix. The prefix is grown one symbol at a time
  # instead of being re-joined from scratch for every length.
  word_to_count = defaultdict(int)
  for i, row in enumerate(x):
    prefix = ''
    for c in row[:length[i]]:
      prefix += inv_alphabet[c]
      word_to_count[prefix] += 1

  # Each word of length n must have produced exactly n prefixes.
  assert sum(length) == sum(word_to_count.values())

  # Local name, so the module-level MAX_WORD_LENGTH is not shadowed.
  # default=0 keeps an empty dataset from raising.
  max_word_length = max(length, default=0)

  x1, length1, y1, weights = [], [], [], []

  for w, count in word_to_count.items():
    w_x = [alphabet[c] for c in w]
    # No prefix is longer than the longest word, so rows only need padding.
    # The former truncation branch was unreachable and was a no-op anyway.
    w_x.extend([0] * (max_word_length - len(w_x)))
    x1.append(w_x)
    length1.append(len(w))
    y1.append(1 if w in positive_examples else 0)
    weights.append(count)

  print("Size {0:,}".format(len(x)))
  print("Extended size {0:,}".format(len(x1)))
  numpy.save(base_path + ".extended.data.npy", numpy.array(x1))
  numpy.save(base_path + ".extended.length.npy", numpy.array(length1))
  numpy.save(base_path + ".extended.labels.npy", numpy.array(y1))
  numpy.save(base_path + ".extended.weights.npy", numpy.array(weights))

def load_data(filename_data, filename_length, filename_labels):
  """Load the three arrays of a serialized dataset.

  Args:
    filename_data: path of the .npy file with the encoded words.
    filename_length: path of the .npy file with the word lengths.
    filename_labels: path of the .npy file with the labels.

  Returns:
    Tuple (data, length, labels) of numpy arrays, in that order.
  """
  paths = (filename_data, filename_length, filename_labels)
  # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
  # can execute arbitrary code; only load files from a trusted source.
  return tuple(numpy.load(path, allow_pickle=True) for path in paths)

# Load the serialized training split ...
# NOTE(review): the serialization code visible in this file writes under
# /model/experiment1/, while this cell reads /model/large/ - confirm the path.
train_data = load_data(
  filename_data=DATA_DIR + "/model/large/train.data.npy",
  filename_length=DATA_DIR + "/model/large/train.length.npy",
  filename_labels=DATA_DIR + "/model/large/train.labels.npy",
)

# ... and write its prefix-extended version next to it.
extend_dataset(
  data=train_data,
  positive_examples=positive_examples,
  alphabet=alphabet,
  base_path=DATA_DIR + "/model/large/train",
)

Size 1,154,917
Extended size 3,301,985


In [0]:
# Print two small point sets (one column per point after transposing) together
# with their covariance matrices.
for points in (
  [[0, 2], [1, 1], [2, 0]],
  [[1692, 68], [1978, 102], [1884, 110], [2151, 112], [2519, 154]],
):
  x = numpy.array(points).T
  print (x)
  print (numpy.cov(x))
# vectors = []
# with open(DATA_DIR + "/kMeans/light/vectors.txt", "r") as f:
#   lines = f.readlines()
#   for line in lines:
#     line = line.strip()
#     if line:
#       vectors.append([float(p) for p in line.split()])

# v = numpy.array(vectors).T
# #print (v)
# print (numpy.cov(v))

[[0 1 2]
 [2 1 0]]
[[ 1. -1.]
 [-1.  1.]]
[[1692 1978 1884 2151 2519]
 [  68  102  110  112  154]]
[[97732.7  9107.3]
 [ 9107.3   941.2]]
