In [1]:
import pandas as pd
import numpy as np
import codecs
from nltk.tokenize import sent_tokenize
from collections import Counter, defaultdict
import re
import dill as pickle
import random
from tqdm import tqdm

example.txt  meta.json      meta_json.json
meta.csv     meta_json.csv  word_level_meta.csv


In [2]:
word_level_meta_path = '../../data/preprocessed/word_level_meta.csv'
word_level_df = pd.read_csv(word_level_meta_path)

In [3]:
available_image_letters = word_level_df.token.values.tolist()
print(available_image_letters[:10])

['Or', 'when', 'he', 'found', 'cock-fighting', 'going', 'on', 'in', 'church', '?']


In [26]:
data_path = '../../data/raw/language_model/wikitext-103/wiki.train.tokens'
small_data_path = '../../data/raw/language_model/wikitext-2/wiki.train.tokens'


def create_data(data_path, save=False, target_sep=['<<', '>>'], out_path='../../data/processed/', dataset_name='wiki2'):
    random.seed(100)
    X = []
    y = []
    raw = []
    bad_words = ['=', '戦', '場', 'の', 'ヴ', 'ァ', 'ル', 'キ', 'ュ', 'リ', 'ア', '戦場のヴァルキュリア3']
    N = 0
    threshold_length = 3
    iteration_threshold = 30
    bad_lines = 0
    counter = Counter()
    
    with codecs.open(data_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.rstrip():
                if not any(bad_word in line for bad_word in bad_words):
                    for sent in sent_tokenize(line):
                        split_sent = sent.split()
                        found = True
                        iteration = 0
                        while (found == True):
                            iteration += 1
                            random_int = random.randint(0, len(split_sent)-1)
                            target = split_sent[random_int]
                            if (len(target) >= threshold_length) and (target in available_image_letters):
#                                 X.append(split_sent[:random_int] + ['<TARGET>'] + split_sent[random_int+1:])
                                X.append(split_sent[:random_int] + [''.join([target_sep[0]] + [target] + [target_sep[1]])] 
                                             + split_sent[random_int+1:])
                                y.append((target, random_int))
                                raw.append(split_sent)
                                found = False
                                # get word counts 
                                for token in split_sent:
                                    counter[token] += 1

                            if (iteration >= iteration_threshold):
                                found = False
                                bad_lines += 1
                if N % 5000 == 0:
                    print('processed {} lines'.format(N))
#                 if N >= 100: break
                N += 1
        print('bad_lines', bad_lines)

    if save:
        np.save(out_path + 'X_' + dataset_name, X)
        np.save(out_path + 'y_' + dataset_name, y)
        np.save(out_path + 'raw_' + dataset_name, raw)


    return X, y, raw, counter


X, y, raw, counter = create_data(data_path, save=True, dataset_name='wiki103')


processed 0 lines
processed 5000 lines
processed 10000 lines
processed 15000 lines
processed 20000 lines
processed 25000 lines
processed 30000 lines
processed 35000 lines
processed 40000 lines
processed 45000 lines
processed 50000 lines
processed 55000 lines
processed 60000 lines
processed 65000 lines
processed 70000 lines
processed 75000 lines
processed 80000 lines
processed 85000 lines
processed 90000 lines
processed 95000 lines
processed 100000 lines
processed 105000 lines
processed 110000 lines
processed 115000 lines
processed 120000 lines
processed 125000 lines
processed 130000 lines
processed 135000 lines
processed 140000 lines
processed 145000 lines
processed 150000 lines
processed 155000 lines
processed 160000 lines
processed 165000 lines
processed 170000 lines
processed 175000 lines
processed 180000 lines
processed 185000 lines
processed 190000 lines
processed 195000 lines
processed 200000 lines
processed 205000 lines
processed 210000 lines
processed 215000 lines
processed 220

In [24]:
raw

[['The',
  'game',
  'began',
  'development',
  'in',
  '2010',
  ',',
  'carrying',
  'over',
  'a',
  'large',
  'portion',
  'of',
  'the',
  'work',
  'done',
  'on',
  'Valkyria',
  'Chronicles',
  'II',
  '.'],
 ['While',
  'it',
  'retained',
  'the',
  'standard',
  'features',
  'of',
  'the',
  'series',
  ',',
  'it',
  'also',
  'underwent',
  'multiple',
  'adjustments',
  ',',
  'such',
  'as',
  'making',
  'the',
  'game',
  'more',
  '<unk>',
  'for',
  'series',
  'newcomers',
  '.'],
 ['Character',
  'designer',
  '<unk>',
  'Honjou',
  'and',
  'composer',
  'Hitoshi',
  'Sakimoto',
  'both',
  'returned',
  'from',
  'previous',
  'entries',
  ',',
  'along',
  'with',
  'Valkyria',
  'Chronicles',
  'II',
  'director',
  'Takeshi',
  'Ozawa',
  '.'],
 ['A', 'large', 'team', 'of', 'writers', 'handled', 'the', 'script', '.'],
 ['The',
  'game',
  "'s",
  'opening',
  'theme',
  'was',
  'sung',
  'by',
  'May',
  "'n",
  '.'],
 ['It',
  'met',
  'with',
  'positive

In [15]:
counter['<<']

0

In [22]:
np.array(X[:10])

array([list(['The', 'game', 'began', 'development', 'in', '2010', ',', 'carrying', 'over', 'a', 'large', 'portion', 'of', 'the', '<<work>>', 'done', 'on', 'Valkyria', 'Chronicles', 'II', '.']),
       list(['While', 'it', 'retained', 'the', 'standard', 'features', 'of', 'the', 'series', ',', 'it', 'also', 'underwent', 'multiple', 'adjustments', ',', 'such', 'as', 'making', 'the', 'game', 'more', '<unk>', 'for', '<<series>>', 'newcomers', '.']),
       list(['Character', 'designer', '<unk>', 'Honjou', 'and', '<<composer>>', 'Hitoshi', 'Sakimoto', 'both', 'returned', 'from', 'previous', 'entries', ',', 'along', 'with', 'Valkyria', 'Chronicles', 'II', 'director', 'Takeshi', 'Ozawa', '.']),
       list(['A', 'large', 'team', 'of', 'writers', 'handled', '<<the>>', 'script', '.']),
       list(['The', 'game', "'s", 'opening', 'theme', '<<was>>', 'sung', 'by', 'May', "'n", '.']),
       list(['It', 'met', 'with', 'positive', 'sales', 'in', 'Japan', ',', 'and', 'was', 'praised', 'by', 'both', 

In [79]:

# bad_words = ['=', '戦', '場', 'の', 'ヴ', 'ァ', 'ル', 'キ', 'ュ', 'リ', 'ア', '戦場のヴァルキュリア3']
# N = 0
# threshold_length = 3
# iteration_threshold = 30
# bad_lines = 0
# counter = Counter()

# X = []
# y = []

    
# with codecs.open(small_data_path, encoding='utf-8') as f:
#     for line in f:
#         line = line.strip()
#         if line.rstrip():
#             if not any(bad_word in line for bad_word in bad_words):
#                 for sent in sent_tokenize(line):
#                     split_sent = sent.split()
#                     found = True
#                     iteration = 0
#                     while (found == True):
#                         iteration += 1
#                         random_int = random.randint(0, len(split_sent)-1)
#                         target = split_sent[random_int]
#                         if (len(target) >= threshold_length) and (target in available_image_letters):
#                             X.append(split_sent[:random_int] + ['<TARGET>'] + split_sent[random_int+1:])
#                             y.append((target, random_int))
#                             found = False
#                             # get word counts 
#                             for token in split_sent:
#                                 counter[token] += 1
                    
#                         if (iteration >= iteration_threshold):
#                             found = False
#                             bad_lines += 1
#             if N % 1000 == 0:
#                 print('processed {} lines'.format(N))
# #             if N >= 100: break
#             N += 1
            
# bad_lines

processed 0 lines
processed 1000 lines
processed 2000 lines
processed 3000 lines
processed 4000 lines
processed 5000 lines
processed 6000 lines
processed 7000 lines
processed 8000 lines
processed 9000 lines
processed 10000 lines
processed 11000 lines
processed 12000 lines
processed 13000 lines
processed 14000 lines
processed 15000 lines
processed 16000 lines
processed 17000 lines
processed 18000 lines
processed 19000 lines
processed 20000 lines
processed 21000 lines
processed 22000 lines
processed 23000 lines


2382

In [73]:
counter.most_common(10)

[('the', 485),
 (',', 385),
 ('.', 259),
 ('of', 223),
 ('and', 169),
 ('to', 168),
 ('in', 118),
 ('was', 95),
 ('a', 91),
 ('"', 74)]

In [5]:
!ls ../../data/raw/language_model/wikitext-103/wiki.train.tokens

wiki.test.tokens  wiki.train.tokens  wiki.valid.tokens


In [12]:
!head ../../data/raw/language_model/wikitext-103/wiki.train.tokens

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newc