In [1]:
import pandas as pd
import numpy as np
import codecs
from nltk.tokenize import sent_tokenize
from collections import Counter, defaultdict
import re
import dill as pickle
import random
from tqdm import tqdm
import os

In [2]:
word_level_meta_path = '../../data/preprocessed/word_level_meta.csv'
word_level_df = pd.read_csv(word_level_meta_path)

In [3]:
available_image_letters = word_level_df.token.values.tolist()
print(available_image_letters[:10])

['Or', 'when', 'he', 'found', 'cock-fighting', 'going', 'on', 'in', 'church', '?']


In [20]:

wikitext2_path = '../../data/raw/language_model/wikitext-2'
wikitext103_path = '../../data/raw/language_model/wikitext-103'

def create_full_path(wiki_path, dataset):
    """Joins to paths together for wiki dataset"""
    return os.path.join(wiki_path, 'wiki.{}.tokens'.format(dataset))

def create_data(wiki_path, dataset, save=False, target_sep=['<<', '>>'], threshold_length=3,
                iteration_threshold=30, out_path='../../data/processed/'):
    """
    Loads and searchs for useable target word (one we have an image for) and saves wiki text dataset
    
    Args:
    
    
    Returns:
    
    """
    dataset_name = wiki_path.split('/')[-1]
    data_path = create_full_path(wikitext2_path, 'train')
    
    random.seed(100)
    X = []
    y = []
    raw = []
    bad_words = ['=', '戦', '場', 'の', 'ヴ', 'ァ', 'ル', 'キ', 'ュ', 'リ', 'ア', '戦場のヴァルキュリア3']
    N = 0
    bad_lines = 0
    counter = Counter()
    
    with codecs.open(data_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.rstrip():
                if not any(bad_word in line for bad_word in bad_words):
                    for sent in sent_tokenize(line):
                        split_sent = sent.split()
                        found = True
                        iteration = 0
                        while (found == True):
                            iteration += 1
                            random_int = random.randint(0, len(split_sent)-1)
                            target = split_sent[random_int]
                            if (len(target) >= threshold_length) and (target in available_image_letters):
#                                 X.append(split_sent[:random_int] + ['<TARGET>'] + split_sent[random_int+1:])
                                X.append(split_sent[:random_int] + [''.join([target_sep[0]] + [target] + [target_sep[1]])] 
                                             + split_sent[random_int+1:])
                                y.append((target, random_int))
                                raw.append(split_sent)
                                found = False
                                # get word counts 
                                for token in split_sent:
                                    counter[token] += 1

                            if (iteration >= iteration_threshold):
                                found = False
                                bad_lines += 1
                if N % 5000 == 0:
                    print('processed {} lines'.format(N))
#                 if N >= 100: break
                N += 1
        print('bad_lines', bad_lines)
    if save:
        print('Save Path', out_path + '----' + dataset_name + '-' + dataset)
        np.save(out_path + 'X_' + dataset_name + '-' + dataset, X)
        np.save(out_path + 'y_' + dataset_name + '-' + dataset, y)
        np.save(out_path + 'raw' + dataset_name + '-' + dataset, raw)


    return X, y, raw, counter

In [25]:
X, y, raw, counter = create_data(wikitext103_path, 'test', save=True)

wikitext-103
processed 0 lines
processed 5000 lines
processed 10000 lines
processed 15000 lines
processed 20000 lines
bad_lines 2380
Save Path ../../data/processed/----wikitext-103-test


In [79]:

# bad_words = ['=', '戦', '場', 'の', 'ヴ', 'ァ', 'ル', 'キ', 'ュ', 'リ', 'ア', '戦場のヴァルキュリア3']
# N = 0
# threshold_length = 3
# iteration_threshold = 30
# bad_lines = 0
# counter = Counter()

# X = []
# y = []

    
# with codecs.open(small_data_path, encoding='utf-8') as f:
#     for line in f:
#         line = line.strip()
#         if line.rstrip():
#             if not any(bad_word in line for bad_word in bad_words):
#                 for sent in sent_tokenize(line):
#                     split_sent = sent.split()
#                     found = True
#                     iteration = 0
#                     while (found == True):
#                         iteration += 1
#                         random_int = random.randint(0, len(split_sent)-1)
#                         target = split_sent[random_int]
#                         if (len(target) >= threshold_length) and (target in available_image_letters):
#                             X.append(split_sent[:random_int] + ['<TARGET>'] + split_sent[random_int+1:])
#                             y.append((target, random_int))
#                             found = False
#                             # get word counts 
#                             for token in split_sent:
#                                 counter[token] += 1
                    
#                         if (iteration >= iteration_threshold):
#                             found = False
#                             bad_lines += 1
#             if N % 1000 == 0:
#                 print('processed {} lines'.format(N))
# #             if N >= 100: break
#             N += 1
            
# bad_lines

processed 0 lines
processed 1000 lines
processed 2000 lines
processed 3000 lines
processed 4000 lines
processed 5000 lines
processed 6000 lines
processed 7000 lines
processed 8000 lines
processed 9000 lines
processed 10000 lines
processed 11000 lines
processed 12000 lines
processed 13000 lines
processed 14000 lines
processed 15000 lines
processed 16000 lines
processed 17000 lines
processed 18000 lines
processed 19000 lines
processed 20000 lines
processed 21000 lines
processed 22000 lines
processed 23000 lines


2382

In [73]:
counter.most_common(10)

[('the', 485),
 (',', 385),
 ('.', 259),
 ('of', 223),
 ('and', 169),
 ('to', 168),
 ('in', 118),
 ('was', 95),
 ('a', 91),
 ('"', 74)]

In [5]:
!ls ../../data/raw/language_model/wikitext-103/wiki.train.tokens

wiki.test.tokens  wiki.train.tokens  wiki.valid.tokens


In [12]:
!head ../../data/raw/language_model/wikitext-103/wiki.train.tokens

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newc