In [1]:
import numpy as np
import torch
# Keep printed tensors compact: tensors with more than `threshold` elements are
# summarized, showing only `edgeitems` entries at each end of every dimension.
torch.set_printoptions(edgeitems=2, threshold=50)

In [2]:
# Read the whole text file into a single string, decoding it as UTF-8.
# The path is relative to the notebook's working directory.
with open('../data/p1ch4/jane-austen/1342-0.txt', encoding='utf8') as f:
    text = f.read()

In [3]:
# Split the text on newline characters and pick the line at index 200 as the
# working sample used by the cells below.
lines = text.split('\n')
line = lines[200]
line  # bare expression: displays the selected line as the cell output

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [4]:
# Allocate an all-zero matrix with one row per character of `line` and 128
# columns, i.e. one column for each code point in the range 0-127.
letter_t = torch.zeros(len(line), 128) # <1>
letter_t.shape

torch.Size([70, 128])

In [5]:
# One-hot encode the lower-cased, whitespace-stripped line: row i gets a 1 in
# the column given by the character's code point. Characters whose code point
# is 128 or above (e.g. typographic quotes) all fall back to column 0.
for i, letter in enumerate(line.lower().strip()):
    code_point = ord(letter)
    letter_index = code_point if code_point < 128 else 0  # <1>
    letter_t[i, letter_index] = 1

In [6]:
def clean_words(input_str):
    """Lower-case `input_str` and split it into punctuation-trimmed words.

    Newlines are treated as spaces, the text is split on whitespace, and the
    characters in `punctuation` are stripped from both ends of every token.
    Characters not listed there (apostrophes, parentheses, ...) are kept, and
    a token made up only of punctuation (such as '--') becomes an empty string
    rather than being dropped.

    Returns:
        list of str, one entry per whitespace-separated token, in order.
    """
    punctuation = '.,;:"!?”“_-'
    flattened = input_str.lower().replace('\n', ' ')
    return [token.strip(punctuation) for token in flattened.split()]

# Tokenize the sample line, then display the raw line next to its cleaned
# word list for comparison.
words_in_line = clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [34]:
# Build the vocabulary: every distinct cleaned token in the text, in sorted
# order, mapped to its position in that ordering.
word_list = sorted(set(clean_words(text)))
word2index_dict = dict(zip(word_list, range(len(word_list))))

# Show the vocabulary size, the index of one known word, and the mapping itself.
len(word2index_dict), word2index_dict['impossible'], word2index_dict

(7261,
 3394,
 {'': 0,
  '#1342]': 1,
  '$5,000)': 2,
  "'_she": 3,
  "'after": 4,
  "'ah": 5,
  "'as-is'": 6,
  "'bingley": 7,
  "'had": 8,
  "'having": 9,
  "'i": 10,
  "'keep": 11,
  "'lady": 12,
  "'lately": 13,
  "'lydia": 14,
  "'mr": 15,
  "'my": 16,
  "'oh": 17,
  "'s": 18,
  "'this": 19,
  "'tis": 20,
  "'violently": 21,
  "'yes,'": 22,
  "'you": 23,
  '($1': 24,
  '(801)': 25,
  '(a)': 26,
  '(an': 27,
  '(and': 28,
  '(any': 29,
  '(available': 30,
  '(b)': 31,
  '(by': 32,
  '(c)': 33,
  '(comparatively': 34,
  '(does': 35,
  '(for': 36,
  '(glancing': 37,
  '(if': 38,
  '(lady': 39,
  '(like': 40,
  '(most': 41,
  '(my': 42,
  '(or': 43,
  '(trademark/copyright)': 44,
  '(unasked': 45,
  '(what': 46,
  '(who': 47,
  '(www.gutenberg.org)': 48,
  '(“the': 49,
  '*': 50,
  '***': 51,
  '*****': 52,
  '1': 53,
  '1.a': 54,
  '1.b': 55,
  '1.c': 56,
  '1.d': 57,
  '1.e': 58,
  '1.e.1': 59,
  '1.e.2': 60,
  '1.e.3': 61,
  '1.e.4': 62,
  '1.e.5': 63,
  '1.e.6': 64,
  '1.e.7': 65,

In [30]:
# One-hot encode the sample line at word level: one row per word in the line,
# one column per vocabulary entry.
word_t = torch.zeros(len(words_in_line), len(word2index_dict))
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1
    # Log position, vocabulary index and the word itself.
    print(f'{i:2} {word_index:4} {word}')

print(word_t.shape)


 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
torch.Size([11, 7261])


In [31]:
# Insert a singleton axis at position 1, turning (words, vocab) into
# (words, 1, vocab). The guard keeps the cell idempotent: because the result is
# assigned back to `word_t`, an unguarded unsqueeze would add one more axis on
# every re-run of this cell.
if word_t.dim() == 2:
    word_t = word_t.unsqueeze(1)
word_t.shape

torch.Size([11, 1, 7261])

In [36]:
# List every distinct character in the text, in sorted order, paired with its
# code point.
distinct_chars = sorted(set(text))
[(char, ord(char)) for char in distinct_chars]


[('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),
 ('\n', 10),

In [11]:
# Code point of lowercase 'l'.
ord('l')

108