In [1]:
import numpy as np
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import SnowballStemmer
import urllib.request
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten, LSTM
from keras.utils.np_utils import to_categorical
import keras
import collections

In [2]:
hu = SnowballStemmer("hungarian")

In [3]:
print(hu.stem("házakban"),hu.stem("játszott"))

ház játszot


In [4]:
fr = SnowballStemmer("french")

In [5]:
print(fr.stem("oeufs"),fr.stem("parlent"))

oeuf parlent


In [6]:
de = SnowballStemmer("german")

In [7]:
print(de.stem("Bücher"),de.stem("gehabt"))

buch gehabt


In [8]:
en = SnowballStemmer('english')

In [9]:
print(en.stem("held"),en.stem("houses"),en.stem("house"))

held hous hous


### regex

In [10]:
s = "almahab"
res = re.search(r'hab',s)
if res:
    print(res.group())

hab


In [11]:
#beginning
res = re.search(r'^hab',"ahabab")
if res:
    print(res.group())

In [12]:
#end
res = re.search(r'hab$',s)
if res:
    print(res.group())

hab


In [13]:
# . matches anything
# * as many as possible but can also be zero
# + as many as possible but at least 1
# {n} exacly n
# ? as few as possible
s = "almahaab"
print(re.sub(r"a.*a","",s)) 
print(re.sub(r"a*","X",s))
print(re.sub(r"a.{2}a","",s))
print(re.sub(r"a.*?a","X",s))
print(re.sub(r"a*m","",s))
print(re.sub(r"a+m","",s))

b
XXlXmXXhXXbX
haab
XhXb
alahaab
almahaab


In [14]:
#find sequences of a
pattern = re.compile(r"a+")
for v in re.finditer(pattern, "almahaab"):
    print(v.group())

a
a
aa


In [15]:
#there can be anything between two a
pattern = re.compile(r"a.*?a")
for v in re.finditer(pattern, "almahaab"):
    print(v.group())

alma
aa


In [16]:
#starts with a or l ends with a and there is at least something in between 
pattern = re.compile(r"[al].+?a")
for v in re.finditer(pattern, "alma labda baa"):
    print(v.group())

alma
labda


In [17]:
#same as before but the last character befor a is not a d
pattern = re.compile(r"[al].+?(?<!d)a")
for v in re.finditer(pattern, "alma labda"):
    print(v.group())

alma


In [18]:
s = "I have 205 euros"
res = re.search(r'[0-9]+',s)
if res:
    print(res.group())

205


In [19]:
s = "The value of pi is 3.14."
res = re.search(r'[0-9]*\.?[0-9]+',s)
if res:
    print(res.group())

3.14


In [20]:
s = "The gravitational constant is 6.77e-11 m3kg−1s−2."
res = re.search(r'[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?',s)
if res:
    print(res.group())

6.77e-11


In [21]:
s = "1 July 2014"
res = re.search(r'[0-9]+ (July|August) [1-9][0-9]{3}',s)
if res:
    print(res.group())
s = "1 August 2014"
res = re.search(r'[0-9]+ (July|August) [1-9][0-9]{3}',s)
if res:
    print(res.group())
s = "1 September 2014"
res = re.search(r'1 ([A-Z][a-z]*) [1-9][0-9]{3}',s)
if res:
    print(res.group())

1 July 2014
1 August 2014
1 September 2014


In [22]:
name1 = "Török János"
re.sub(r"(.*) (.*)",r"\2, \1",name1)

'János, Török'

### TASK 1
Change date format using regex

In [23]:
date = "Easter Sunday this year was 17.04.2022."
#change date to yyyy.mm.dd format
re.sub(r"([0-9]{2})\.([0-9]{2})\.([1-9][0-9]{3})",r"\3.\2.\1",date)

'Easter Sunday this year was 2022.04.17.'

In [24]:
s = "Ez egy mondat. Ez egy másik. Ez pedig egy harmadik. Ma 23. éve nem iszom. "

In [25]:
pattern = re.compile(r"[A-Z].*?\. ")
for v in re.finditer(pattern, s):
    print(v.group(0))

Ez egy mondat. 
Ez egy másik. 
Ez pedig egy harmadik. 
Ma 23. 


In [26]:
hp = """
Mr. and Mrs. Dursley, of number four, Privet Drive, 
were proud to say that they were perfectly normal, 
thank you very much. They were the last people you’d 
expect to be involved in anything strange or 
mysterious, because they just didn’t hold with such 
nonsense. 

Mr. Dursley was the director of a firm called 
Grunnings, which made drills. He was a big, beefy 
man with hardly any neck, although he did have a 
very large mustache. Mrs. Dursley was thin and 
blonde and had nearly twice the usual amount of 
neck, which came in very useful as she spent so 
much of her time craning over garden fences, spying 
on the neighbors. The Dursley s had a small son 
called Dudley and in their opinion there was no finer 
boy anywhere. 

The Dursleys had everything they wanted, but they 
also had a secret, and their greatest fear was that 
somebody would discover it. They didn’t think they 
could bear it if anyone found out about the Potters. 
Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t 
met for several years; in fact, Mrs. Dursley pretended 
she didn’t have a sister, because her sister and her 
good-for-nothing husband were as unDursleyish as it 
was possible to be. The Dursleys shuddered to think 
what the neighbors would say if the Potters arrived in 
the street. The Dursleys knew that the Potters had a 
small son, too, but they had never even seen him. """

In [27]:
#make a continuous text out of it (remove newline)
HP = hp.replace("\n","")
HP

'Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursley s had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several yea

In [28]:
#Finds sentences. The problem is that 'Mr.' looks like the end of a sentence and it isn't.
pattern = re.compile(r"[A-Z].*?(?<!Mr|Dr)(?<!Mrs)\. ")
for v in re.finditer(pattern, HP):
    print(v.group(0))

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. 
They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. 
Mr. Dursley was the director of a firm called Grunnings, which made drills. 
He was a big, beefy man with hardly any neck, although he did have a very large mustache. 
Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. 
The Dursley s had a small son called Dudley and in their opinion there was no finer boy anywhere. 
The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. 
They didn’t think they could bear it if anyone found out about the Potters. 
Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for seve

In [29]:
#Get full HP1
url = "https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt"
f = urllib.request.urlopen(url)
myfile = f.read()

In [30]:
len(myfile)

492161

In [31]:
type(myfile)

bytes

In [32]:
#convert it to utf8, remove newlines, quotation marks and page footer
hp = myfile.decode("utf-8").replace("\n","").replace("”","")
HP = re.sub(r"Page \| .*? Harry Potter and the Philosophers Stone - J.K. Rowling ","",hp)

In [33]:
#Let us see how it worked
pattern = re.compile(r"[A-Z][a-z].*?(?<!Mr|Dr)(?<!Mrs)[\.\?!] ")
i = 0
for v in re.finditer(pattern, HP):
    print("-",v.group(0))
    i += 1
    if i >=10: break

- Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. 
- They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. 
- Mr. Dursley was the director of a firm called Grunnings, which made drills. 
- He was a big, beefy man with hardly any neck, although he did have a very large mustache. 
- Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. 
- The Dursley s had a small son called Dudley and in their opinion there was no finer boy anywhere. 
- The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. 
- They didn’t think they could bear it if anyone found out about the Potters. 
- Mrs. Potter was Mrs. Dursley’s sister, but they h

In [34]:
len(HP)

418304

### TASK 2
Suggestion: create a list from the first 10 sentences in order to avoid extensive outputs, if works you can use an other array containing then all sentences.

Write a loop which
* goes through the sentences
* splits the sentences to words
* removes non-letter characters (w is the actual word):
<pre>
onlylett = re.compile('[^a-zA-Z]')
onlylett.sub('', w)
</pre>
* print the stem of the word

In [35]:
# Get the sentences
pattern = re.compile(r"[A-Z][a-z].*?(?<!Mr|Dr)(?<!Mrs)[\.\?!] ")
sentences = []
for v in re.finditer(pattern, HP):
    sentences.append(v.group(0))

len(sentences)

5969

In [36]:
subset = sentences

In [37]:
def get_sentence_stems(sentence):
  words = []
  sentence_words = re.split(' ', sentence)
  for w in sentence_words: 
    word = onlylett.sub('', w)
    if word == '': continue
    words.append(en.stem(onlylett.sub('', w)))
  return words

In [38]:
words = []

onlylett = re.compile('[^a-zA-Z]')

for sentence in subset:
  words = words + get_sentence_stems(sentence)

words

['mr',
 'and',
 'mrs',
 'dursley',
 'of',
 'number',
 'four',
 'privet',
 'drive',
 'were',
 'proud',
 'to',
 'say',
 'that',
 'they',
 'were',
 'perfect',
 'normal',
 'thank',
 'you',
 'veri',
 'much',
 'they',
 'were',
 'the',
 'last',
 'peopl',
 'youd',
 'expect',
 'to',
 'be',
 'involv',
 'in',
 'anyth',
 'strang',
 'or',
 'mysteri',
 'becaus',
 'they',
 'just',
 'didnt',
 'hold',
 'with',
 'such',
 'nonsens',
 'mr',
 'dursley',
 'was',
 'the',
 'director',
 'of',
 'a',
 'firm',
 'call',
 'grun',
 'which',
 'made',
 'drill',
 'he',
 'was',
 'a',
 'big',
 'beefi',
 'man',
 'with',
 'hard',
 'ani',
 'neck',
 'although',
 'he',
 'did',
 'have',
 'a',
 'veri',
 'larg',
 'mustach',
 'mrs',
 'dursley',
 'was',
 'thin',
 'and',
 'blond',
 'and',
 'had',
 'near',
 'twice',
 'the',
 'usual',
 'amount',
 'of',
 'neck',
 'which',
 'came',
 'in',
 'veri',
 'use',
 'as',
 'she',
 'spent',
 'so',
 'much',
 'of',
 'her',
 'time',
 'crane',
 'over',
 'garden',
 'fenc',
 'spi',
 'on',
 'the',
 'nei

### TASK 3
* now instead of printing the stems collect them in an array
* count the number of occurences of the stems (this can be done in an old fashioned way or using Counter https://pymotw.com/2/collections/counter.html
* sort the stems according to occurence and print the top 250 stems
* create a list with just the words in the above sorted order (the previous method should give you a dictionary, or a list of tuples)

In [39]:
c = collections.Counter(words)
sorted_words = []

for word, _ in c.most_common():
  sorted_words.append(word)

sorted_words

['the',
 'and',
 'to',
 'a',
 'he',
 'harri',
 'it',
 'of',
 'was',
 'his',
 'in',
 'you',
 'had',
 'that',
 'on',
 'at',
 'they',
 'said',
 'as',
 'but',
 'him',
 'ron',
 'look',
 'hagrid',
 'with',
 'all',
 'be',
 'what',
 'up',
 'i',
 'for',
 'out',
 'were',
 'there',
 'them',
 'have',
 'hermion',
 'back',
 'one',
 'go',
 'this',
 'if',
 'so',
 'from',
 'get',
 'not',
 'into',
 'me',
 'an',
 'about',
 'like',
 'been',
 'their',
 'she',
 'off',
 'could',
 'no',
 'didnt',
 'do',
 'your',
 'snape',
 'down',
 'know',
 'got',
 'her',
 'professor',
 'over',
 'see',
 'now',
 'just',
 'is',
 'when',
 'veri',
 'dumbledor',
 'tri',
 'then',
 'are',
 'who',
 'dudley',
 'we',
 'by',
 'around',
 'hed',
 'well',
 'time',
 'how',
 'malfoy',
 'say',
 'come',
 'someth',
 'head',
 'right',
 'uncl',
 'eye',
 'dont',
 'vernon',
 'potter',
 'nevill',
 'quirrel',
 'turn',
 'even',
 'door',
 'hand',
 'dursley',
 'want',
 'where',
 'never',
 'onli',
 'face',
 'through',
 'think',
 'or',
 'gryffindor',
 'fi

### TASK 4
* the following routine returns the position of the stem in the sorted list of the words (please note that it works with a list of words, wheras sorting the result of Counter results in a dictionary
 - w is the stem
 - sortedstemlist is the sorted list of stems
 - lim is a limit (integer), is a stem is lower in the rank than lim the function returns lim
* Try it with lim=250 and words in and otside the top 250 (e.g. "hogwart", and above 250 e.g. "stupid")

We need it to give words numbers. 250 will be the number of an unimportant word

In [40]:
def word_code(w,sortedstemlist,lim):
    if w in sortedstemlist[:lim]:
        return sortedstemlist.index(w)
    else:
        return(lim)

In [41]:
word_code('drgbgive', sorted_words, 250)

250

### TASK 5
Create lists to train data
* For each sentence go through the words with an index <code>i</code>
* Generate word indices for words <code>i-2,i-1,i,i+1,i+2</code>, of course if index is out of range the result should be lim=250. The resulting indices are <code>i0,i1,i2,i3,i4</code>
* (This is what you should do: For each word <code>i</code> remove the nonletter characters, get the stem then encode it with the word_code function, which returns <code>i0,i1,i2,i3,i4</code>)
* Create an input list and an output list:
<pre>
if i2 < 250:
    Xl.append([i0,i1,i3,i4])
    yl.append(i2)
</pre>
(optional) You can use new indices for beginning and end of sentence, and nonexistent

In [42]:
Xl = []
yl = []

for sentence in subset:
  sentence_words = get_sentence_stems(sentence)
  for i in range(2, len(sentence_words) - 2):
    i0, i1, i2, i3, i4 = [word_code(w, sorted_words, 250) for w in sentence_words[i - 2: i + 3]]
    if i2 < 250:
      Xl.append([i0,i1,i3,i4])
      yl.append(i2)

### TASK 6
Create a one or two layer dense network, input size is (4, 251), output size is 250, compile. Use flatten before the last layer.

In [43]:
model = models.Sequential()
model.add(Dense(units = 100, input_shape = (4, 251), activation = 'relu'))
model.add(Flatten())
model.add(Dense(units = 250, activation = 'softmax'))
model.compile(loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4, 100)            25200     
                                                                 
 flatten (Flatten)           (None, 400)               0         
                                                                 
 dense_1 (Dense)             (None, 250)               100250    
                                                                 
Total params: 125,450
Trainable params: 125,450
Non-trainable params: 0
_________________________________________________________________


### TASK 7
* convert our list to one hot encoding <code>to_categorical(Xl), to_categorical(yl)</code>
* train

In [44]:
X = to_categorical(Xl)
Y = to_categorical(yl)

model.fit(X, Y, batch_size = 40, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f45bf863d10>

### TASK 8
verify

In [45]:
s = ["The", "Dursleys", "had", "everything", "they"]
a = [ word_code(en.stem(onlylett.sub('', s[0])),sorted_words,250),
     word_code(en.stem(onlylett.sub('', s[1])),sorted_words,250),
     word_code(en.stem(onlylett.sub('', s[3])),sorted_words,250),
     word_code(en.stem(onlylett.sub('', s[4])),sorted_words,250)]
b = to_categorical(a)
res = model.predict(np.array([b]))



In [46]:
np.array(sorted_words)[(-res[0]).argsort()[:5]]

array(['and', 'was', 'had', 'of', 'the'], dtype='<U16')

In [47]:
sorted(res[0],reverse=True)[:5]

[0.20789923, 0.19742687, 0.08706977, 0.0725936, 0.020993423]

## Task 9
 Predict next word:
 * The input data should be $L$ successive words. This time remove words with no code.
 * The target is the next word
 * Use an LSTM network to predict the next word

In [59]:
L = 5

trainX = []
trainY = []

for sentence in subset:
  sentence_words = get_sentence_stems(sentence)

  # Get the codes under 250 of the sentence
  sentence_codes = list(filter(lambda code: code < 250, [word_code(w, sorted_words, 250) for w in sentence_words]))

  for i in range(L, len(sentence_codes)):
    trainX.append(sentence_codes[i - L: i])
    trainY.append(sentence_codes[i])

print(trainX[:10])
print(trainY[:10])

[[140, 1, 103, 7, 32], [1, 103, 7, 32, 2], [103, 7, 32, 2, 87], [7, 32, 2, 87, 13], [32, 2, 87, 13, 16], [2, 87, 13, 16, 32], [87, 13, 16, 32, 11], [13, 16, 32, 11, 72], [16, 32, 0, 144, 135], [32, 0, 144, 135, 2]]
[2, 87, 13, 16, 32, 11, 72, 162, 2, 26]


In [60]:
np.array(sorted_words)[trainX[0]]

array(['mr', 'and', 'dursley', 'of', 'were'], dtype='<U16')

In [61]:
np.array(sorted_words)[trainY[0]]

'to'

In [62]:
model = models.Sequential()
model.add(LSTM(120, input_shape=(L, 250)))
model.add(Dense(units = 250, activation = 'softmax'))
model.compile(loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 120)               178080    
                                                                 
 dense_4 (Dense)             (None, 250)               30250     
                                                                 
Total params: 208,330
Trainable params: 208,330
Non-trainable params: 0
_________________________________________________________________


In [63]:
X = to_categorical(trainX)
Y = to_categorical(trainY)

model.fit(X, Y, batch_size = 40, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f45bbee3890>

In [64]:
s = ['mr', 'and', 'dursley', 'of', 'were', 'to']
a = [ word_code(en.stem(onlylett.sub('', s[0])),sorted_words,250),
     word_code(en.stem(onlylett.sub('', s[1])),sorted_words,250),
     word_code(en.stem(onlylett.sub('', s[2])),sorted_words,250),
     word_code(en.stem(onlylett.sub('', s[3])),sorted_words,250), 
     word_code(en.stem(onlylett.sub('', s[4])),sorted_words,250)]
b = to_categorical(a, num_classes = 250)
res = model.predict(np.array([b]))



In [65]:
np.array(sorted_words)[(-res[0]).argsort()[:5]]

array(['to', 'the', 'in', 'and', 'on'], dtype='<U16')

In [66]:
sorted(res[0],reverse=True)[:5]

[0.18678398, 0.13137503, 0.05326851, 0.052650973, 0.04327776]

The accuracy with the LSTM model was mostly similar to the previous, but with some more epochs it would increase more than the other one. As we can see from the example above, with L = 5, the model can successfully predict the word `to` given the 5 previous words.