# SNLP 2021 Final Project

Name 1: Lukas Wilde<br/> 
Student id 1: 2564597<br/>
Email 1: s8luwild@stud.uni-saarland.de<br/>


Name 2: Dennis Heß<br/>
Student id 2: 2574005<br/>
Email 2: s8dehess@stud.uni-saarland.de<br/> 

In [1]:
import sentencepiece as spm
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import os
import random

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lukas/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [142]:
%cd ~/dev/snlp_project

/home/lukas/dev/snlp_project


In [112]:
random.seed(42)

PATH = "./data"

tokenizer = RegexpTokenizer(r"\w+")

for name in ['alice_in_wonderland.txt', 'bengali_corpus.txt']:
    file = os.path.join(PATH, name)
    with open(file, 'r') as f:
        
        # lowercase input
        x = f.read().lower().replace("'", "")
        
        total_symbols = len(x)
        
        # tokenize sentences
        sentences = sent_tokenize(x)
        
        # remove additional whitespaces
        sentences = list(map(lambda x: " ".join(x.split()), sentences))
        
        # remove punctuation (only in english corpus, in bengali words are strangely split up by this)
        if name == 'alice_in_wonderland.txt':
            temp = []
            for sent in sentences:
                words = word_tokenize(sent)
                modified_words = tokenizer.tokenize(' '.join(words))
                temp.append(' '.join(modified_words))

            sentences = temp
        
        # remove stopwords
        temp = []
        for sent in sentences:
            temp.append([])
            for word in sent.split():
                if word not in (stopwords.words('english')):
                    temp[-1].append(word)
        
        sentences = list(map(lambda x: " ".join(x), temp))
        
        num_sentences = len(sentences)
        train_set = sentences
        
        # shuffle sentences
        random.shuffle(train_set)
        
        test_set = []
        test_symbols = 0
        
        # get as many sentences in train set, until there are only 80 % of symbols in this set
        while test_symbols < 0.2 * total_symbols:
            idx = random.randint(0, len(train_set)-1)
            test_sentence = train_set.pop(idx)
            test_symbols += len(test_sentence)
            test_set.append(test_sentence)
                
        print("train-test-split: %f %f" % (1 - test_symbols/total_symbols, test_symbols/total_symbols))
        
        # write contents to file
        for output in ['train.txt', 'test.txt']:
            output_file = os.path.join(PATH, ("en_" if name == 'alice_in_wonderland.txt' else 'bn_') + output)
            with open(output_file, 'w') as out:
                out.write("\n".join(train_set) if output == 'train.txt' else "\n".join(test_set))

train-test-split: 0.799948 0.200052
train-test-split: 0.799757 0.200243


In [33]:
with open('data/en_train.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
    
with open('data/en_test.txt', 'r') as f:
    test = f.read().replace('\n', ' ')

In [22]:
# English on character level

spm.SentencePieceTrainer.Train(f'--input=data/en_train.txt --model_prefix=en_chars --vocab_size=33 --character_coverage=1.0 --model_type=bpe')

sp = spm.SentencePieceProcessor()
sp.load('en_chars.model')

encoded_train = sp.EncodeAsPieces(text)
encoded_test = sp.EncodeAsPieces(test)

with open('en_s1.txt', 'w') as f:
    f.write(' '.join(encoded_train))
    
with open('en_test1.txt', 'w') as f:
    f.write(' '.join(encoded_test))

In [23]:
# English on smaller sub units

spm.SentencePieceTrainer.Train(f'--input=data/en_train.txt --model_prefix=en_smaller --vocab_size=250 --character_coverage=1.0 --model_type=bpe')

sp = spm.SentencePieceProcessor()
sp.load('en_smaller.model')

encoded_train = sp.EncodeAsPieces(text)
encoded_test = sp.EncodeAsPieces(test)

with open(f'en_s2.txt', 'w') as f:
    f.write(' '.join(encoded_train))

with open(f'en_test2.txt', 'w') as f:
    f.write(' '.join(encoded_test))

In [96]:
# English on larger sub units

spm.SentencePieceTrainer.Train(f'--input=data/en_train.txt --model_prefix=en_larger --vocab_size=1500 --character_coverage=1.0 --model_type=bpe')

sp = spm.SentencePieceProcessor()
sp.load('en_larger.model')

encoded_train = sp.EncodeAsPieces(text)
encoded_test = sp.EncodeAsPieces(test)

with open(f'en_s3.txt', 'w') as f:
    f.write(' '.join(encoded_train))

with open(f'en_test3.txt', 'w') as f:
    f.write(' '.join(encoded_test))

In [143]:
with open('data/bn_train.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
    
with open('data/bn_test.txt', 'r') as f:
    test = f.read().replace('\n', ' ')

In [118]:
# Bengali on character level

spm.SentencePieceTrainer.Train(f'--input=data/bn_train.txt --model_prefix=bn_chars --vocab_size=71 --character_coverage=0.995 --model_type=bpe')

sp = spm.SentencePieceProcessor()
sp.load('bn_chars.model')

encoded_train = sp.EncodeAsPieces(text)
encoded_test = sp.EncodeAsPieces(test)

with open('bn_s1.txt', 'w') as f:
    f.write(' '.join(encoded_train))
    
with open('bn_test1.txt', 'w') as f:
    f.write(' '.join(encoded_test))

In [131]:
# Bengali on smaller sub units

spm.SentencePieceTrainer.Train(f'--input=data/bn_train.txt --model_prefix=bn_smaller --vocab_size=250 --character_coverage=0.995 --model_type=bpe')

sp = spm.SentencePieceProcessor()
sp.load('bn_smaller.model')

encoded_train = sp.EncodeAsPieces(text)
encoded_test = sp.EncodeAsPieces(test)

with open(f'bn_s2.txt', 'w') as f:
    f.write(' '.join(encoded_train))

with open(f'bn_test2.txt', 'w') as f:
    f.write(' '.join(encoded_test))

In [144]:
# Bengali on larger sub units

spm.SentencePieceTrainer.Train(f'--input=data/bn_train.txt --model_prefix=bn_larger --vocab_size=1500 --character_coverage=0.995 --model_type=bpe')

sp = spm.SentencePieceProcessor()
sp.load('bn_larger.model')

encoded_train = sp.EncodeAsPieces(text)
encoded_test = sp.EncodeAsPieces(test)

with open('bn_s3.txt', 'w') as f:
    f.write(' '.join(encoded_train))
    
with open('bn_test3.txt', 'w') as f:
    f.write(' '.join(encoded_test))

# Analysis for Ex. 3




In [3]:
%cd models

/home/lukas/dev/snlp_project/models


In [5]:
# change train_file, test_file to ../en_sx.txt and /en_testx.txt or ../bn_sx.txt and ../bn_testx.txt

train_file, test_file = '../bn_s3.txt', '../bn_test3.txt'

vocab, hidden, bptt, _class = 1972, 300, 10, 1

command = f"""../rnnlm/rnnlm \
-train {train_file} \
-valid {test_file} \
-rnnlm model_{vocab}_{hidden}_{bptt}_{_class} \
-hidden {hidden} \
-rand-seed 1 \
-debug 2 \
-bptt {bptt} \
-class {_class}"""

with open('rnnlm.sh', 'w') as f:
    f.write(command)
    
!bash rnnlm.sh

debug mode: 2
train file: ../bn_s3.txt
valid file: ../bn_test3.txt
class size: 1
Hidden layer size: 300
BPTT: 10
Rand seed: 1
rnnlm file: model_1972_300_10_1
Starting training using file ../bn_s3.txt
Vocab size: 1972
Words in train file: 396857
Iter:   0	Alpha: 0.100000	   TRAIN entropy: 9.3581    Words/sec: 1806.1   VALID entropy: 8.8129
Iter:   1	Alpha: 0.100000	   TRAIN entropy: 8.2916    Words/sec: 1815.7   VALID entropy: 8.1537
Iter:   2	Alpha: 0.100000	   TRAIN entropy: 7.8143    Words/sec: 1814.7   VALID entropy: 7.9017
Iter:   3	Alpha: 0.100000	   TRAIN entropy: 7.5736    Words/sec: 1808.4   VALID entropy: 7.7782
Iter:   4	Alpha: 0.100000	   TRAIN entropy: 7.4128    Words/sec: 1802.8   VALID entropy: 7.7054
Iter:   5	Alpha: 0.100000	   TRAIN entropy: 7.2892    Words/sec: 1814.2   VALID entropy: 7.6533
Iter:   6	Alpha: 0.100000	   TRAIN entropy: 7.1864    Words/sec: 1814.3   VALID entropy: 7.6174
Iter:   7	Alpha: 0.100000	   TRAIN entropy: 7.0964    Words/sec: 1814.5   VALID ent

# Analysis for Ex. 4

In [None]:
rnnlm = 'en_s'