In [None]:
! unzip merged_clean.txt.zip

Dataset (Hugging Face): https://huggingface.co/datasets/merve/folk-mythology-tales/tree/main

In [None]:
! pip install hmmlearn

In [None]:
! head merged_clean.txt

# Hidden Markov Model

In [None]:
import sys, json, codecs, pickle, argparse
import numpy as np
import string

import joblib
from sklearn.preprocessing import LabelEncoder
from hmmlearn import hmm
from nltk import FreqDist

In [None]:
# Read the whole corpus. The context manager closes the file handle as soon as the
# text has been loaded.
with open("./merged_clean.txt", 'r', encoding='utf-8') as corpus_file:
    file = corpus_file.read()

# One entry per non-empty line. The list is built by filtering instead of deleting
# entries while enumerating it: deleting during iteration shifts the remaining items
# left, so the entry right after each removed one is never inspected and runs of
# consecutive blank lines are only partially removed.
sentences = [sentence for sentence in file.split("\n") if sentence != ""]

# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)

In [None]:
# Translation table that deletes every ASCII punctuation character. Built at the top
# of the cell so the cell can be re-run on its own.
table = str.maketrans('', '', string.punctuation)

# Whitespace-tokenise every sentence and strip punctuation from each token. A new
# nested list is built instead of mutating an existing one, so re-running the cell
# always starts from `sentences`.
lines = [
    [token.translate(table) for token in sentence.split()]
    for sentence in sentences
]

# Flatten into a single lower-cased word sequence. Tokens that consisted solely of
# punctuation are empty strings after stripping; they are dropped so that "" does not
# become a vocabulary entry. Punctuation is already gone at this point, so no second
# stripping pass is needed.
words = [token.lower() for line in lines for token in line if token]

# Vocabulary and the word <-> integer code mapping.
alphabet = set(words)
le = LabelEncoder()
le.fit(list(alphabet))

# Integer-encode the corpus and shape it as one column of observations.
seq = le.transform(words)
features = np.asarray(seq, dtype=np.int64).reshape(-1, 1)

# Frequency of every integer code in the encoded corpus.
fd = FreqDist(seq)

In [None]:
# Sanity check of the encoding, one item per line: vocabulary size, the first ten
# words, their integer codes, the matching feature rows, and the entry `fd` holds
# for code 0.
print(len(alphabet), words[:10], seq[:10], features[:10], fd.get(0), sep="\n")

In [None]:
# Three-state HMM with categorical (discrete symbol) emissions. init_params="ste"
# lets hmmlearn initialise the start probabilities, transition matrix and emission
# probabilities itself. A fixed random_state makes that initialisation, and therefore
# the fitted model, repeatable across runs; 42 is the same seed the sampling cell uses.
model = hmm.CategoricalHMM(n_components=3, init_params="ste", random_state=42)

In [None]:
# Show the first two tokenised sentences, then the first two encoded observations.
print(lines[:2], features[:2], sep="\n")

In [None]:
# Fit the HMM on the encoded word sequence; `model` is rebound to whatever fit()
# returns.
model = model.fit(features)

In [None]:
# Draw 20 observations from the fitted HMM with a fixed seed, turn the integer codes
# back into words, and write them out on one line, each followed by a single space.
symbols, _states = model.sample(20, random_state=42)
output = le.inverse_transform(np.squeeze(symbols))
print("".join(f"{word} " for word in output), end="")

# GPT-2

In [None]:
! pip install --force-reinstall transformers[tf-cpu] tensorflow==2.8.0

In [None]:
from transformers import (
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    pipeline)

In [None]:
# Load the tokenizer of the pretrained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 

In [None]:
# Report the tokenizer's vocabulary size and maximum sequence length, then show what
# the tokenizer returns for a sample sentence. (Fixes the "squence" typo in the
# printed text.)
print('vocabulary size: %d, max sequence length: %d' % (tokenizer.vocab_size, tokenizer.model_max_length))
print('tokenize sequence "Once upon a time in a little village":', tokenizer('Once upon a time in a little village'))

In [None]:
# Batch collator for language modelling built on the GPT-2 tokenizer; mlm=False
# switches off masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Local import: torch itself is only imported further down the notebook.
from torch.utils.data import Subset

# Tokenise the corpus once (block_size=128).
full_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="merged_clean.txt",
    block_size=128)

# Hold out the final 10% of the blocks for evaluation. Building the evaluation set
# from the same file with the same settings made it identical to the training set,
# so any evaluation would only have scored data the model was trained on.
# The split is contiguous, so it is deterministic and the leading training examples
# keep their original indices.
EVAL_FRACTION = 0.1
n_total = len(full_dataset)
n_eval = max(1, int(n_total * EVAL_FRACTION)) if n_total > 1 else 0
n_train = n_total - n_eval

train_dataset = Subset(full_dataset, range(n_train))
test_dataset = Subset(full_dataset, range(n_train, n_total))

In [None]:
# Decode the training example at index 5 back into text as a quick sanity check.
print(tokenizer.decode(train_dataset[5]))

In [None]:
# Load the pretrained 'gpt2' checkpoint with a language-modelling head.
# This rebinds `model`, which previously referred to the fitted HMM.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir='data/out',           # where model predictions and checkpoints go
    overwrite_output_dir=True,       # replace existing content of output_dir
    per_device_train_batch_size=4,   # batch size used for training
    per_device_eval_batch_size=4,    # batch size used for evaluation
    learning_rate=5e-5,              # same value as the library default
    num_train_epochs=1,              # number of passes over the training set
)

# Bundle the model, the configuration, both datasets and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [None]:
import torch

# Ask PyTorch to release cached GPU memory it is no longer using; this cell sits
# directly before the training cell.
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop with the Trainer configured earlier.
trainer.train()

In [None]:
import time

# Block the kernel for one hour.
# NOTE(review): this pause sits between training and `trainer.save_model()`, so the
# fine-tuned weights are not written for another hour; confirm it is still needed.
time.sleep(60 * 60)

In [None]:
# Persist the fine-tuned model; no path is given, so the Trainer decides where it is
# written (presumably its configured output_dir, 'data/out' — the next cell loads
# from there).
trainer.save_model()

In [None]:
# Text-generation pipeline that loads the model from 'data/out' (the Trainer's
# output_dir) and pairs it with the stock 'gpt2' tokenizer.
generator = pipeline('text-generation', tokenizer='gpt2', model='data/out')

In [None]:
# Generate a continuation of the prompt (max_length=500) and print the text of the
# first returned result.
print(generator('There is a building', max_length=500)[0]['generated_text'])

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the "DeepESP/gpt2-spanish" checkpoint and its tokenizer. Both `model` and
# `tokenizer` are rebound here and no longer refer to the English GPT-2 objects.
# NOTE(review): the pipeline created in the next cell is given the checkpoint name
# rather than these objects, so nothing visible in this notebook reads them.
model = AutoModelForCausalLM.from_pretrained("DeepESP/gpt2-spanish")

tokenizer = AutoTokenizer.from_pretrained("DeepESP/gpt2-spanish")

In [None]:
# Text-generation pipeline for the Spanish GPT-2 checkpoint, loaded by name for both
# the model and the tokenizer. Rebinds `generator`.
generator = pipeline('text-generation', tokenizer='DeepESP/gpt2-spanish', model='DeepESP/gpt2-spanish')

In [None]:
# Continue a Spanish prompt (max_length=100) and print the text of the first result.
print(generator('Había una vez', max_length=100)[0]['generated_text'])

In [None]:
# Sentiment-analysis pipeline with the checkpoint named explicitly, so the result does
# not depend on whichever default the installed transformers version picks.
# NOTE(review): this is believed to be the checkpoint the library falls back to when
# no model is given — confirm against the installed version.
# Rebinds `generator`, which previously held a text-generation pipeline.
generator = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
# Run the sentiment pipeline on a sample sentence and print what it returns.
print(generator('this is very good'))