# 2. Small-scale ChatGPT - generování textu pomocí neuronových sítí

!pip install -r requirements.txt

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch

In [2]:
# Device-agnostic setup: use the GPU when PyTorch reports CUDA support,
# otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
#device = 'cpu'  # Force CPU for this example

In [4]:
# Build a torch.device object from the device string chosen above and display it.
torch.device(device)

device(type='cuda')

# Generování jmen pomocí jednoduché RNN

## Příprava dat

Databáze evropských jmen a příjmení: https://data.europa.eu/data/datasets/5bc35259634f41122d982759?locale=cs

Data jsou uložena ve složce `datasets/SeznamJmenCR` (relativní cesta, stejná jako v kódu níže).

In [138]:
import glob
import pandas as pd

# glob returns files in filesystem-dependent order; sorting keeps the order of
# the names (and every index looked up later) reproducible between runs.
csv_files = sorted(glob.glob('datasets/SeznamJmenCR/Seznam_muzskych_jmen*.csv'))
dfs = [pd.read_csv(f, encoding='utf-8') for f in csv_files]
print(f"Loaded {len(dfs)} CSV files.")

# Gather one array per file and join them once at the end instead of
# re-copying the growing array on every iteration. The empty object-typed seed
# keeps the result a 1-D NumPy array even when no file matched the pattern.
jmena_parts = [np.array([], dtype=object)]

for df in dfs:
    # df[2:] skips the first two rows of each table
    # (presumably header/legend rows - TODO confirm against the CSV files).
    jmena = df[2:].values.flatten()
    # Drop the NaN padding cells left by columns of unequal length.
    jmena_parts.append(jmena[~pd.isna(jmena)])

jmena_muz_all = np.concatenate(jmena_parts)

jmena_muz_all.shape, jmena_muz_all

Loaded 5 CSV files.


((3063,),
 array(['AADAR', 'BACH', 'CADEN', ..., 'TYMUR', 'TYRIAN', 'TYSON'],
       shape=(3063,), dtype=object))

In [139]:
# glob returns files in filesystem-dependent order; sorting keeps the order of
# the names (and every index looked up later) reproducible between runs.
csv_files = sorted(glob.glob('datasets/SeznamJmenCR/Seznam_zenskych_jmen*.csv'))
dfs = [pd.read_csv(f, encoding='utf-8') for f in csv_files]
print(f"Loaded {len(dfs)} CSV files.")

# Gather one array per file and join them once at the end instead of
# re-copying the growing array on every iteration. The empty object-typed seed
# keeps the result a 1-D NumPy array even when no file matched the pattern.
jmena_parts = [np.array([], dtype=object)]

for df in dfs:
    # df[2:] skips the first two rows of each table
    # (presumably header/legend rows - TODO confirm against the CSV files).
    jmena = df[2:].values.flatten()
    # Drop the NaN padding cells left by columns of unequal length.
    jmena_parts.append(jmena[~pd.isna(jmena)])

jmena_zen_all = np.concatenate(jmena_parts)

jmena_zen_all.shape, jmena_zen_all

Loaded 5 CSV files.


((3937,),
 array(['AALIYAH', 'BABETA', 'CAITIR', ..., 'VRATISLAVA', 'VRINDAVANI',
        'VRONA'], shape=(3937,), dtype=object))

In [140]:
# glob returns files in filesystem-dependent order; sorting keeps the order of
# the names (and every index looked up later) reproducible between runs.
csv_files = sorted(glob.glob('datasets/SeznamJmenCR/Seznam_rodove_neutralnich_jmen*.csv'))
dfs = [pd.read_csv(f, encoding='utf-8') for f in csv_files]
print(f"Loaded {len(dfs)} CSV files.")

# Gather one array per file and join them once at the end instead of
# re-copying the growing array on every iteration. The empty object-typed seed
# keeps the result a 1-D NumPy array even when no file matched the pattern.
jmena_parts = [np.array([], dtype=object)]

for df in dfs:
    # df[2:] skips the first two rows of each table
    # (presumably header/legend rows - TODO confirm against the CSV files).
    jmena = df[2:].values.flatten()
    # Drop the NaN padding cells left by columns of unequal length.
    jmena_parts.append(jmena[~pd.isna(jmena)])

jmena_rod_all = np.concatenate(jmena_parts)

jmena_rod_all.shape, jmena_rod_all

Loaded 5 CSV files.


((4296,),
 array(['AAGTE', 'BAAIKE', 'CAELESTIS', ..., 'TYRESE', 'TYSK', 'TYSKE'],
       shape=(4296,), dtype=object))

In [141]:
# Stack the three per-category name arrays end to end into one flat array.
name_groups = [jmena_muz_all, jmena_zen_all, jmena_rod_all]
jmena_all = np.concatenate(name_groups)
jmena_all.shape

(11296,)

In [142]:
# Number of distinct names; a value smaller than len(jmena_all) means some names are repeated.
np.unique(jmena_all).shape

(11162,)

In [143]:
# Peek at the first entry to see what a raw name looks like.
jmena_all[0]

'AADAR'

In [144]:
# Find where the raw entry 'SARA **' (a name with trailing marker characters)
# sits in the merged array; an empty result means it is not present.
is_match = np.asarray(jmena_all == 'SARA **')
positions = is_match.nonzero()[0]
print(positions)

[10100]


In [185]:
# Load the combined open-data export that holds all names in a single CSV.
dfs = pd.read_csv('datasets/SeznamJmenCR/OpenData_-_Seznam_jmen_k_2025-05-31.csv', encoding='utf-8')
# First column -> jmena_typ2, second column -> jmena_all2
# (presumably the name type and the name itself - verify against the CSV header).
jmena_typ2, jmena_all2 = dfs.iloc[:, 0], dfs.iloc[:, 1]
jmena_all2.shape

(11331,)

In [194]:
# De-duplicate the names. np.unique returns them sorted and, with
# return_index=True, the position of the first occurrence of each unique name.
jmena_all2, idx = np.unique(jmena_all2, return_index=True)
# idx holds positions, so select by position with .iloc instead of by label.
# Resetting the index keeps jmena_typ2 aligned with jmena_all2 and makes the
# cell safe to re-run (label-based jmena_typ2[idx] would then look up labels
# that no longer match the positions).
jmena_typ2 = jmena_typ2.iloc[idx].reset_index(drop=True)
jmena_all2.shape, jmena_typ2.shape

((11172,), (11172,))

### Vyčištění speciálních znaků (z každého jména se ponechá jen první slovo)

In [195]:
import string

# Characters to strip from every name (digits, punctuation and other ASCII symbols).
# NOTE: despite its name, `vocab` is a strip-list at this point, not the model vocabulary.
vocab = list('`1234567890-=[];\',./*-+.~!@#$%^&*()_+{}:"|<>?')

# Translation table that deletes every character listed in `vocab`.
strip_table = str.maketrans('', '', ''.join(vocab))

# Strip the unwanted characters, then split on whitespace; one token list per name.
token_lists = (str(raw_name).translate(strip_table).split() for raw_name in jmena_all2)

# Keep only the first token of each name; names with nothing left become ''.
jmena_all3 = np.array([tokens[0] if tokens else '' for tokens in token_lists])
len(jmena_all3)

11172

In [196]:
# Check that the raw entry 'SARA **' no longer occurs after cleaning
# (an empty result means it is gone).
is_match = np.asarray(jmena_all3 == 'SARA **')
positions = is_match.nonzero()[0]
print(positions)

[]


In [207]:
# Sorted set of characters used by the cleaned names; joining the names with
# '\n' also puts the newline separator into that set.
vocab = sorted({*'\n'.join(jmena_all3)})

# '$' goes last as the special starting character
vocab += ['$']

print(f'{len(vocab)} unique characters')
print(vocab)

63 unique characters
['\n', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'i', 'Á', 'Â', 'Ä', 'Æ', 'Ç', 'É', 'Ë', 'Í', 'Ï', 'Ó', 'Ô', 'Ö', 'Ø', 'Ú', 'Û', 'Ü', 'Ý', 'Ć', 'Č', 'Ď', 'Ě', 'Ľ', 'Ł', 'Ň', 'Ř', 'Ş', 'Š', 'Ť', 'Ů', 'Ű', 'Ź', 'Ż', 'Ž', 'ʼ', '$']


In [208]:
# Size of the one-hot vectors = number of vocabulary entries.
n_letters = len(vocab)
# Vocabulary flattened into one string: a character's position is its index.
allowed_characters = ''.join(vocab)
print(allowed_characters)


ABCDEFGHIJKLMNOPQRSTUVWXYZiÁÂÄÆÇÉËÍÏÓÔÖØÚÛÜÝĆČĎĚĽŁŇŘŞŠŤŮŰŹŻŽʼ$


In [209]:
# Find the index of a letter in the vocabulary string (allowed_characters by default)
def letterToIndex(letter, alphabet=None):
    """Return the position of `letter` in the vocabulary string.

    Args:
        letter: the character to look up.
        alphabet: optional vocabulary string; defaults to the global
            `allowed_characters`.

    Returns:
        The index of `letter`. For an unknown letter, the index of the
        out-of-vocabulary character "_" if the vocabulary contains it,
        otherwise -1.
    """
    chars = allowed_characters if alphabet is None else alphabet
    index = chars.find(letter)
    if index == -1:
        # Unknown letter: fall back to "_" (this is again -1 when "_" is not in the vocabulary).
        index = chars.find("_")
    return index

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line, alphabet=None):
    """One-hot encode `line`, one row per character.

    Args:
        line: the string to encode.
        alphabet: optional vocabulary string; defaults to the global
            `allowed_characters` / `n_letters`.

    Returns:
        A float tensor of shape (len(line), 1, vocabulary size). Characters
        that have no index in the vocabulary are left as all-zero rows.
    """
    width = n_letters if alphabet is None else len(alphabet)
    tensor = torch.zeros(len(line), 1, width)
    for li, letter in enumerate(line):
        index = letterToIndex(letter, alphabet)
        # A negative index would silently mark the LAST vocabulary entry,
        # so unknown characters are skipped and stay all-zero instead.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor

In [212]:
# Quick check: one-hot encode a three-letter name (the result has one row per letter).
lineToTensor('JAN')

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])

# Generování textu pomocí character-level RNN

## Příprava dat

In [None]:
import os

url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = 'datasets/shakespeare.txt'

# Download only once; later runs reuse the local copy.
if not os.path.exists(path_to_file):
    # Make sure the target folder exists before the download writes into it.
    os.makedirs(os.path.dirname(path_to_file), exist_ok=True)
    torch.hub.download_url_to_file(url, path_to_file)

100.0%


### Prozkoumání textu

In [8]:
# Read the raw bytes and decode them as UTF-8; the context manager closes
# the file handle as soon as the content has been read.
with open(path_to_file, 'rb') as f:
    text = f.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [9]:
# Print the first 250 characters as a quick sanity check of the loaded text.
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [43]:
# Collect every distinct character of the corpus and put them in sorted order.
vocab = list(set(text))
vocab.sort()
print(f'{len(vocab)} unique characters')
print(vocab)

65 unique characters
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


### Zpracování textu na tokeny

In [None]:
# Size of the one-hot vectors = number of distinct characters in the corpus.
n_letters = len(vocab)
# Vocabulary flattened into one string: a character's position is its index.
allowed_characters = ''.join(vocab)
print(allowed_characters)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [45]:
# Find the index of a letter in the vocabulary string (allowed_characters by default)
def letterToIndex(letter, alphabet=None):
    """Return the position of `letter` in the vocabulary string.

    Args:
        letter: the character to look up.
        alphabet: optional vocabulary string; defaults to the global
            `allowed_characters`.

    Returns:
        The index of `letter`. For an unknown letter, the index of the
        out-of-vocabulary character "_" if the vocabulary contains it,
        otherwise -1.
    """
    chars = allowed_characters if alphabet is None else alphabet
    index = chars.find(letter)
    if index == -1:
        # Unknown letter: fall back to "_" (this is again -1 when "_" is not in the vocabulary).
        index = chars.find("_")
    return index

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line, alphabet=None):
    """One-hot encode `line`, one row per character.

    Args:
        line: the string to encode.
        alphabet: optional vocabulary string; defaults to the global
            `allowed_characters` / `n_letters`.

    Returns:
        A float tensor of shape (len(line), 1, vocabulary size). Characters
        that have no index in the vocabulary are left as all-zero rows.
    """
    width = n_letters if alphabet is None else len(alphabet)
    tensor = torch.zeros(len(line), 1, width)
    for li, letter in enumerate(line):
        index = letterToIndex(letter, alphabet)
        # A negative index would silently mark the LAST vocabulary entry,
        # so unknown characters are skipped and stay all-zero instead.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor

## Zdroje

https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

https://www.tensorflow.org/text/tutorials/text_generation