Data source: PyTorch tutorial "NLP From Scratch: Classifying Names with a Character-Level RNN" — https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial

In [2]:
from pathlib import Path
import pandas as pd

In [3]:
# Root directory for the artefacts this notebook writes.
# `expanduser()` resolves `~` in Python, so the cell no longer round-trips
# through `! mkdir` / `! find`, and `data_path` is a `Path` from its first
# assignment instead of changing type (str -> list -> Path) within one cell.
data_path = Path('~/data/tbts_data').expanduser()
# parents/exist_ok mirror `mkdir -p`, so the cell is safe to re-run.
data_path.mkdir(parents=True, exist_ok=True)

In [4]:
# Common file-name stem for the Parquet outputs written by later cells
# (`<stem>.parquet` and `<stem>-labels.parquet`, both under `data_path`).
outputfile_name = 'tbts-names'

In [5]:
# Directory the raw tutorial archive is downloaded into and unpacked in.
# Resolved and created in pure Python: the previous `! find` call listed the
# whole directory tree just to recover the expanded path, and rebound the
# name as str -> list -> str along the way.
# The value is a `Path`; `$path_to_raw_data` shell interpolation and
# `Path(path_to_raw_data)` in later cells both accept it unchanged.
path_to_raw_data = Path('~/data/tbts_data/raw/names/').expanduser()
# parents/exist_ok mirror `mkdir -p`, so the cell is safe to re-run.
path_to_raw_data.mkdir(parents=True, exist_ok=True)

In [6]:

! wget --no-check-certificate -P $path_to_raw_data https://download.pytorch.org/tutorial/data.zip

--2025-09-02 15:02:44--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 18.165.72.122, 18.165.72.111, 18.165.72.41, ...
Connecting to download.pytorch.org (download.pytorch.org)|18.165.72.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘/home/jovyan/data/tbts_data/raw/names/data.zip’


2025-09-02 15:02:46 (1.74 MB/s) - ‘/home/jovyan/data/tbts_data/raw/names/data.zip’ saved [2882130/2882130]



In [12]:
! ls $path_to_raw_data

! unzip $path_to_raw_data/'data.zip' -d $path_to_raw_data


data.zip
Archive:  /home/jovyan/data/tbts_data/raw/names/data.zip
   creating: /home/jovyan/data/tbts_data/raw/names/data/
  inflating: /home/jovyan/data/tbts_data/raw/names/data/eng-fra.txt  
   creating: /home/jovyan/data/tbts_data/raw/names/data/names/
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/Arabic.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/Chinese.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/Czech.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/Dutch.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/English.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/French.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/German.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/Greek.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/Irish.txt  
  inflating: /home/jovyan/data/tbts_data/raw/names/data/names/Italian.t

In [13]:

path_to_raw_data = Path(path_to_raw_data)
path = path_to_raw_data / 'data' / 'names'
! ls $path

Arabic.txt   English.txt  Irish.txt	Polish.txt	Spanish.txt
Chinese.txt  French.txt   Italian.txt	Portuguese.txt	Vietnamese.txt
Czech.txt    German.txt   Japanese.txt	Russian.txt
Dutch.txt    Greek.txt	  Korean.txt	Scottish.txt


In [16]:
def load_labeled_names(
    names_data_path: Path, file_names: list[str]
) -> tuple[list[list[str]], list[str]]:
    """Read name lists and pair every name with the file it came from.

    Each file is read as UTF-8 text with one name per line.

    Args:
        names_data_path: Directory containing the files.
        file_names: File names (relative to ``names_data_path``) to read; the
            result follows this order.

    Returns:
        A ``(name_chars, labels)`` pair of equal-length lists.
        ``name_chars[i]`` is the i-th name split into single characters and
        ``labels[i]`` is the file name that name was read from (unchanged,
        i.e. including any extension).

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    labels: list[str] = []
    name_chars: list[list[str]] = []
    for file_name in file_names:
        with open(names_data_path / file_name, mode="r", encoding="utf-8") as f:
            for line in f.read().strip().split("\n"):
                # Skip blank lines: an empty file (``"".split("\n") == [""]``)
                # or a stray empty line would otherwise add a zero-length
                # name, which a later ``DataFrame.explode`` turns into NaN.
                if not line.strip():
                    continue
                name_chars.append(list(line))
                labels.append(file_name)
    return name_chars, labels

In [17]:
# Collect the name files in a stable (sorted) order and load them into one
# frame: a column of per-name character lists plus the matching label column.
file_names = sorted(txt_file.name for txt_file in path.glob("*.txt"))
ts_column_name = 'name_chars'
name_chars, labels = load_labeled_names(names_data_path=path, file_names=file_names)
df = pd.DataFrame({ts_column_name: name_chars, 'labels': labels})

In [18]:
# Preview where the labels file is going to be written.
data_path.joinpath(f'{outputfile_name}-labels.parquet')

PosixPath('/home/jovyan/data/tbts_data/tbts-names-labels.parquet')

In [30]:

# Persist the labels column on its own, keeping the frame's row index.
df[['labels']].to_parquet(
    path=data_path / f'{outputfile_name}-labels.parquet',
    engine='pyarrow',
)

In [28]:
# Peek at the first rows: each row holds one name's character list and the
# file name it was loaded from.
df.head()

Unnamed: 0,name_chars,labels
0,"[K, h, o, u, r, y]",Arabic.txt
1,"[N, a, h, a, s]",Arabic.txt
2,"[D, a, h, e, r]",Arabic.txt
3,"[G, e, r, g, e, s]",Arabic.txt
4,"[N, a, z, a, r, i]",Arabic.txt


In [32]:
# Last five labels, to check the final file made it into the frame.
df[['labels']].tail(5)

Unnamed: 0,labels
20069,Vietnamese.txt
20070,Vietnamese.txt
20071,Vietnamese.txt
20072,Vietnamese.txt
20073,Vietnamese.txt


In [21]:
# Everything except the label column.
df.drop(columns='labels')

Unnamed: 0,name_chars
0,"[K, h, o, u, r, y]"
1,"[N, a, h, a, s]"
2,"[D, a, h, e, r]"
3,"[G, e, r, g, e, s]"
4,"[N, a, z, a, r, i]"
...,...
20069,"[T, r, u, o, n, g]"
20070,"[V, a, n]"
20071,"[V, i, n, h]"
20072,"[V, u, o, n, g]"


In [22]:
# Long format: one row per character. `G` carries the row index of the name
# in `df` that the character belongs to, so rows sharing a `G` form one name.
df_tbts = (
    df.drop(columns='labels')
    .explode(ts_column_name)
    .reset_index()
    .rename(columns={'index': 'G'})
)
df_tbts.head(10)

Unnamed: 0,G,name_chars
0,0,K
1,0,h
2,0,o
3,0,u
4,0,r
5,0,y
6,1,N
7,1,a
8,1,h
9,1,a


In [23]:

# Persist the per-character frame.
df_tbts.to_parquet(
    path=data_path / f'{outputfile_name}.parquet',
    engine='pyarrow',
)

In [24]:
output_files_path = data_path / Path(f'{outputfile_name}')
! ls -lah $output_files_path*

-rw-r--r-- 1 jovyan users 1.9K Sep  2 15:14 /home/jovyan/data/tbts_data/tbts-names-labels.parquet
-rw-r--r-- 1 jovyan users 388K Sep  2 15:19 /home/jovyan/data/tbts_data/tbts-names.parquet


In [38]:
# Round-trip check: number of distinct `G` groups in the written file.
(
    pd.read_parquet(path=data_path / f'{outputfile_name}.parquet', engine='pyarrow')
    .groupby('G')
    .ngroups
)

20074

In [36]:
# Round-trip check: read the labels file back.
pd.read_parquet(
    path=data_path / f'{outputfile_name}-labels.parquet',
    engine='pyarrow',
)

Unnamed: 0,labels
0,Arabic.txt
1,Arabic.txt
2,Arabic.txt
3,Arabic.txt
4,Arabic.txt
...,...
20069,Vietnamese.txt
20070,Vietnamese.txt
20071,Vietnamese.txt
20072,Vietnamese.txt


In [35]:
# Full path of the labels file, for reference.
data_path.joinpath(f'{outputfile_name}-labels.parquet')

PosixPath('/home/jovyan/data/tbts_data/tbts-names-labels.parquet')