In [2]:
import warnings
# Suppress every FutureWarning for the rest of the session so cell outputs stay readable.
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import dask
from dask import dataframe as dd
from dask import array as da

# "data" directory located beside the current working directory, i.e. <cwd>/../data.
data_path = Path.cwd().parent.joinpath("data")

### Source
From: <https://www.youtube.com/watch?v=RlIiVeig3hc&t=269s>

### Show installed versions

In [4]:
# Print the installed pandas version together with its dependency and system details.
pd.show_versions()


INSTALLED VERSIONS
------------------
commit                : d9cdd2ee5a58015ef6f4d15c7226110c9aab8140
python                : 3.12.2.final.0
python-bits           : 64
OS                    : Linux
OS-release            : 5.14.0-427.33.1.el9_4.x86_64
Version               : #1 SMP PREEMPT_DYNAMIC Wed Aug 28 17:34:59 UTC 2024
machine               : x86_64
processor             : x86_64
byteorder             : little
LC_ALL                : None
LANG                  : en_US.UTF-8
LOCALE                : en_US.UTF-8

pandas                : 2.2.2
numpy                 : 1.26.4
pytz                  : 2024.1
dateutil              : 2.9.0.post0
setuptools            : 69.5.1
pip                   : 24.0
Cython                : None
pytest                : None
hypothesis            : None
sphinx                : None
blosc                 : None
feather               : None
xlsxwriter            : None
lxml.etree            : None
html5lib              : None
pymysql               : None

In [5]:
# Display the installed dask version string.
dask.__version__

'2024.5.0'

### Create sample DataFrames

In [6]:
# Minimal pandas frame: one integer column and one string column, two rows.
pdf = pd.DataFrame(
    {
        "col1": [1, 2],
        "col2": ["a", "b"],
    }
)
pdf

Unnamed: 0,col1,col2
0,1,a
1,2,b


In [7]:
# Build a 4x8 frame of uniform random floats in [0, 1) with columns "a" through "h".
# A seeded Generator replaces the unseeded legacy `np.random.rand` so that
# Restart & Run All reproduces exactly the same values.
random_pdf = pd.DataFrame(
    np.random.default_rng(seed=42).random((4, 8)),
    columns=list("abcdefgh"),
)
random_pdf

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.204247,0.016034,0.984315,0.091247,0.313482,0.411202,0.305601,0.181554
1,0.55077,0.01802,0.367183,0.835317,0.464552,0.293975,0.081861,0.411832
2,0.524485,0.122259,0.171648,0.546236,0.125679,0.8849,0.344714,0.826906
3,0.344524,0.94714,0.235267,0.018802,0.622988,0.965541,0.129054,0.668323


In [9]:
# Dask frame built from a column dict: one integer column and one string column.
# Displaying `ddf` shows only the lazy structure (dtypes and partitions), not the values.
ddf = dd.DataFrame.from_dict(
    {
        "col1": [1, 2],
        "col2": ["a", "b"],
    }
)
ddf

Unnamed: 0_level_0,col1,col2
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,int64,string
1,...,...


In [22]:
# Create a 100x10 dask array of uniform random floats, split into ten 10x10 chunks,
# and turn it into a dask DataFrame.
# The generator is seeded so that Restart & Run All reproduces the same values;
# the original unseeded generator produced a different frame on every run.
rng = da.random.default_rng(42)
a = rng.random((100, 10), chunks=(10, 10))
ddf = dd.from_array(a)
# .compute() materialises the lazy dask frame as a pandas DataFrame for display.
ddf.compute()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.231766,0.069781,0.879722,0.483587,0.144375,0.193723,0.807462,0.632617,0.504236,0.976793
1,0.462958,0.282372,0.364058,0.271053,0.227345,0.647358,0.433484,0.645055,0.725535,0.679687
2,0.791118,0.117311,0.633800,0.719787,0.782236,0.586520,0.717798,0.621156,0.124227,0.992249
3,0.606188,0.239512,0.260884,0.934414,0.823046,0.854305,0.625959,0.842741,0.139494,0.361172
4,0.285336,0.105768,0.249023,0.529218,0.529070,0.960292,0.256111,0.810293,0.243414,0.439882
...,...,...,...,...,...,...,...,...,...,...
95,0.777997,0.647762,0.500732,0.812366,0.718206,0.429019,0.403460,0.696930,0.974498,0.527414
96,0.844981,0.352188,0.741689,0.286182,0.729170,0.365715,0.531138,0.967228,0.375592,0.030786
97,0.686314,0.272778,0.811421,0.131140,0.679675,0.332746,0.175911,0.263534,0.080005,0.922961
98,0.275575,0.959491,0.543524,0.758677,0.646440,0.512807,0.425919,0.811455,0.889433,0.956755


### Rename columns

In [25]:
# Same two-row frame, but with spaces in the column labels so the renaming cells have something to fix.
pdf = pd.DataFrame(
    {
        "col 1": [1, 2],
        "col 2": ["a", "b"],
    }
)
pdf

Unnamed: 0,col 1,col 2
0,1,a
1,2,b


In [27]:
# Return a new frame whose column labels have every space replaced by an underscore.
pdf.rename(columns=lambda label: label.replace(" ", "_"))

Unnamed: 0,col_1,col_2
0,1,a
1,2,b


In [28]:
# Prepend "x_" to every column label; the result is a new frame and `pdf` itself is not modified.
pdf.add_prefix("x_")

Unnamed: 0,x_col 1,x_col 2
0,1,a
1,2,b


In [29]:
# Append "_y" to every column label; the result is a new frame and `pdf` itself is not modified.
pdf.add_suffix("_y")

Unnamed: 0,col 1_y,col 2_y
0,1,a
1,2,b


In [30]:
# Dask frame with spaces in the column labels, used by the dask renaming cells below.
# Displaying `ddf` shows only the lazy structure (dtypes and partitions), not the values.
ddf = dd.DataFrame.from_dict(
    {
        "col 1": [1, 2],
        "col 2": ["a", "b"],
    }
)
ddf

Unnamed: 0_level_0,col 1,col 2
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,int64,string
1,...,...


In [33]:
# Replace spaces with underscores in the dask frame's column labels,
# then materialise the result as a pandas DataFrame for display.
(
    ddf
    .rename(columns=lambda label: label.replace(" ", "_"))
    .compute()
)

Unnamed: 0,col_1,col_2
0,1,a
1,2,b


In [35]:
# Prepend "x_" to every column label of the dask frame; .compute() materialises the result for display.
ddf.add_prefix("x_").compute()

Unnamed: 0,x_col 1,x_col 2
0,1,a
1,2,b


In [36]:
# Append "_y" to every column label of the dask frame; .compute() materialises the result for display.
ddf.add_suffix("_y").compute()

Unnamed: 0,col 1_y,col 2_y
0,1,a
1,2,b
