# Partitioning with Dask

In [2]:
# Loading a very big dataframe
import dask.dataframe as dd

In [7]:
# Open the systems metadata CSV as a Dask DataFrame.
# number_records is pinned to float64 instead of relying on dtype inference
# (presumably because the column has missing values -- TODO confirm).
df = dd.read_csv("data/systems_20250729.csv", dtype={'number_records': 'float64'})

# End the cell with the expression (not print) so Jupyter renders the
# preview as a rich table rather than wrapped plain text.
df.head()

   system_id system_public_name site_location timezone_or_utc_offset  \
0          2     Residential 1a  Lakewood, CO         America/Denver   
1          3     Residential 1b  Lakewood, CO         America/Denver   
2          4       NREL x-Si -1    Golden, CO                      7   
3         10        NREL CIS -1    Golden, CO                      7   
4         33  Silicor Materials    Golden, CO                      7   

   latitude  longitude  elevation_m  dc_capacity_kW kg_climate  \
0   39.7214  -105.0972       1675.0           2.912        Dfb   
1   39.7214  -105.0972       1675.0           2.720        Dfb   
2   39.7406  -105.1774       1795.3           1.000        BSk   
3   39.7404  -105.1774       1792.8           1.120        BSk   
4   39.7404  -105.1772       1794.0           2.400        BSk   

   pvcz_composite  ...  number_records  dataset_size_mb  \
0              12  ...      13685898.0           313.25   
1              12  ...      12668178.0           289

In [12]:
# Show how many partitions the Dask DataFrame `df` is currently split into
print(df.npartitions)

1


Each partition of a Dask DataFrame is itself a pandas DataFrame.

In [13]:
# Check the partition structure: apply len() to every partition to get its row count.
# .compute() runs the computation and returns one entry per partition.
df.map_partitions(len).compute()

0    1862
dtype: int64

- More partitions allow better parallelization, but they also add more overhead.

In [14]:
## Chunking: defining the size of each partition

In [18]:
# Re-read the same CSV, this time asking for partitions of 50MB via blocksize.
# NOTE(review): the file has only 1862 rows (see the per-partition row count above),
# so it is presumably far smaller than 50MB and this likely still yields a single
# partition -- check df_small_chunk.npartitions, or use a smaller blocksize to see a split.
df_small_chunk = dd.read_csv("data/systems_20250729.csv", blocksize='50MB', dtype={'number_records': 'float64'})

In [19]:
# Preview the first rows of the re-chunked DataFrame (rendered as a rich table)
df_small_chunk.head()

Unnamed: 0,system_id,system_public_name,site_location,timezone_or_utc_offset,latitude,longitude,elevation_m,dc_capacity_kW,kg_climate,pvcz_composite,...,number_records,dataset_size_mb,available_sensor_channels,qa_status,qa_issue,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30
0,2,Residential 1a,"Lakewood, CO",America/Denver,39.7214,-105.0972,1675.0,2.912,Dfb,12,...,13685898.0,313.25,7,fail,less than 1.0 years data,,,,,
1,3,Residential 1b,"Lakewood, CO",America/Denver,39.7214,-105.0972,1675.0,2.72,Dfb,12,...,12668178.0,289.95,7,fail,,,,,,
2,4,NREL x-Si -1,"Golden, CO",7,39.7406,-105.1774,1795.3,1.0,BSk,12,...,113978017.0,2608.75,15,pass,"Filtered time series less than 1.0 years data,...",,,,,
3,10,NREL CIS -1,"Golden, CO",7,39.7404,-105.1774,1792.8,1.12,BSk,12,...,113103574.0,2588.74,14,pass,Filtered time series less than 1.0 years data,,,,,
4,33,Silicor Materials,"Golden, CO",7,39.7404,-105.1772,1794.0,2.4,BSk,12,...,113673602.0,2601.78,15,pass,"Percent clipping exceeded threshold of 10%, Fi...",,,,,


## Saving in PARQUET format

This produces a set of small Parquet files (one per partition), which are faster to read than CSV. Note that the `df` saved below has a single partition, so only one file is written here.

In [21]:
# Save `df` in Parquet format. The target path is a directory, not a single file.
# write_index=False leaves the DataFrame index out of the written data.
df.to_parquet("data/save_parquet", write_index=False)