# Dask-cuDFを活用したスケーラブルなデータ処理の実践

## データセットの生成

In [1]:
from sklearn.datasets import fetch_covtype
import cudf
import dask_cudf

In [2]:
# Load the Forest Covertype dataset through scikit-learn
covtype = fetch_covtype()

# cuDF
# Build the cuDF DataFrame with the feature names as column labels,
# then append the class label as the 'Cover_Type' column
cdf = cudf.DataFrame(covtype.data, columns=covtype.feature_names)
cdf['Cover_Type'] = covtype.target

# Dask-cuDF
# Wrap the cuDF DataFrame as a Dask-cuDF DataFrame split into two partitions
ddf = dask_cudf.from_cudf(cdf, npartitions=2)

# # Alternative: reading the data from a CSV file
# # cudf
# cdf = cudf.read_csv("example_output/foo.csv")
# # dask_cudf
# ddf = dask_cudf.read_csv("example_output/foo.csv")

In [3]:
# Summarise the columns: names, non-null counts and dtypes
cdf.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   Elevation                           581012 non-null  float64
 1   Aspect                              581012 non-null  float64
 2   Slope                               581012 non-null  float64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64
 4   Vertical_Distance_To_Hydrology      581012 non-null  float64
 5   Horizontal_Distance_To_Roadways     581012 non-null  float64
 6   Hillshade_9am                       581012 non-null  float64
 7   Hillshade_Noon                      581012 non-null  float64
 8   Hillshade_3pm                       581012 non-null  float64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float64
 10  Wilderness_Area_0                   581012 non-null  float64
 11  Wilderness_Area_1           

## データセットの確認

先頭の五行の確認

In [4]:
# cudf
# Show the first five rows (five is also the default for head())
cdf.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [5]:
# dask_cudf
# Show the first five rows (five is also the default for head())
ddf.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


一つのカラムについてソート（Dask-cuDFの結果は，みやすさのため先頭の五行のみ表示している）

In [6]:
# cudf
# Sort every row by the 'Aspect' column; the displayed result runs from
# the smallest to the largest 'Aspect' value
cdf.sort_values(by="Aspect")

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
18,2504.0,0.0,4.0,95.0,5.0,691.0,214.0,232.0,156.0,5572.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
121,3070.0,0.0,11.0,30.0,-6.0,6890.0,204.0,220.0,153.0,2858.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
224,3004.0,0.0,12.0,484.0,85.0,5068.0,202.0,217.0,152.0,5840.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
271,2942.0,0.0,3.0,0.0,0.0,4830.0,215.0,233.0,156.0,6062.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
312,2861.0,0.0,6.0,134.0,10.0,2980.0,211.0,228.0,155.0,3972.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442805,2986.0,360.0,27.0,210.0,127.0,808.0,170.0,180.0,139.0,3315.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
456552,3218.0,360.0,27.0,335.0,102.0,3750.0,170.0,180.0,139.0,3480.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7
457063,3218.0,360.0,26.0,212.0,72.0,3929.0,173.0,183.0,140.0,3515.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
510127,3318.0,360.0,30.0,752.0,142.0,1449.0,161.0,170.0,134.0,1585.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7


In [7]:
# dask_cudf
# Sort by the 'Aspect' column, then show only the first five rows
ddf.sort_values(by="Aspect").head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
17602,2827.0,0.0,11.0,175.0,39.0,3824.0,204.0,220.0,153.0,5495.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
111687,3142.0,0.0,5.0,618.0,29.0,3643.0,213.0,231.0,156.0,3577.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
32459,2972.0,0.0,11.0,272.0,145.0,5075.0,204.0,220.0,153.0,4930.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
201379,3297.0,0.0,2.0,175.0,63.0,4754.0,216.0,235.0,156.0,594.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
101931,2668.0,0.0,7.0,300.0,16.0,819.0,210.0,228.0,155.0,1677.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


## 行と列の選択

列の選択

In [8]:
# cudf
# Select the single column 'Aspect'; the result is displayed as a Series
cdf["Aspect"]

0          51.0
1          56.0
2         139.0
3         155.0
4          45.0
          ...  
581007    153.0
581008    152.0
581009    159.0
581010    170.0
581011    165.0
Name: Aspect, Length: 581012, dtype: float64

In [9]:
# dask_cudf
# Select the single column 'Aspect' and show its first five values
ddf["Aspect"].head(n=5)

0     51.0
1     56.0
2    139.0
3    155.0
4     45.0
Name: Aspect, dtype: float64

Dask-cuDFでは分散処理ができるようにデータが複数のチャンクに分割されているため，データの全てがローカルのGPUメモリにロードされているとは限らない。

したがって，Dask-cuDFでデータを確認したい場合，データを確認するための操作を明示的に行わなければならない。

カラム'Aspect'と'Slope'について，インデックスラベル2から5まで（両端を含む）の行を取得

In [10]:
# cudf
# Label-based selection: index labels 2 through 5 (both ends included)
# of the 'Aspect' and 'Slope' columns
cdf.loc[2:5, ["Aspect", "Slope"]]

Unnamed: 0,Aspect,Slope
2,139.0,9.0
3,155.0,18.0
4,45.0,2.0
5,132.0,6.0


In [81]:
# dask_cudf
# Label-based selection of index labels 2 through 5 for 'Aspect' and
# 'Slope'; head() brings back at most five rows, which covers all four here
ddf.loc[2:5, ["Aspect", "Slope"]].head(n=5)

Unnamed: 0,Aspect,Slope
2,139.0,9.0
3,155.0,18.0
4,45.0,2.0
5,132.0,6.0


In [11]:
# cudf
# Summary statistics per column: count, mean, std, min, quartiles and max
cdf.describe()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
count,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,...,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0
mean,2959.365301,155.656807,14.103704,269.428217,46.418855,2350.146611,212.146049,223.318716,142.528263,1980.291226,...,0.090392,0.077716,0.002773,0.003255,0.000205,0.000513,0.026803,0.023762,0.01506,2.051471
std,279.984734,111.913721,7.488242,212.549356,58.295232,1559.25487,26.769889,19.768697,38.274529,1324.19521,...,0.286743,0.267725,0.052584,0.056957,0.01431,0.022641,0.161508,0.152307,0.121791,1.396504
min,1859.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2809.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2996.0,127.0,13.0,218.0,30.0,1997.0,218.0,226.0,143.0,1710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,3163.0,260.0,18.0,384.0,69.0,3328.0,231.0,237.0,168.0,2550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,3858.0,360.0,66.0,1397.0,601.0,7117.0,254.0,254.0,254.0,7173.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


In [12]:
# dask_cudf
# describe() yields eight summary rows (count, mean, std, min, 25%, 50%,
# 75%, max), so head(8) brings the whole summary back for display.
# NOTE(review): the 25%/50%/75% rows shown here differ from the cuDF
# output — presumably Dask estimates quantiles approximately across
# partitions; confirm before relying on these values.
ddf.describe().head(8)

Unnamed: 0,Aspect,Cover_Type,Elevation,Hillshade_3pm,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points,Horizontal_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Slope,...,Soil_Type_5,Soil_Type_6,Soil_Type_7,Soil_Type_8,Soil_Type_9,Vertical_Distance_To_Hydrology,Wilderness_Area_0,Wilderness_Area_1,Wilderness_Area_2,Wilderness_Area_3
count,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,...,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0
mean,155.656807,2.051471,2959.365301,142.528263,212.146049,223.318716,1980.291226,269.428217,2350.146611,14.103704,...,0.011316,0.000181,0.000308,0.001974,0.056168,46.418855,0.448865,0.051434,0.436074,0.063627
std,111.913721,1.396504,279.984734,38.274529,26.769889,19.768697,1324.19521,212.549356,1559.25487,7.488242,...,0.105775,0.013442,0.01755,0.044387,0.230245,58.295232,0.497379,0.220882,0.495897,0.244087
min,0.0,1.0,1859.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0
25%,64.0,1.0,2888.0,120.0,201.0,213.0,1121.0,120.0,1243.0,10.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
50%,146.0,2.0,3064.0,145.0,219.0,227.0,1984.0,247.0,2549.0,14.0,...,0.0,0.0,0.0,0.0,0.0,36.0,1.0,0.0,0.0,0.0
75%,261.0,2.0,3222.0,172.0,232.0,240.0,3002.0,430.0,4414.0,19.0,...,0.0,0.0,0.0,0.0,0.0,81.0,1.0,0.0,1.0,0.0
max,360.0,7.0,3858.0,254.0,254.0,254.0,7173.0,1397.0,7117.0,66.0,...,1.0,1.0,1.0,1.0,1.0,601.0,1.0,1.0,1.0,1.0


## 遅延評価

In [13]:
# cudf
# Mean of every column per 'Cover_Type' class, with the groups in sorted
# key order; the result is produced and displayed immediately
cdf.groupby('Cover_Type', sort=True).mean()

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
Cover_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3128.644888,156.138227,13.12711,270.555245,42.156939,2614.834517,211.998782,223.430211,143.875038,2009.253517,...,0.056,0.100821,0.085668,0.000444,0.004395,6.6e-05,0.0,0.041206,0.037207,0.022781
2,2920.936061,152.060515,13.550499,279.916442,45.884219,2429.530799,213.844423,225.326596,142.983466,2168.154849,...,0.046625,0.104327,0.089333,0.005051,4.2e-05,0.000148,0.0,0.002612,0.001264,0.001172
3,2394.509845,176.37249,20.770208,210.276473,62.446915,943.940734,201.918415,215.826537,140.367176,910.955949,...,0.0,0.002965,0.00014,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2223.939934,137.139425,18.528941,106.934838,41.186749,914.19949,228.345832,216.997088,111.392792,859.124135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2787.417571,139.283051,16.641315,212.354893,50.610344,1349.765722,223.474876,219.035816,121.920889,1577.719794,...,0.03255,0.048457,0.054567,0.002107,0.0,0.0,0.0,0.0,0.0,0.0
6,2419.181897,180.539068,19.048886,159.853458,45.437439,1037.169805,192.844302,209.827662,148.284044,1055.351471,...,0.003628,0.011516,0.031036,0.000864,0.0,0.0,0.0,0.0,0.0,0.0
7,3361.928669,153.236226,14.255924,356.994686,69.474305,2738.250463,216.967723,221.746026,134.932033,2070.031594,...,0.010824,0.040907,0.031009,0.002487,0.046221,0.003072,0.014529,0.297611,0.27138,0.175134


In [14]:
# dask_cudf
# Define the mean of every column per 'Cover_Type' class.
# NOTE: despite its name, `target_sum` holds group means, not sums.
target_sum = ddf.groupby('Cover_Type', sort=True).mean()

# compute() evaluates the deferred result so that it can be displayed
target_sum.compute()

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
Cover_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3128.644888,156.138227,13.12711,270.555245,42.156939,2614.834517,211.998782,223.430211,143.875038,2009.253517,...,0.056,0.100821,0.085668,0.000444,0.004395,6.6e-05,0.0,0.041206,0.037207,0.022781
2,2920.936061,152.060515,13.550499,279.916442,45.884219,2429.530799,213.844423,225.326596,142.983466,2168.154849,...,0.046625,0.104327,0.089333,0.005051,4.2e-05,0.000148,0.0,0.002612,0.001264,0.001172
3,2394.509845,176.37249,20.770208,210.276473,62.446915,943.940734,201.918415,215.826537,140.367176,910.955949,...,0.0,0.002965,0.00014,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2223.939934,137.139425,18.528941,106.934838,41.186749,914.19949,228.345832,216.997088,111.392792,859.124135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2787.417571,139.283051,16.641315,212.354893,50.610344,1349.765722,223.474876,219.035816,121.920889,1577.719794,...,0.03255,0.048457,0.054567,0.002107,0.0,0.0,0.0,0.0,0.0,0.0
6,2419.181897,180.539068,19.048886,159.853458,45.437439,1037.169805,192.844302,209.827662,148.284044,1055.351471,...,0.003628,0.011516,0.031036,0.000864,0.0,0.0,0.0,0.0,0.0,0.0
7,3361.928669,153.236226,14.255924,356.994686,69.474305,2738.250463,216.967723,221.746026,134.932033,2070.031594,...,0.010824,0.040907,0.031009,0.002487,0.046221,0.003072,0.014529,0.297611,0.27138,0.175134
