In [1]:
import cudf
from cuml.preprocessing.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

In [2]:
# generate fake data
n_samples, n_cat_features, n_cont_features, n_classes = 10_000, 2, 100, 2
features_cat_numpy = np.random.binomial(1, 0.5, size=(n_samples, n_cat_features))
features_cont_numpy, labels_numpy = make_classification(n_samples=n_samples, n_features=n_cont_features)

# combine dataset
dataset_numpy = np.c_[features_cat_numpy, features_cont_numpy, labels_numpy]


# set types
dataset_numpy = dataset_numpy.astype(np.float64)

print(dataset_numpy.dtype)
print(dataset_numpy.shape)
print(dataset_numpy[0])

float64
(10000, 103)
[ 1.          0.          0.96943081 -0.94587334 -0.38306207 -1.34187705
  0.09529707  0.9203697   1.07852331 -0.91985799  0.54983161  1.6118916
 -0.48190531  0.78806093  2.05944633  0.86572757 -1.14111906 -0.25521409
  1.67393951 -0.47865666  0.17791119 -1.14433277 -0.79134766  0.43823341
  1.38151481 -1.44798172  1.15377386 -2.19586368 -0.55137387  1.56585318
  0.23913585 -0.57153832 -0.11644495 -3.25390131  0.06674186 -2.05150703
  1.48168734  0.6952747   0.18511629  0.04429615 -1.00958391  0.37563284
  1.01371203  0.57127259  1.47252721 -0.63335838  2.04563295  0.19982813
 -0.88490954 -1.75416386  0.53646123 -1.37322762  1.39387177 -0.5871511
  0.22583313 -1.95406999 -1.94639048  1.93356712 -0.36380218 -0.83268376
 -0.26961751 -0.04396685  0.18572127  0.14682872  0.10646075 -0.16167882
 -0.55390738 -0.43807029 -1.32622329 -0.87473373 -0.18396113 -0.67610005
 -1.36474895 -0.16348101 -0.96766089  0.42646688  0.28428658  0.75647606
 -2.29572377  0.28910566  0.1681

In [3]:
# create dataframe
df = pd.DataFrame(dataset_numpy)

# set column names
column_names = []
column_names += ['feature_cat_{}'.format(i) for i in range(n_cat_features)]
column_names += ['feature_cont_{}'.format(i) for i in range(n_cont_features)]
column_names += ['target']
df.columns = column_names

print(df.dtypes)
print(df.shape)

feature_cat_0      float64
feature_cat_1      float64
feature_cont_0     float64
feature_cont_1     float64
feature_cont_2     float64
feature_cont_3     float64
feature_cont_4     float64
feature_cont_5     float64
feature_cont_6     float64
feature_cont_7     float64
feature_cont_8     float64
feature_cont_9     float64
feature_cont_10    float64
feature_cont_11    float64
feature_cont_12    float64
feature_cont_13    float64
feature_cont_14    float64
feature_cont_15    float64
feature_cont_16    float64
feature_cont_17    float64
feature_cont_18    float64
feature_cont_19    float64
feature_cont_20    float64
feature_cont_21    float64
feature_cont_22    float64
feature_cont_23    float64
feature_cont_24    float64
feature_cont_25    float64
feature_cont_26    float64
feature_cont_27    float64
                    ...   
feature_cont_71    float64
feature_cont_72    float64
feature_cont_73    float64
feature_cont_74    float64
feature_cont_75    float64
feature_cont_76    float64
f

In [4]:
# convert to cudf
gdf = cudf.DataFrame.from_pandas(df)
print(gdf)

      feature_cat_0  feature_cat_1  feature_cont_0  feature_cont_1  \
0               1.0            0.0        0.969431       -0.945873   
1               1.0            1.0       -2.714386       -0.576433   
2               0.0            1.0        0.599862        0.859744   
3               0.0            0.0        1.712468       -0.136523   
4               0.0            0.0        0.021567       -1.096121   
5               0.0            1.0        0.166401       -0.483437   
6               0.0            0.0        0.875240       -0.788792   
7               1.0            1.0       -1.603928        0.751741   
8               1.0            1.0       -1.830694       -0.704603   
9               1.0            0.0        0.184704        0.143249   
10              0.0            0.0       -1.633593       -1.085122   
11              1.0            1.0       -0.863587       -0.823893   
12              1.0            1.0       -0.958481       -0.477798   
13              0.0 

In [5]:
# split datasets
X_train, X_test, y_train, y_test = train_test_split(gdf, 'target', train_size=0.80)

In [6]:
# recombine
train_gdf = cudf.concat([X_train, cudf.DataFrame({'target': y_train})], axis=1)
test_gdf = cudf.concat([X_test, cudf.DataFrame({'target': y_test})], axis=1)

In [7]:
# # write to parquet
# filename = 'dataset.parquet'
# cudf.io.parquet.to_parquet(gdf, filename)

# write to csv
filename = 'train_dataset.csv'
cudf.io.csv.to_csv(train_gdf, filename, index=False)

filename = 'test_dataset.csv'
cudf.io.csv.to_csv(test_gdf, filename, index=False)

In [8]:
# # load data
# gdf = cudf.io.parquet.read_parquet(filename)

# load data
# gdf = cudf.io.csv.read_csv(filename, index_col='Unnamed: 0')
gdf = cudf.io.csv.read_csv(filename)

In [9]:
print(gdf)
print(gdf.columns)

      feature_cat_0  feature_cat_1  feature_cont_0  feature_cont_1  \
0               0.0            0.0        1.123005       -0.534690   
1               0.0            1.0        0.767571        1.062773   
2               0.0            0.0       -0.734209        0.336177   
3               0.0            0.0        0.467867       -1.142996   
4               0.0            1.0       -1.133106       -0.016016   
5               1.0            1.0       -0.698909        0.776821   
6               1.0            0.0        0.314155        0.236959   
7               1.0            0.0       -1.075433       -1.565516   
8               1.0            0.0       -1.863171        1.044700   
9               0.0            0.0        0.909086       -0.890660   
10              1.0            1.0       -0.157249        0.462149   
11              1.0            1.0       -0.329658        1.791957   
12              0.0            1.0       -0.511981       -0.762221   
13              1.0 