In [None]:
import cudf
from cuml.preprocessing.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

In [None]:
# generate fake data
# A fixed seed makes the synthetic dataset identical on every fresh kernel run.
RANDOM_SEED = 42
n_samples, n_cat_features, n_cont_features, n_classes = 10_000, 2, 100, 2

# categorical features: independent 0/1 draws with probability 0.5
rng = np.random.RandomState(RANDOM_SEED)
features_cat_numpy = rng.binomial(1, 0.5, size=(n_samples, n_cat_features))

# continuous features and labels; n_classes is passed explicitly so the
# constant declared above actually controls the number of label classes
features_cont_numpy, labels_numpy = make_classification(
    n_samples=n_samples,
    n_features=n_cont_features,
    n_classes=n_classes,
    random_state=RANDOM_SEED,
)

# combine dataset: categorical columns first, continuous columns next, labels last
dataset_numpy = np.c_[features_cat_numpy, features_cont_numpy, labels_numpy]


# set types: a single float64 dtype for every column, labels included
dataset_numpy = dataset_numpy.astype(np.float64)

print(dataset_numpy.dtype)
print(dataset_numpy.shape)
print(dataset_numpy[0])

In [None]:
# column labels: categorical features first, then continuous features, then the label
cat_names = [f'feature_cat_{idx}' for idx in range(n_cat_features)]
cont_names = [f'feature_cont_{idx}' for idx in range(n_cont_features)]
column_names = cat_names + cont_names + ['target']

# create dataframe with the labels attached at construction time
df = pd.DataFrame(dataset_numpy, columns=column_names)

print(df.dtypes)
print(df.shape)

In [None]:
# move the pandas frame into a cuDF DataFrame and show it
gdf = cudf.from_pandas(df)
print(gdf)

In [None]:
# split datasets: 80% of the rows go to train, with the 'target' column of gdf as labels
# random_state pins the split so it is reproducible across fresh kernel runs
X_train, X_test, y_train, y_test = train_test_split(
    gdf, 'target', train_size=0.80, random_state=42
)

In [None]:
# recombine
def _attach_target(features, labels):
    """Return `features` with `labels` appended column-wise as a 'target' column."""
    target_frame = cudf.DataFrame({'target': labels})
    return cudf.concat([features, target_frame], axis=1)


train_gdf = _attach_target(X_train, y_train)
test_gdf = _attach_target(X_test, y_test)

In [None]:
# parquet alternative, currently disabled:
# filename = 'dataset.parquet'
# cudf.io.parquet.to_parquet(gdf, filename)

# write each split to its own csv file, called with index=False
# NOTE: after the loop `filename` is 'test_dataset.csv'
for filename, split_gdf in (
    ('train_dataset.csv', train_gdf),
    ('test_dataset.csv', test_gdf),
):
    cudf.io.csv.to_csv(split_gdf, filename, index=False)

In [None]:
# disabled alternatives:
# gdf = cudf.io.parquet.read_parquet(filename)
# gdf = cudf.io.csv.read_csv(filename, index_col='Unnamed: 0')

# load data back from csv; reads whichever path `filename` currently holds
# and rebinds `gdf` to the loaded frame
gdf = cudf.read_csv(filename)

In [None]:
# show the loaded frame followed by its column index, one per line
print(gdf, gdf.columns, sep='\n')