# Comparison between pandas, Dask, and Koalas

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal
from pandas.testing import assert_series_equal
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import treelite
import treelite_runtime
import dill
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from pyspark.ml.classification import RandomForestClassifier as RFCSpark

**Gators** imports

In [3]:
# data cleaning
from gators.data_cleaning import (
    DropColumns,
    Replace,
)
# imputers
from gators.imputers import (
    NumericsImputer, 
    ObjectImputer,
)
# encoders
from gators.encoders import (
    WOEEncoder,
)
# binning
from gators.binning import (
    BinRareCategories,
    BinSingleTargetClassCategories,
    Binning,
    CustomBinning,
    QuantileBinning,
    TreeBinning,
)
# feature generation
from gators.feature_generation import (
    PolynomialFeatures,
    ElementaryArithmetics,
    PolynomialObjectFeatures,
    IsNull,
)
from gators.feature_generation_str import (
    StringContains,
    StringLength,
    Extract,
    SplitExtract,
)
# feature selection
from gators.feature_selection import (
    SelectFromModel,
    InformationValue
)
# model building
from gators.model_building import (
    TrainTestSplit,
    XGBBoosterBuilder,
    XGBTreeliteDumper,
)
# pipeline
from gators.pipeline import Pipeline

## pipeline

In [4]:
# Preprocessing steps, assembled stage by stage. The final list (step names,
# order and transformer arguments) feeds the single `pipe` object that the
# pandas, Dask and Koalas sections all call.
steps = []

# Text columns: Name -> Dummy -> Title through two chained SplitExtract steps
# (presumably split on ', ' and then on '.'; confirm against the gators docs),
# string lengths for Cabin/Ticket, then drop the raw text columns.
steps += [
    (
        'SplitExtractName',
        SplitExtract(['Name'], [', '], [1], ['Dummy']),
    ),
    (
        'SplitExtractTitle',
        SplitExtract(['Dummy'], ['.'], [0], ['Title']),
    ),
    (
        'StringLength',
        StringLength(columns=['Cabin', 'Ticket']),
    ),
    (
        'DropColumns',
        DropColumns(['Name', 'Dummy', 'Cabin', 'Ticket']),
    ),
]

# Imputation and category clean-up.
steps += [
    (
        'ObjectImputer',
        ObjectImputer(strategy='constant', value='MISSING'),
    ),
    (
        'BinSingleTargetClassCategories',
        BinSingleTargetClassCategories(),
    ),
    (
        'NumericsImputer',
        NumericsImputer(strategy='mean'),
    ),
]

# Feature generation: FamilySize from SibSp '+' Parch, tree-based binning
# (in place), then degree-2 combinations of the listed columns.
steps += [
    (
        'ElementaryArithmetics',
        ElementaryArithmetics(
            operator='+',
            columns_a=['SibSp'],
            columns_b=['Parch'],
            column_names=['FamilySize'],
        ),
    ),
    (
        'TreeBinning',
        TreeBinning(
            tree=DecisionTreeClassifier(max_depth=2, min_samples_leaf=25),
            inplace=True,
        ),
    ),
    (
        'PolynomialObjectFeatures',
        PolynomialObjectFeatures(
            columns=[
                'Pclass',
                'Sex',
                'Age',
                'Fare',
                'Embarked',
                'Title',
                'FamilySize',
            ],
            degree=2,
        ),
    ),
]

# Final categorical clean-up and encoding.
steps += [
    ('CleanCatego', BinRareCategories(min_ratio=0.0)),
    ('Encoder', WOEEncoder(add_missing_categories=True)),
]

pipe = Pipeline(steps=steps, verbose=False)

## Pandas pipeline

In [5]:
# Load the Titanic data with pandas, using a fresh 0..n-1 index.
data = pd.read_parquet('data/titanic.parquet').reset_index(drop=True)

# 'Survived' is the target; every other column is a feature.
X, y = data.drop(columns='Survived'), data['Survived']

# Hold out 30% of the rows using the 'ordered' strategy
# (presumably a non-shuffled split; confirm against the gators docs).
train_test_split = TrainTestSplit(test_ratio=0.3, strategy='ordered')
X_train, X_test, y_train, y_test = train_test_split.transform(X, y)

In [6]:
# Fit the shared pipeline on the pandas training split and transform it, then
# apply the fitted pipeline to the pandas test split.
# NOTE(review): the Dask and Koalas sections call fit_transform on this same
# `pipe` object again, which presumably replaces the state fitted here.
X_train_prepro_pd = pipe.fit_transform(X_train, y_train)
X_test_prepro_pd = pipe.transform(X_test)

## Dask pipeline

In [7]:
import dask.dataframe as dd
import dask.distributed
# Create a distributed client with default arguments; it is used below to
# persist the Dask train/test splits.
# NOTE(review): with no arguments this presumably starts a local cluster — confirm.
client = dask.distributed.Client()

In [8]:
# Same load / target split as the pandas section, on a Dask DataFrame.
data_dd = dd.read_parquet('data/titanic.parquet').reset_index(drop=True)
X_dd, y_dd = data_dd.drop('Survived', axis=1), data_dd['Survived']

# Split with the same settings as the pandas run, persisting each of the four
# resulting collections on the cluster in the order they are returned.
train_test_split = TrainTestSplit(test_ratio=0.3, strategy='ordered')
X_train_dd, X_test_dd, y_train_dd, y_test_dd = (
    client.persist(split)
    for split in train_test_split.transform(X_dd, y_dd)
)

In [9]:
# Re-fit the shared pipeline on the Dask training split and transform it, then
# apply the fitted pipeline to the Dask test split.
X_train_prepro_dd = pipe.fit_transform(X_train_dd, y_train_dd)
X_test_prepro_dd = pipe.transform(X_test_dd)

## Koalas pipeline

In [10]:
from pyspark import SparkConf, SparkContext

# Spark settings for this comparison: executor memory and whole-stage codegen
# are set on the conf before the context is requested.
conf = SparkConf()
conf.set('spark.executor.memory', '2g')
conf.set('spark.sql.codegen.wholeStage', 'false')

# getOrCreate keeps this cell re-runnable: calling SparkContext(conf=conf) a
# second time in the same kernel raises ValueError because only one context
# may be active. On a fresh kernel the behaviour is unchanged; if a context
# already exists it is reused and `conf` is not re-applied.
SparkContext.getOrCreate(conf=conf)

# Koalas is imported after the context has been requested, as in the original
# cell (presumably so it picks up this configuration — confirm).
import databricks.koalas as ks
ks.set_option('compute.default_index_type', 'distributed-sequence')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/25 11:40:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [16]:
!pip install pyspark<3.2.0

zsh:1: no such file or directory: 3.2.0


In [11]:
# Same load / target split as the pandas section, on a Koalas DataFrame.
data_ks = ks.read_parquet('data/titanic.parquet').reset_index(drop=True)
X_ks, y_ks = data_ks.drop('Survived', axis=1), data_ks['Survived']

# Split with the same settings as the pandas and Dask runs.
train_test_split = TrainTestSplit(test_ratio=0.3, strategy='ordered')
X_train_ks, X_test_ks, y_train_ks, y_test_ks = train_test_split.transform(
    X_ks, y_ks
)

                                                                                

In [12]:
# Re-fit the shared pipeline on the Koalas training split and transform it,
# then apply the fitted pipeline to the Koalas test split.
X_train_prepro_ks = pipe.fit_transform(X_train_ks, y_train_ks)
X_test_prepro_ks = pipe.transform(X_test_ks)

22/01/25 11:41:05 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

## check results match

### Check that the pandas, Dask, and Koalas results match (production pipeline)

In [13]:
# The pandas output is the reference; each distributed result is materialised
# as a pandas DataFrame and must match it exactly.

# Training split: Koalas vs pandas.
assert_frame_equal(left=X_train_prepro_pd, right=X_train_prepro_ks.to_pandas())

# Training split: Dask vs pandas.
assert_frame_equal(left=X_train_prepro_pd, right=X_train_prepro_dd.compute())

# Test split: Dask vs pandas.
assert_frame_equal(left=X_test_prepro_pd, right=X_test_prepro_dd.compute())



In [14]:
# Test split: Koalas vs pandas. The pandas index is reset to 0..n-1 before
# comparing, since the pandas test split keeps its original row labels.
assert_frame_equal(
    left=X_test_prepro_pd.reset_index(drop=True),
    right=X_test_prepro_ks.to_pandas(),
)

                                                                                