In [1]:
# Pinned dependencies; uncomment to install them into the kernel's environment.
# %pip install findspark==2.0.1
# %pip install pandas==2.0.0
# %pip install numpy==1.24.2
# %pip install catboost==1.2.2

In [2]:
import os

import findspark


# Bootstrap findspark ahead of the pyspark imports that follow; keep this ordering.
# NOTE(review): per the findspark docs, init() locates the Spark installation and
# makes pyspark importable from this interpreter — confirm for the version in use.
findspark.init()
# The value returned by find() is discarded here (it is not the cell's last
# expression, so nothing is displayed).
findspark.find()


from pyspark.sql import SparkSession
from pyspark import SparkConf
from catboost import CatBoostClassifier

In [None]:
# Choose the Python interpreters PySpark launches. These are set before the
# SparkSession is created further down.
# NOTE(review): the relative "./environment/bin/python" presumably points into a
# packed environment shipped to the YARN containers — confirm how it is distributed.
os.environ.update(
    {
        'PYSPARK_DRIVER_PYTHON': "python",
        'PYSPARK_PYTHON': "./environment/bin/python",
    }
)

In [3]:
# Spark configuration for the YARN application named "train_dummy_model".
conf = SparkConf()
conf.setMaster("yarn")
conf.setAppName("train_dummy_model")
conf.setAll(
    [
        ("spark.executor.memory", "2g"),
        ("spark.driver.memory", "4g"),
        # Arrow-backed conversion between Spark and pandas DataFrames.
        ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ]
)

# Reuse an already-running session if there is one, otherwise start a new one
# with the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# Location of the input CSV (s3a:// URI) that is loaded through Spark below.
DEFAULT_INPUT_DATA_PATH = "s3a://automl-otus-practice/data.csv"

In [5]:
# CSV reader: first row is the header, column types are inferred, fields are
# comma-separated.
csv_reader = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('delimiter', ",")
)
sdf = csv_reader.load(DEFAULT_INPUT_DATA_PATH)

# Collect the whole Spark DataFrame onto the driver as a pandas DataFrame.
pdf = sdf.toPandas()

In [6]:
# Shut the Spark session down; the data has already been copied into the
# pandas DataFrame `pdf`, which is what the remaining cells use.
spark.stop()

In [7]:
# Label column; fitted as a multi-class target (see loss_function below).
target = "score"

# Model input columns, one per line. Spellings such as `itens` and
# `mean_p_name_lenght` are kept exactly as written — presumably they match the
# CSV header, so do not "correct" them here.
features = [
    'has_more_payment_types',
    'has_sequential',
    'has_installments',
    'avg_payment_value',
    'mean_days_purchase_to_approved',
    'mean_days_approved_to_carrier',
    'mean_days_limit_to_carrier',
    'buy_has_work_day',
    'itens',
    'sum_price',
    'sum_freight',
    'sum_same_city',
    'sum_same_state',
    'mean_distance_km',
    'mean_p_name_lenght',
    'mean_p_photos_qty',
    'mean_p_weight_g',
    'mean_volume',
    'mean_length_width_ratio',
]

# The full frame is used for fitting; no hold-out split is made in this cell.
y = pdf[target]
train = pdf[features]

# Hyper-parameters gathered in one place so they are easy to scan and tweak.
catboost_params = dict(
    loss_function='MultiClass',
    bootstrap_type='Bernoulli',
    learning_rate=0.01,
    n_estimators=300,
    subsample=0.5,
    depth=4,
    verbose=False,
)
model = CatBoostClassifier(**catboost_params)

In [8]:
# Fit on the selected feature columns, then write the trained model to
# "model.cbm" in the current working directory.
model.fit(train, y)
model.save_model("model.cbm", format="cbm")