### Environment configuration for Google Colab

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz
!tar xf spark-3.0.3-bin-hadoop2.7.tgz
!pip install -q findspark==1.4.2 catboost==1.0.3

[K     |████████████████████████████████| 76.3 MB 1.3 MB/s 
[?25h

In [2]:
import os

import findspark

# Point the process at the JDK and at the unpacked Spark distribution
# before asking findspark to make pyspark importable.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.3-bin-hadoop2.7",
})
findspark.init()

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import StructField, StructType

In [21]:
import pandas as pd

# Read the churn table over HTTP and discard three columns that are not
# used further in this notebook.
CHURN_CSV_URL = "https://github.com/loverberg/portfolio/raw/main/Big_Data_MTC/telco-customer-churn.csv"
df = pd.read_csv(CHURN_CSV_URL).drop(columns=['year', 'month', 'noadditionallines'])

<a href='https://catboost.ai/en/docs/concepts/spark-cluster-configuration'>CatBoost Spark cluster configuration</a>

In [4]:
# Local Spark session for the notebook. `spark.jars.packages` pulls the
# catboost-spark artifact (Spark 3.0 / Scala 2.12 build, version 1.0.3);
# the remaining options fix cores per executor, CPUs per task and the
# driver/executor memory sizes.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('CatBoostWithSpark')
    .config("spark.jars.packages", "ai.catboost:catboost-spark_3.0_2.12:1.0.3")
    .config("spark.executor.cores", "2")
    .config("spark.task.cpus", "2")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.memoryOverhead", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.memoryOverhead", "2g")
    .getOrCreate()
)

In [5]:
# Display the session object as this cell's output.
spark

Docs <a href='https://catboost.ai/docs/catboost-spark/3.0_2.12/latest/api/python/'>catboost-spark</a>

In [6]:
import catboost_spark

### Prepare the dataset

In [22]:
# Convert the pandas frame `df` into a Spark DataFrame.
sparkDF = spark.createDataFrame(df)

In [16]:
# Name of the label column; passed as `labelCol` to the evaluator and the classifier.
TARGET_LABEL = 'churn'

In [50]:
# F1 evaluator: scores the "prediction" column against the label column.
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol=TARGET_LABEL,
    predictionCol="prediction",
)

In [51]:
# 85% / 15% train/test split. The seed is fixed so that the same rows land in
# each split on every run; without it the split, and therefore the reported
# F1, changes between executions.
trainDF, testDF = sparkDF.randomSplit([0.85, 0.15], seed=42)

### Pipeline model with CatBoost
<a href='https://catboost.ai/docs/catboost-spark/3.0_2.12/latest/api/python/api/catboost_spark.CatBoostClassifier.html?highlight=catboostclassifier#catboostclassifier'>CatBoostClassifier</a>

In [105]:
# NOTE(review): `create_metadata` is not defined anywhere in the visible
# notebook, and `features` is only assigned in a later cell, so this cell
# depends on leftover kernel state — confirm where both come from.
# The integer per column is presumably the number of distinct categories of
# that indexed feature — TODO confirm against `create_metadata`.
categorical_sizes = {
    'customersuspended_index': 2,
    'education_index': 2,
    'gender_index': 2,
    'homeowner_index': 2,
    'maritalstatus_index': 2,
    'occupation_index': 3,
    'state_index': 50,
    'usesinternetservice_index': 2,
    'usesvoiceservice_index': 2,
}
feature_metadata = create_metadata(features, categorical_sizes)

In [106]:
def _make_indexer(column):
    """Build a StringIndexer that writes `<column>_index` from `column`."""
    return StringIndexer(inputCol=column, outputCol=f"{column}_index")


# One indexer per string-valued column; each is referenced by name when the
# pipeline stages are listed.
customersuspended_indexer = _make_indexer('customersuspended')
education_indexer = _make_indexer('education')
gender_indexer = _make_indexer('gender')
homeowner_indexer = _make_indexer('homeowner')
maritalstatus_indexer = _make_indexer('maritalstatus')
occupation_indexer = _make_indexer('occupation')
state_indexer = _make_indexer('state')
usesinternetservice_indexer = _make_indexer('usesinternetservice')
usesvoiceservice_indexer = _make_indexer('usesvoiceservice')

# Columns packed, in this order, into the single `features` vector.
# NOTE(review): `customerid` is included as a model input — confirm that an
# identifier is really intended as a feature.
features = [
    'age',
    'annualincome',
    'calldroprate',
    'callfailurerate',
    'callingnum',
    'customerid',
    'customersuspended_index',
    'education_index',
    'gender_index',
    'homeowner_index',
    'maritalstatus_index',
    'monthlybilledamount',
    'numberofcomplaints',
    'numberofmonthunpaid',
    'numdayscontractequipmentplanexpiring',
    'occupation_index',
    'penaltytoswitch',
    'state_index',
    'totalminsusedinlastmonth',
    'unpaidbalance',
    'usesinternetservice_index',
    'usesvoiceservice_index',
    'percentagecalloutsidenetwork',
    'totalcallduration',
    'avgcallduration',
]

assembler = VectorAssembler(inputCols=features, outputCol='features')

In [95]:
def with_feature_metadata(assembled_df, metadata,
                          input_col="features",
                          output_col="features_with_meta"):
    """Return `assembled_df` with an extra column carrying `metadata`.

    The cell previously called `withColumn(...)` as a bare function, which
    raises NameError (`withColumn` exists only as a DataFrame method) and
    stops a Run All at this cell. The same expression is now wrapped in a
    helper that is applied to an explicit DataFrame.

    Args:
        assembled_df: DataFrame that already contains `input_col`. In this
            notebook that column is produced by the VectorAssembler stage,
            so the frame must have been assembled first.
        metadata: dict to attach to the new column. NOTE(review): presumably
            the value stored in `feature_metadata` — confirm it is a
            JSON-serialisable dict.
        input_col: name of the existing column to copy.
        output_col: name of the new column that carries the metadata.

    Returns:
        A new DataFrame with `output_col` added; `assembled_df` itself is
        not modified.
    """
    return assembled_df.withColumn(
        output_col,
        col(input_col).alias(output_col, metadata=metadata),
    )

In [107]:
# CatBoost classifier reading the assembled `features` vector.
N_ITERATIONS = 200
TREE_DEPTH = 10

classifier = catboost_spark.CatBoostClassifier(
    featuresCol='features',
    labelCol=TARGET_LABEL,
)
classifier.setIterations(N_ITERATIONS)
classifier.setDepth(TREE_DEPTH)

CatBoostClassifier_508e83c45785

In [108]:
# Stage order: the string indexers first, then the assembler that packs the
# feature vector, and the classifier last.
pipeline_stages = [
    customersuspended_indexer,
    education_indexer,
    gender_indexer,
    homeowner_indexer,
    maritalstatus_indexer,
    occupation_indexer,
    state_indexer,
    usesinternetservice_indexer,
    usesvoiceservice_indexer,
    assembler,
    classifier,
]
pipeline = Pipeline(stages=pipeline_stages)

In [110]:
# Fit all pipeline stages on the training split.
p_model = pipeline.fit(trainDF)   # TODO: I want to insert a stage with metadata here -
# withColumn("features_with_meta", col("features").alias("", metadata=feature_metadata)) 

In [59]:
# Run the fitted pipeline on the held-out test split.
predictions = p_model.transform(testDF)

In [61]:
# Score the test-split predictions with the F1 evaluator and report the value.
f1_score = evaluator.evaluate(predictions)
print(f'Model F1 = {f1_score}')

Model F1 = 0.9831345126567173


In [56]:
# Show the type of the fitted pipeline object.
type(p_model)

pyspark.ml.pipeline.PipelineModel

In [None]:
# Persist the fitted pipeline under the path 'catboost_pipeline';
# `.overwrite()` is requested on the writer before saving.
p_model.write().overwrite().save('catboost_pipeline')

### CatBoost limitations
<a href='https://catboost.ai/en/docs/concepts/spark-known-limitations'>List of limitations</a>