In [1]:
!pip install findspark

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession

In [29]:
os.environ['PYSPARK_PYTHON'] = './venv/bin/python'
spark = SparkSession\
    .builder\
    .master('local[*]')\
    .appName('Spark ML Research')\
    .config('spark.sql.repl.eagerEval.enabled', True) \
    .config('spark.yarn.dist.archives', 's3a://pyspark-venvs/mlflow-hyperopt-dataproc-2.1.18.tar.gz#venv')\
    .getOrCreate()

## Считываем данные 

In [30]:
df = spark.read.csv(
    's3a://mlops204-dataproc-bucket/data/titanic/train.csv', 
    header=True, 
    inferSchema=True
)

df = df.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]).na.drop()

df.limit(10)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
1,3,female,26.0,0,0,7.925,S
1,1,female,35.0,1,0,53.1,S
0,3,male,35.0,0,0,8.05,S
0,1,male,54.0,0,0,51.8625,S
0,3,male,2.0,3,1,21.075,S
1,3,female,27.0,0,2,11.1333,S
1,2,female,14.0,1,0,30.0708,C
1,3,female,4.0,1,1,16.7,S


## Конструируем пайплайн обработки данных

In [31]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

In [32]:
gender_index = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVector')

embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkVector')

assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'SexVector',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
        'EmbarkVector'
    ],
    outputCol='features'
)

dataproc = Pipeline(stages=[
    gender_index,
    embark_indexer,
    gender_encoder,
    embark_encoder,
    assembler
])

In [33]:
ready_data = dataproc.fit(df).transform(df)
ready_data.limit(10)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SexIndex,EmbarkedIndex,SexVector,EmbarkVector,features
0,3,male,22.0,1,0,7.25,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[3.0,1.0,22.0,1.0..."
1,1,female,38.0,1,0,71.2833,C,1.0,1.0,"(1,[],[])","(2,[1],[1.0])","[1.0,0.0,38.0,1.0..."
1,3,female,26.0,0,0,7.925,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","(8,[0,2,5,6],[3.0..."
1,1,female,35.0,1,0,53.1,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","[1.0,0.0,35.0,1.0..."
0,3,male,35.0,0,0,8.05,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[3.0,1.0,35.0,0.0..."
0,1,male,54.0,0,0,51.8625,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[1.0,1.0,54.0,0.0..."
0,3,male,2.0,3,1,21.075,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[3.0,1.0,2.0,3.0,..."
1,3,female,27.0,0,2,11.1333,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","[3.0,0.0,27.0,0.0..."
1,2,female,14.0,1,0,30.0708,C,1.0,1.0,"(1,[],[])","(2,[1],[1.0])","[2.0,0.0,14.0,1.0..."
1,3,female,4.0,1,1,16.7,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","[3.0,0.0,4.0,1.0,..."


## Подбираем гиперпараметры и тренируем модель

In [34]:
from functools import partial

import mlflow
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, SparkTrials, Trials

from pyspark.ml.classification import LogisticRegression

In [38]:
# Определяем пространство поиска для hyperopt
search_space = {
    'regParam': hp.lognormal('regParam', 0, 1.0),
    'elasticNetParam': hp.lognormal('elasticNetParam', 0, 1.0),
    'fitIntercept': hp.choice('fitIntercept', [False, True])
}

def objective(params, train_data):
    print(params)

    accuracy = 1
    return {'loss': -accuracy, 'status': STATUS_OK}

In [39]:
train_data, test_data = ready_data.randomSplit([.7, .3])

trials = Trials()

best = fmin(
    fn=partial(objective, train_data=train_data),
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)

{'elasticNetParam': 4.57274705265695, 'fitIntercept': True, 'regParam': 0.2750182064962328}                                                                                           
{'elasticNetParam': 1.2868650819451264, 'fitIntercept': True, 'regParam': 1.368845223907106}                                                                                          
{'elasticNetParam': 0.7984944416134407, 'fitIntercept': False, 'regParam': 2.300725669366549}                                                                                         
{'elasticNetParam': 6.934564250187186, 'fitIntercept': False, 'regParam': 0.7032801181576447}                                                                                         
{'elasticNetParam': 1.1316492204072448, 'fitIntercept': False, 'regParam': 0.30000765509046656}                                                                                       
{'elasticNetParam': 2.561542480340059, 'fitIntercept': False, 'regParam': 6.269462242

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SexIndex,EmbarkedIndex,SexVector,EmbarkVector,features
0,1,female,25.0,1,2,151.55,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","[1.0,0.0,25.0,1.0..."
0,1,male,24.0,0,0,79.2,C,0.0,1.0,"(1,[0],[1.0])","(2,[1],[1.0])","[1.0,1.0,24.0,0.0..."
0,1,male,27.0,0,2,211.5,C,0.0,1.0,"(1,[0],[1.0])","(2,[1],[1.0])","[1.0,1.0,27.0,0.0..."
0,1,male,29.0,0,0,30.0,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[1.0,1.0,29.0,0.0..."
0,1,male,31.0,1,0,52.0,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[1.0,1.0,31.0,1.0..."
0,1,male,36.0,0,0,40.125,C,0.0,1.0,"(1,[0],[1.0])","(2,[1],[1.0])","[1.0,1.0,36.0,0.0..."
0,1,male,37.0,0,1,29.7,C,0.0,1.0,"(1,[0],[1.0])","(2,[1],[1.0])","[1.0,1.0,37.0,0.0..."
0,1,male,37.0,1,0,53.1,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[1.0,1.0,37.0,1.0..."
0,1,male,38.0,0,1,153.4625,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[1.0,1.0,38.0,0.0..."
0,1,male,42.0,1,0,52.0,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[1.0,1.0,42.0,1.0..."
