In [1]:
!pip install findspark

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import os

# findspark.init() is deliberately called before the pyspark import below:
# per findspark's documentation it makes the local Spark installation
# importable, so keep this ordering when rearranging imports.
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [3]:
# Interpreter that PySpark should use. The relative ./venv path matches the
# '#venv' suffix on the archive URI configured below -- presumably the archive
# is unpacked under that name; TODO confirm.
os.environ['PYSPARK_PYTHON'] = './venv/bin/python'

# NOTE(review): the master is local[*] while the virtualenv archive is passed
# through a YARN-specific setting -- confirm the archive is actually used when
# running in local mode.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Spark ML Research')
    # Eager evaluation lets a DataFrame at the end of a cell render as a table.
    .config('spark.sql.repl.eagerEval.enabled', True)
    .config(
        'spark.yarn.dist.archives',
        's3a://pyspark-venvs/mlflow-dataproc-2.1.18.tar.gz#venv',
    )
    .getOrCreate()
)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/07 18:11:42 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to dataproc/hadoop/var/log/spark/apps/local-1733595100267.inprogress. This is unsupported


## Считываем данные 

In [10]:
# Load the Titanic training CSV (first row is the header, column types are
# inferred), keep only the label and the columns used as model inputs, and
# drop incomplete rows via na.drop() with its default settings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('s3a://mlops204-dataproc-bucket/data/titanic/train.csv')
    .select(
        'Survived',
        'Pclass',
        'Sex',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
        'Embarked',
    )
    .na.drop()
)

# Preview the first rows as the cell output.
df.limit(10)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
1,3,female,26.0,0,0,7.925,S
1,1,female,35.0,1,0,53.1,S
0,3,male,35.0,0,0,8.05,S
0,1,male,54.0,0,0,51.8625,S
0,3,male,2.0,3,1,21.075,S
1,3,female,27.0,0,2,11.1333,S
1,2,female,14.0,1,0,30.0708,C
1,3,female,4.0,1,1,16.7,S


## Строим модель

In [8]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml import Pipeline

In [9]:
# Step 1: map the string-valued columns to numeric category indices.
gender_index = StringIndexer(inputCol='Sex', outputCol='SexIndex')
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')

# Step 2: one-hot encode those indices into vector columns.
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVector')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkVector')

# Step 3: concatenate the numeric columns and the encoded vectors into a
# single 'features' vector column.
assembler = VectorAssembler(
    inputCols=['Pclass', 'SexVector', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVector'],
    outputCol='features',
)

# Preprocessing pipeline: both indexers run before their encoders, and the
# assembler runs last because it consumes the encoder outputs.
dataproc = Pipeline(
    stages=[gender_index, embark_indexer, gender_encoder, embark_encoder, assembler]
)

In [11]:
# Fit the preprocessing pipeline on the cleaned frame, then apply the fitted
# stages to that same frame to obtain the model-ready columns.
dataproc_model = dataproc.fit(df)
train_data = dataproc_model.transform(df)

# Preview the first rows, including the generated index/vector/features columns.
train_data.limit(10)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SexIndex,EmbarkedIndex,SexVector,EmbarkVector,features
0,3,male,22.0,1,0,7.25,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[3.0,1.0,22.0,1.0..."
1,1,female,38.0,1,0,71.2833,C,1.0,1.0,"(1,[],[])","(2,[1],[1.0])","[1.0,0.0,38.0,1.0..."
1,3,female,26.0,0,0,7.925,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","(8,[0,2,5,6],[3.0..."
1,1,female,35.0,1,0,53.1,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","[1.0,0.0,35.0,1.0..."
0,3,male,35.0,0,0,8.05,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[3.0,1.0,35.0,0.0..."
0,1,male,54.0,0,0,51.8625,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[1.0,1.0,54.0,0.0..."
0,3,male,2.0,3,1,21.075,S,0.0,0.0,"(1,[0],[1.0])","(2,[0],[1.0])","[3.0,1.0,2.0,3.0,..."
1,3,female,27.0,0,2,11.1333,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","[3.0,0.0,27.0,0.0..."
1,2,female,14.0,1,0,30.0708,C,1.0,1.0,"(1,[],[])","(2,[1],[1.0])","[2.0,0.0,14.0,1.0..."
1,3,female,4.0,1,1,16.7,S,1.0,0.0,"(1,[],[])","(2,[0],[1.0])","[3.0,0.0,4.0,1.0,..."
