# Initialization

In [1]:
!spark-shell --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.4.1
      /_/
                        
Using Scala version 2.12.17, OpenJDK 64-Bit Server VM, 17.0.8
Branch HEAD
Compiled by user centos on 2023-06-19T23:01:01Z
Revision 6b1ff22dde1ead51cbf370be6e48a802daae58b6
Url https://github.com/apache/spark
Type --help for more information.


In [2]:
!pip install --quiet minio

# Titanic Logistic Regression using PySpark
This is a code along of the famous titanic dataset, its always nice to start off with this dataset because it is an example you will find across pretty much every data analysis language.

# Download data from public

In [3]:
!wget https://raw.githubusercontent.com/meddash-cloud/meddash-public-datasets/main/data/titanic.csv

--2023-08-29 13:30:12--  https://raw.githubusercontent.com/meddash-cloud/meddash-public-datasets/main/data/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60302 (59K) [text/plain]
Saving to: ‘titanic.csv.1’


2023-08-29 13:30:12 (6.25 MB/s) - ‘titanic.csv.1’ saved [60302/60302]



# Upload the CSV file to Minio

In [4]:
BUCKET_NAME="jupyter-spark-03"
minio_url = "minio-service.kubeflow.svc.cluster.local:9000"
minio_key = "minio"
minio_secret = "minio123"

In [5]:
csv_file_name = "titanic.csv"

In [6]:
from minio import Minio
from minio.error import S3Error
import os

upload_file_name=csv_file_name
upload_file_path="./{}".format(upload_file_name)
config = {
    "endpoint": minio_url,
    "access_key": "minio",
    "secret_key": "minio123",
    "secure": False,
    }


# Create a client with the MinIO server playground, its access key
# and secret key.
print ("connecting to minio {}".format(minio_url))
minio_client = Minio(**config)

print("try to find bucket {}".format(BUCKET_NAME))
found = minio_client.bucket_exists(BUCKET_NAME)
print("found", found)
if not found:
    minio_client.make_bucket(BUCKET_NAME)
else:
    print("Bucket '{}' already exists".format(BUCKET_NAME))

print("upload file to minio...")
minio_client.fput_object(BUCKET_NAME, os.path.basename(upload_file_path), upload_file_path)

connecting to minio minio-service.kubeflow.svc.cluster.local:9000
try to find bucket jupyter-spark-03
found True
Bucket 'jupyter-spark-03' already exists
upload file to minio...


<minio.helpers.ObjectWriteResult at 0x7f8289b19e90>

In [7]:
from pyspark.sql import SparkSession

In [8]:
minio_file_path="s3a://{}/{}".format(BUCKET_NAME, csv_file_name)

In [9]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import os

cwd = os.getcwd()

# .config("spark.jars", "spark_jars/hadoop-aws-2.7.3.jar") \
#.config('spark.jars.packages','org.apache.hadoop:hadoop-aws:2.7.3')\
#.config("spark.executor.extraClassPath", f"{cwd}/spark_jars/hadoop-aws-2.7.3.jar")\
#.config("spark.jars", f"{cwd}/spark_jars/hadoop-aws-2.7.3.jar")\
#.config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.3")\
#.config('spark.jars.packages','org.apache.hadoop:hadoop-aws:3.3.6')\

# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-aws:3.3.6 pyspark-shell'
#.config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')\
#.config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')\


minio_url = "minio-service.kubeflow.svc.cluster.local:9000"
spark = SparkSession\
.builder\
.appName("ReadTextFilesFromS3")\
.master("local[*]")\
.config('spark.jars.packages','org.apache.hadoop:hadoop-aws:3.3.4')\
.config("spark.hadoop.fs.s3a.endpoint", "http://"+minio_url)\
.config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\
.config("spark.hadoop.fs.s3a.path.style.access", "true")\
.config("spark.hadoop.fs.s3a.access.key", "minio")\
.config("spark.hadoop.fs.s3a.secret.key", "minio123")\
.getOrCreate()



sc = spark.sparkContext




In [10]:
data = spark.read.csv(minio_file_path,inferSchema=True,header=True)

In [11]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [12]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [13]:
my_cols = data.select(['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [14]:
my_final_data = my_cols.na.drop()

### Working with Categorical Columns

Let's break this down into multiple steps to make it all clear.

In [15]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [16]:
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [17]:
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [18]:
assembler = VectorAssembler(inputCols=['Pclass',
 'SexVec',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'EmbarkVec'],outputCol='features')

In [19]:
from pyspark.ml.classification import LogisticRegression

## Pipelines 

Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)

In [20]:
from pyspark.ml import Pipeline

In [21]:
log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')

In [22]:
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,log_reg_titanic])

In [23]:
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7,.3])

In [24]:
fit_model = pipeline.fit(train_titanic_data)

In [25]:
results = fit_model.transform(test_titanic_data)

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [27]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Survived')

In [31]:
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [32]:
AUC = my_eval.evaluate(results)

In [33]:
AUC

0.8038455121788455