# JupyterHub Notebook

### This notebook server is hosted on the OpenShift platform, which provides a separate server for each individual user. The platform takes care of provisioning the server and allocating the related storage.

### First, install and import the required libraries, watermark the notebook, initialise our Spark session builder, and set up the required configuration for our environment

In [2]:
# Install the third-party packages this notebook needs into the kernel's environment.
# NOTE(review): versions are unpinned, so re-runs may pull different releases — consider pinning.
# NOTE(review): pyspark is not installed here although the import cell below needs it
# (its saved output shows ModuleNotFoundError) — confirm the notebook image provides it.
%pip install watermark
%pip install Minio
%pip install matplotlib


Collecting watermark
  Downloading watermark-2.2.0-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: watermark
Successfully installed watermark-2.2.0
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Collecting matplotlib
  Downloading matplotlib-3.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 4.5 MB/s eta 0:00:01
[?25hCollecting fonttools>=4.22.0
  Downloading fonttools-4.28.2-py3-none-any.whl (880 kB)
[K     |████████████████████████████████| 880 kB 134.8 MB/s eta 0:00:01
Collecting setuptools-scm>=4
  Downloading setuptools_scm-6.3.2-py3-none-any.whl (33 kB)
Collecting numpy>=1.17
  Downloading numpy-1.

In [3]:
import os
import json
from pyspark import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import from_json, col, to_json, struct
import watermark
from minio import Minio

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Register the `%watermark` magic, which the next cell uses.
%load_ext watermark

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Record the run environment for reproducibility: date (-n), Python/IPython version (-v),
# machine info (-m), current git commit hash (-g) and versions of imported packages (-iv).
%watermark -n -v -m -g -iv


In [None]:
# Prepare a session builder carrying the application name (no session is created here).
# NOTE(review): no later visible cell references `sparkSessionBuilder`; the session
# actually used is obtained through `spark_util` — confirm whether this is still needed.
sparkSessionBuilder = (
    SparkSession.builder
    .appName("Customer Churn ingest Pipeline")
)

In [6]:
# Spark submit arguments: point the S3A connector at the MinIO object store
# (path-style access, S3AFileSystem implementation) and pull in hadoop-aws.
#
# The endpoint and credentials default to the workshop values, but can be
# overridden through the MINIO_ENDPOINT, MINIO_ACCESS_KEY and MINIO_SECRET_KEY
# environment variables so that real secrets never have to be written into
# the notebook. With none of them set, the string is identical to the
# previously hard-coded one (single-space separated).
submit_args = " ".join([
    "--conf spark.jars.ivy=/tmp",
    "--conf spark.hadoop.fs.s3a.endpoint="
    + os.environ.get("MINIO_ENDPOINT", "http://minio-ml-workshop:9000"),
    "--conf spark.hadoop.fs.s3a.access.key="
    + os.environ.get("MINIO_ACCESS_KEY", "minio"),
    "--conf spark.hadoop.fs.s3a.secret.key="
    + os.environ.get("MINIO_SECRET_KEY", "minio123"),
    "--conf spark.hadoop.fs.s3a.path.style.access=true",
    "--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem",
    "--packages org.apache.hadoop:hadoop-aws:3.2.0",
])



###  Connect to Spark Cluster provided by OpenShift Platform

In [None]:
# NOTE(review): `spark_util` is a project-local helper that is not visible here;
# presumably it builds (or reuses) a session on the cluster from these submit args — confirm.
import spark_util

# Obtain the session used by every later cell, passing the application name and `submit_args`.
spark = spark_util.getOrCreateSparkSession("ML Ops Demo", submit_args)
# Emit Spark log messages at INFO level.
spark.sparkContext.setLogLevel("INFO")
print('Spark context started.')

###  Declare our input data sources, import and combine them

In [None]:
# Load the customer part of the churn dataset from object storage.
# `sep` is the CSV reader's field-separator option; Spark silently ignores
# option names it does not recognise, so the spelling matters.
# header='True' takes column names from the first row; inferSchema='True'
# lets Spark derive the column types from the data.
dataFrame_Customer = (
    spark.read
    .options(sep=',', inferSchema='True', header='True')
    .csv("s3a://rawdata/customers/Customer-Churn_P1.csv")
)
dataFrame_Customer.printSchema()

In [None]:
# Load the products part of the churn dataset from object storage.
# `sep` is the CSV reader's field-separator option; Spark silently ignores
# option names it does not recognise, so the spelling matters.
# header='True' takes column names from the first row; inferSchema='True'
# lets Spark derive the column types from the data.
dataFrame_Products = (
    spark.read
    .options(sep=',', inferSchema='True', header='True')
    .csv("s3a://rawdata/products/Customer-Churn_P2.csv")
)
dataFrame_Products.printSchema()


In [None]:
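# NOTE: This whole cell is commented out and does not run. It sketches reading JSON
# messages from a Kafka topic into a DataFrame — presumably kept as an alternative
# ingest path; confirm before re-enabling.
# from pyspark.sql.types import *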
# from  pyspark.sql.functions import *

# srcKafkaBrokers = "odh-message-bus-kafka-bootstrap:9092"
# srcKakaTopic = "datatelco"



# schema = StructType()\
#     .add("customerID", IntegerType())\
#     .add("PhoneService", StringType())\
#     .add("MultipleLines", StringType())\
#     .add("InternetService", StringType())\
#     .add("OnlineSecurity", StringType())\
#     .add("OnlineBackup", StringType())\
#     .add("DeviceProtection", StringType())\
#     .add("TechSupport", StringType())\
#     .add("StreamingTV", StringType())\
#     .add("StreamingMovies", StringType())\
#     .add("Contract", StringType())\
#     .add("PaperlessBilling", StringType())\
#     .add("PaymentMethod", StringType())\
#     .add("MonthlyCharges", StringType())\
#     .add("TotalCharges", DoubleType())\
#     .add("Churn", StringType())



# #Read from JSON Kafka messages into a dataframe
# dfKafka = spark.read.format("kafka")\
#     .option("kafka.bootstrap.servers", srcKafkaBrokers)\
#     .option("subscribe", srcKakaTopic)\
#     .option("startingOffsets", "earliest")\
#     .load()\
#     .withColumn("value", regexp_replace(col("value").cast("string"), "\\\\", "")) \
#     .withColumn("value", regexp_replace(col("value"), "^\"|\"$", "")) \
#     .selectExpr("CAST(value AS STRING) as jsonValue")\
#     .rdd.map(lambda row: row["jsonValue"])

# dfObj = spark.read.schema(schema).json(dfKafka)
# dfObj.printSchema()
# dfObj.show(n=2)


In [None]:
# Full outer join on customerID: rows from either source are kept even when
# the other source has no matching customer.
dataFrom_All = dataFrame_Customer.join(
    dataFrame_Products,
    on="customerID",
    how="full",
)

###  Push prepared data to object storage and stop Spark cluster to save resources
###  Note - be sure to change this user_id on the next line to your username (something in the range user1 ... user30)

In [None]:
# Per-user output location. The user id is appended directly to the prefix
# (no separator), so "user29" writes to s3a://data/full_data_csvuser29.
user_id = "user29"
file_location = "s3a://data/full_data_csv" + user_id

# Collapse to a single partition before writing, keep the header row, and
# replace any previous output at that location.
(
    dataFrom_All
    .repartition(1)
    .write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save(file_location)
)

In [None]:
# Stop the Spark session now that the data has been written, to save cluster resources.
spark.stop()