In [1]:
import os
import ray
import raydp
import pandas as pd
import pyspark

# Print the installed version of each library this notebook relies on.
print(f'ray version {ray.__version__}')
print(f'pandas version {pd.__version__}')
print(f'raydp version {raydp.__version__}')
print(f'pyspark version {pyspark.__version__}')

ray version 1.2.0
pandas version 1.1.4
raydp version 0.1.1
pyspark version 3.0.3


In [2]:
# Shell escape: show which Java runtime is on the PATH of the notebook's shell.
!java --version

openjdk 11.0.13 2021-10-19
OpenJDK Runtime Environment (build 11.0.13+8-Ubuntu-0ubuntu1.20.04)
OpenJDK 64-Bit Server VM (build 11.0.13+8-Ubuntu-0ubuntu1.20.04, mixed mode, sharing)


### start the ray cluster; since we are on the head node, use the defaults

In [3]:
from hyperplane.ray_common import initialize_ray_cluster, stop_ray_cluster, find_ray_workers
# Sizing of the Ray worker pool requested from Hyperplane.
num_workers = 2
cpu_core_per_worker = 15
# Allocatable RAM per worker depends on the node type:
# 110 GB for 16_128 nodes, 12 GB for 16_16 nodes, 27 GB for 32_32 nodes.
# NOTE(review): node names presumably mean <cpus>_<ram_gb> -- confirm.
ram_gb_per_worker = 12
# Start the workers; the returned handle is passed to stop_ray_cluster() later in the notebook.
ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)



deleting pod ray-worker-2f43d397-d6d4-4149-ace5-2dec9f63bc51
deleting pod ray-worker-3ce5b2e4-22ca-4750-8385-a6fba39cdf64
👉 Hyperplane: selecting worker node pool


2021-12-27 21:55:31,771	INFO services.py:1172 -- View the Ray dashboard at http://10.1.88.2:8787


ray dashboard available at https://shakdemo.hyperplane.dev/ray-stella2/#/
Waiting for worker ray-worker-34a60559-da51-49eb-9c36-d58267921fec...
Waiting for worker ray-worker-339ce231-fa5d-43f9-aa16-6d229ae467bb...


### change the logging level of spark


In [4]:
from pyspark import SparkContext, SparkConf
# Create a SparkContext whose web UI is pinned to port 8788.
conf = SparkConf().set('spark.ui.port', '8788')
sc = SparkContext(conf=conf)
# Only log ERROR and above. setLogLevel is the public SparkContext API for this
# (Spark's own startup message recommends it), so there is no need to reach into
# the private sc._jvm gateway and manipulate the log4j root logger by hand.
sc.setLogLevel('ERROR')


21/12/27 21:55:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/27 21:56:18 WARN HttpParser: Header is too large 8193>8192


### start spark session 

In [5]:
# Launch a Spark session named 'example' through RayDP:
# 2 executors, each with 4 cores and 4G of memory.
spark = raydp.init_spark(
    'example',
    num_executors=2,
    executor_cores=4,
    executor_memory='4G',
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/conda/lib/python3.8/site-packages/ray/jars/ray_dist.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/conda/lib/python3.8/site-packages/pyspark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]


2021-12-27 21:57:46 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### read tsv data from s3

In [10]:
# Load the tab-separated IMDB reviews from S3; the first line of the file is the header row.
ds = (
    spark.read
    .options(sep='\t', header=True)
    .csv('s3a://d2v-tmp/demo/bach_inference/data/imdb_reviews.tsv')
)

In [11]:
# Preview the loaded rows; long string values are truncated in the printed table.
ds.show()

+-------+---------+--------------------+
|     id|sentiment|              review|
+-------+---------+--------------------+
| 5814_8|        1|With all this stu...|
| 2381_9|        1|"The Classic War ...|
| 7759_3|        0|The film starts w...|
| 3630_4|        0|It must be assume...|
| 9495_8|        1|Superbly trashy a...|
| 8196_8|        1|I dont know why p...|
| 7166_2|        0|This movie could ...|
|10633_1|        0|I watched this vi...|
|  319_1|        0|A friend of mine ...|
|8713_10|        1|<br /><br />This ...|
| 2486_3|        0|What happens when...|
|6811_10|        1|Although I genera...|
|11744_9|        1|"Mr. Harvey Light...|
| 7369_1|        0|I had a feeling t...|
|12081_1|        0|note to George Li...|
| 3561_4|        0|Stephen King adap...|
| 4489_1|        0|`The Matrix' was ...|
| 3951_2|        0|Ulli Lommel's 198...|
|3304_10|        1|This movie is one...|
|9352_10|        1|Most people, espe...|
+-------+---------+--------------------+
only showing top 20 rows

### do some cleaning 

In [12]:
# Discard every row that contains a null in any column, then report how many rows remain.
ds = ds.na.drop()
ds.count()

                                                                                

25000

In [13]:
# Strip HTML tags: every '<...>' span in `review` is replaced with the empty string
# and the result is stored in a new `review_clean` column.
from pyspark.sql.functions import col, udf,regexp_replace,isnull
ds = ds.withColumn(
    "review_clean",
    regexp_replace(col('review'), '<[^>]+>', ''),
)
# Show the first 5 rows to compare the raw and cleaned text side by side.
ds.show(5)

+------+---------+--------------------+--------------------+
|    id|sentiment|              review|        review_clean|
+------+---------+--------------------+--------------------+
|5814_8|        1|With all this stu...|With all this stu...|
|2381_9|        1|"The Classic War ...|"The Classic War ...|
|7759_3|        0|The film starts w...|The film starts w...|
|3630_4|        0|It must be assume...|It must be assume...|
|9495_8|        1|Superbly trashy a...|Superbly trashy a...|
+------+---------+--------------------+--------------------+
only showing top 5 rows



### save cleaned data to parquet on s3 

In [14]:
# Persist the cleaned reviews as Parquet on S3.
# mode("overwrite") replaces any previous export at this path, so re-running the
# notebook does not fail because the path already exists. Every other failure
# (credentials, network, permissions) is allowed to raise rather than being
# silently swallowed, because the read-back step that follows would otherwise
# pick up stale or missing data without any warning.
ds.write.mode("overwrite").parquet("s3a://d2v-tmp/demo/bach_inference/data/imdb_reviews_clean.parquet")

### read back parquet data with pandas to do downstream tasks

In [16]:
import pandas as pd
# Read the Parquet export back with pandas. Note the s3:// scheme here, whereas
# the Spark writer was given an s3a:// URL for the same bucket and key.
df = pd.read_parquet(
    "s3://d2v-tmp/demo/bach_inference/data/imdb_reviews_clean.parquet"
)
# Print (rows, columns), then display the first two records.
print(df.shape)
df.head(n=2)

(25000, 4)


Unnamed: 0,id,sentiment,review,review_clean
0,5814_8,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin...","""The Classic War of the Worlds"" by Timothy Hin..."


In [8]:
# Shut down the Ray workers using the handle returned by initialize_ray_cluster().
# NOTE(review): the Spark session created with raydp.init_spark() and the SparkContext `sc`
# are never stopped explicitly -- consider raydp.stop_spark() / sc.stop() before this call; confirm.
stop_ray_cluster(ray_cluster)

Deleting ray-worker-2f43d397-d6d4-4149-ace5-2dec9f63bc51
Deleting ray-worker-3ce5b2e4-22ca-4750-8385-a6fba39cdf64


In [7]:
# Use this in case you forgot to stop your workers: it prints the Ray worker pods
# that still exist (name, status and IP address in the recorded output).
# NOTE(review): the execution counts show this cell ran before the stop_ray_cluster
# cell; what the returned value `w` contains is not visible here -- confirm before relying on it.
w = find_ray_workers()

ray-worker-2f43d397-d6d4-4149-ace5-2dec9f63bc51	Running	10.1.91.3
ray-worker-3ce5b2e4-22ca-4750-8385-a6fba39cdf64	Running	10.1.92.3
