# Example: Reading from and Writing to Redis with Spark

Documentation: https://github.com/RedisLabs/spark-redis/

NOTE: Spark DataFrame integration is limited to Redis hashes. No other Redis data structures are supported with Spark DataFrames.

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Redis connection settings: host name and port of the Redis server.
# The port is kept as a string because it is passed straight into a Spark config value.
redis_host, redis_port = "redis", "6379"

In [3]:
# Create (or reuse) a local Spark session for this notebook.
# The session is pointed at the Redis server via spark.redis.host / spark.redis.port,
# and spark.jars.packages requests the spark-redis connector by its Maven coordinates.
spark = (
    SparkSession.builder
    .master("local")
    .appName("jupyter-pyspark")
    .config("spark.redis.host", redis_host)
    .config("spark.redis.port", redis_port)
    .config("spark.jars.packages", "com.redislabs:spark-redis_2.12:3.1.0")
    .getOrCreate()
)

# Keep a handle on the SparkContext and cut log output down to errors only.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [4]:
# Load the sample stock data from a local JSON file.
# The "multiline" option is enabled for this file.
df = (
    spark.read
    .option("multiline", "true")
    .json("/home/jovyan/datasets/json-samples/stocks.json")
)
# Last expression of the cell: render the DataFrame as a pandas table.
df.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.12,AMZN
2,251.11,FB
3,1725.05,GOOG
4,128.39,IBM
5,212.55,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.8,TSLA
9,45.11,TWTR


In [5]:
# Write the DataFrame back to Redis as hashes under the table name "stocks",
# using the "symbol" column as the key column and the "overwrite" save mode.
(
    df.write
    .format("org.apache.spark.sql.redis")
    .mode("overwrite")
    .option("table", "stocks")
    .option("key.column", "symbol")
    .save()
)

In [6]:
# Read the "stocks" table back from Redis into a new DataFrame,
# naming "symbol" as the key column just as the write did.
df1 = (
    spark.read
    .format("org.apache.spark.sql.redis")
    .option("table", "stocks")
    .option("key.column", "symbol")
    .load()
)
# Last expression of the cell: render the result as a pandas table.
df1.toPandas()

Unnamed: 0,price,symbol
0,212.55,MSFT
1,1725.05,GOOG
2,823.8,TSLA
3,497.0,NFLX
4,3098.12,AMZN
5,126.82,AAPL
6,78.0,NET
7,128.39,IBM
8,45.11,TWTR
9,251.11,FB


In [12]:
# Read Redis hashes that were NOT written by Spark.
# Instead of a table name, keys are selected with the pattern "user:*";
# "userid" is named as the key column and schema inference is switched on.
# Note: this rebinds df1, replacing the DataFrame read from the "stocks" table.
df1 = (
    spark.read
    .format("org.apache.spark.sql.redis")
    .option("keys.pattern", "user:*")
    .option("key.column", "userid")
    .option("infer.schema", True)
    .load()
)
# Last expression of the cell: render the result as a pandas table.
df1.toPandas()

Unnamed: 0,name,userid
0,bill,2
1,mike,1
