In [1]:
import os
import sys, csv

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import datediff, col, abs

In [2]:
# Connection settings are read from environment variables so that cluster
# addresses and credentials do not have to be edited (or committed) in the
# notebook itself. Each fallback is the value this notebook originally
# hard-coded, so behavior is unchanged when no variable is set.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://172.20.0.9:7077")
CASSANDRA_HOST = os.environ.get("CASSANDRA_HOST", "172.20.0.11")
CASSANDRA_PORT = os.environ.get("CASSANDRA_PORT", "9042")
CASSANDRA_USERNAME = os.environ.get("CASSANDRA_USERNAME", "cassandra")
CASSANDRA_PASSWORD = os.environ.get("CASSANDRA_PASSWORD", "cassandra")

# Build (or reuse) the session used by every later cell. The Cassandra
# connector is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .master(SPARK_MASTER_URL)
    .appName("extract_and_load")
    .config("spark.cassandra.connection.host", CASSANDRA_HOST)
    .config("spark.cassandra.connection.port", CASSANDRA_PORT)
    .config("spark.cassandra.auth.username", CASSANDRA_USERNAME)
    .config("spark.cassandra.auth.password", CASSANDRA_PASSWORD)
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.1.0")
    .getOrCreate()
)

#.config("spark.cassandra.output.ifNotExists", "true") \
#.config("spark.jars", "/opt/bitnami/spark/jars/spark-cassandra-connector-2.4.0-s_2.11.jar") \
#.config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.1.0") \
#.config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector_2.12:3.2.0-beta,com.datastax.cassandra:cassandra-driver-core:3.11 spark-cassandra-connector-assembly-1.1.1-SNAPSHOT.jar")


In [3]:
def load_and_get_table_df(keys_space_name, table_name):
    """Load a Cassandra table as a Spark DataFrame.

    Reads through the notebook-level ``spark`` session using the
    ``org.apache.spark.sql.cassandra`` data source.

    Args:
        keys_space_name: Keyspace passed to the data source as ``keyspace``.
        table_name: Table passed to the data source as ``table``.

    Returns:
        The DataFrame returned by the reader's ``load()`` call.
    """
    cassandra_reader = spark.read.format("org.apache.spark.sql.cassandra")
    table_reader = cassandra_reader.options(table=table_name, keyspace=keys_space_name)
    return table_reader.load()

#### Write the DataFrame to Cassandra

In [4]:
# Read the employee file as CSV; the "header" option makes the first line
# supply the column names.
csv_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/usr/local/spark/data/previous_employees_by_job_title.txt")
)

In [5]:
# Project the CSV data down to the five columns that get written out.
write_df = csv_df.select(
    [
        "job_title",
        "employee_id",
        "employee_name",
        "first_day",
        "last_day",
    ]
)

In [6]:
# Append the selected rows to table "previous_employees_by_job_title" in
# keyspace "test" through the Cassandra data source.
(
    write_df.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .option("table", "previous_employees_by_job_title")
    .option("keyspace", "test")
    .save()
)

#### Read from Cassandra, process, and write the results back

In [7]:
# Read the table back from Cassandra and print its first rows.
(
    load_and_get_table_df("test", "previous_employees_by_job_title")
    .show()
)

+-----------+--------------------+---------------+-------------------+-------------------+
|  job_title|         employee_id|  employee_name|          first_day|           last_day|
+-----------+--------------------+---------------+-------------------+-------------------+
|Audiologist|003f0fba-e91f-4d8...|   David Holmes|2020-10-25 14:13:41|2017-06-27 12:53:05|
|Audiologist|0053b81d-d54d-485...|   Ember Carter|2000-11-12 15:19:13|2009-12-08 22:18:37|
|Audiologist|0064f13e-f2b4-4e0...|    Kieth Evans|2006-05-12 19:42:58|2002-08-15 20:36:58|
|Audiologist|0085d6e9-f34a-442...|  Deborah Boyle|2014-11-18 02:25:32|2017-06-19 09:07:47|
|Audiologist|008af4a2-51c7-466...| Sebastian Lunt|2003-11-14 06:48:28|2000-06-11 04:56:36|
|Audiologist|009c9b1f-b02d-43e...|  Vanessa Allen|2007-01-22 15:30:56|2011-03-09 15:24:39|
|Audiologist|00ac5ab1-d111-450...|Phillip Clayton|2000-12-17 02:38:48|2003-05-25 18:18:33|
|Audiologist|00e58f28-3295-4e9...| Rhea Partridge|2010-01-08 17:51:55|2019-11-05 16:06:52|

In [8]:
# Configure the database catalog: register the connector's CassandraCatalog
# under the catalog name "cassandra". (The key is a plain string; the original
# f-string prefix had no placeholders and was dropped.)
spark.conf.set(
    "spark.sql.catalog.cassandra",
    "com.datastax.spark.connector.datasource.CassandraCatalog",
)
# Switch to keyspace "test" in that catalog so that the SQL query later in the
# notebook can name its table without a qualifier.
spark.sql("use cassandra.test")

DataFrame[]

In [9]:
# days_worked is the absolute day difference between last_day and first_day,
# so the result is non-negative whichever of the two dates is later.
calcDF = spark.sql(
    "select job_title, employee_id, employee_name, "
    "abs(datediff(last_day, first_day)) as days_worked "
    "from previous_employees_by_job_title"
)

In [10]:
# Preview the computed rows (20 rows, long cell values truncated).
calcDF.show(n=20, truncate=True)

+-----------+--------------------+---------------+-----------+
|  job_title|         employee_id|  employee_name|days_worked|
+-----------+--------------------+---------------+-----------+
|Audiologist|003f0fba-e91f-4d8...|   David Holmes|       1216|
|Audiologist|0053b81d-d54d-485...|   Ember Carter|       3313|
|Audiologist|0064f13e-f2b4-4e0...|    Kieth Evans|       1366|
|Audiologist|0085d6e9-f34a-442...|  Deborah Boyle|        944|
|Audiologist|008af4a2-51c7-466...| Sebastian Lunt|       1251|
|Audiologist|009c9b1f-b02d-43e...|  Vanessa Allen|       1507|
|Audiologist|00ac5ab1-d111-450...|Phillip Clayton|        889|
|Audiologist|00e58f28-3295-4e9...| Rhea Partridge|       3588|
|Audiologist|00fe8832-54b0-445...|     Liam Brown|        292|
|Audiologist|01010970-3adb-4be...|   Monica Glass|        887|
|Audiologist|0102d14d-226d-4b2...| Sabina Ventura|       3294|
|Audiologist|0103df49-7a5a-4bf...|  Amelia Coates|       4322|
|Audiologist|01043c6b-fb99-445...|     Bryon Lane|     

In [11]:
# Append the computed rows to table
# "days_worked_by_previous_employees_by_job_title" in keyspace "test".
(
    calcDF.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .options(
        keyspace="test",
        table="days_worked_by_previous_employees_by_job_title",
    )
    .save()
)

In [12]:
# Stop the Spark session; this is the notebook's last cell.
spark.stop()