In [1]:
import os
import sys, csv
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import datediff, col, abs
from pyspark import SparkConf

### Dependency - create the keyspace and tables in Cassandra

> create keyspace test with replication = {'class': 'SimpleStrategy', 'replication_factor': 1};


> CREATE TABLE test.previous_employees_by_job_title (
    job_title text,
    employee_id uuid,
    employee_name text,
    first_day timestamp,
    last_day timestamp,
    PRIMARY KEY (job_title, employee_id)
) WITH CLUSTERING ORDER BY (employee_id ASC);

> CREATE TABLE test.days_worked_by_previous_employees_by_job_title (
    job_title text,
    employee_id uuid,
    employee_name text,
    days_worked int,
    PRIMARY KEY (job_title, employee_id)
) WITH CLUSTERING ORDER BY (employee_id ASC);

In [3]:
# Connection settings for the Spark master and the Cassandra node.
# Each value can be overridden through an environment variable so that hosts
# and credentials do not have to be edited (or committed) in the notebook.
# The fallbacks are the values that were previously hard-coded in this cell,
# so running without any of the variables set behaves exactly as before.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://192.168.208.7:7077")
CASSANDRA_HOST = os.environ.get("CASSANDRA_HOST", "192.168.208.2")
CASSANDRA_PORT = os.environ.get("CASSANDRA_PORT", "9042")
CASSANDRA_USERNAME = os.environ.get("CASSANDRA_USERNAME", "cassandra")
CASSANDRA_PASSWORD = os.environ.get("CASSANDRA_PASSWORD", "cassandra")

# Build (or reuse) the session used by every later cell. The Cassandra
# connector is requested via spark.jars.packages.
spark = (
    SparkSession.builder
    .master(SPARK_MASTER_URL)
    .appName("extract_and_load")
    .config("spark.cassandra.connection.host", CASSANDRA_HOST)
    .config("spark.cassandra.connection.port", CASSANDRA_PORT)
    .config("spark.cassandra.auth.username", CASSANDRA_USERNAME)
    .config("spark.cassandra.auth.password", CASSANDRA_PASSWORD)
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.1.0")
    .getOrCreate()
)

#.config("spark.jars", "/opt/bitnami/spark/jars/spark-cassandra-connector-2.4.0-s_2.11.jar") \
#.config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.1.0") \
#.config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector_2.12:3.2.0-beta,com.datastax.cassandra:cassandra-driver-core:3.11 spark-cassandra-connector-assembly-1.1.1-SNAPSHOT.jar")


In [4]:
def load_and_get_table_df(keys_space_name, table_name, spark_session=None):
    """Load a Cassandra table as a Spark DataFrame.

    Args:
        keys_space_name: Name of the Cassandra keyspace that holds the table.
        table_name: Name of the table to read.
        spark_session: Session used for the read. When omitted, the
            notebook-level ``spark`` session is used, so existing
            two-argument calls keep working unchanged.

    Returns:
        Whatever the ``org.apache.spark.sql.cassandra`` reader's ``load()``
        returns for the requested table.
    """
    # Fall back to the notebook-global session only when none is injected;
    # passing one explicitly removes the dependency on cell execution order.
    session = spark if spark_session is None else spark_session
    reader = session.read.format("org.apache.spark.sql.cassandra")
    return reader.options(table=table_name, keyspace=keys_space_name).load()

#### Write the DataFrame to Cassandra

In [8]:
# Read the employee export with the CSV reader; the "header" option is set to "true".
csv_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/usr/local/spark/data/previous_employees_by_job_title.txt")
)

In [9]:
# Project the CSV data onto the columns of test.previous_employees_by_job_title.
write_df = csv_df.select(
    "job_title",
    "employee_id",
    "employee_name",
    "first_day",
    "last_day",
)

In [10]:
# Append the selected CSV columns to test.previous_employees_by_job_title
# through the Cassandra data source.
(
    write_df.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .options(keyspace="test", table="previous_employees_by_job_title")
    .save()
)

#### Read from Cassandra, process the data, and write the results back

In [5]:
# Read the table back from Cassandra and print a preview of its rows.
load_and_get_table_df(
    keys_space_name="test",
    table_name="previous_employees_by_job_title",
).show()

+-----------------+--------------------+----------------+-------------------+-------------------+
|        job_title|         employee_id|   employee_name|          first_day|           last_day|
+-----------------+--------------------+----------------+-------------------+-------------------+
|Food Technologist|0004db76-10fe-447...|Benjamin Hancock|2001-12-08 14:22:43|2007-03-23 06:57:42|
|Food Technologist|0050ce52-8282-4e9...|        Rae Roth|2007-11-27 09:31:08|2011-05-11 14:32:45|
|Food Technologist|00c428b1-0931-450...|  Domenic Russel|2019-08-09 05:07:32|2000-12-08 20:34:54|
|Food Technologist|00f6d71a-4e0c-4c3...|    Tom Villiger|2012-12-07 22:24:32|2017-02-01 16:37:06|
|Food Technologist|00f71b80-a9bd-48a...|    Barney Tyler|2018-12-17 13:45:35|2013-01-04 16:05:04|
|Food Technologist|0177f92b-ce0f-4d2...|     Peter Reese|2008-12-30 16:16:16|2003-01-26 19:34:52|
|Food Technologist|01842933-ab9d-432...|   Sabrina Logan|2005-02-03 12:08:51|2017-07-15 03:25:17|
|Food Technologist|0

In [11]:
# Register the connector's CassandraCatalog under the catalog name "cassandra"
# and switch to its "test" keyspace, so the later spark.sql() query can refer
# to the table by its bare name.
# (The catalog key is a plain string: the original f-string had no placeholders.)
spark.conf.set("spark.sql.catalog.cassandra", "com.datastax.spark.connector.datasource.CassandraCatalog")
spark.sql("use cassandra.test")

DataFrame[]

In [12]:
# days_worked is the absolute day difference between last_day and first_day,
# so rows whose dates are stored in reverse order still yield a positive count.
# The adjacent literals concatenate into the same single-line SQL statement.
calcDF = spark.sql(
    "select job_title, employee_id, employee_name, "
    "abs(datediff(last_day, first_day)) as days_worked "
    "from previous_employees_by_job_title"
)

In [13]:
# Preview the computed days_worked values before they are written to Cassandra.
calcDF.show()

+----------+--------------------+-----------------+-----------+
| job_title|         employee_id|    employee_name|days_worked|
+----------+--------------------+-----------------+-----------+
|Accountant|0016165e-6d9a-4f9...|     Enoch Parker|        474|
|Accountant|006b3b3e-fe15-4b1...|    Clint Michael|       4541|
|Accountant|0076bb2e-fd33-4e0...|     Chuck Clarke|       1405|
|Accountant|007beac9-e2ef-44c...|   Manuel Mcneill|       4204|
|Accountant|00d043fb-984c-450...|       Hank Avery|        971|
|Accountant|00d2ebd0-748f-484...|      Renee Riley|       1284|
|Accountant|00dceecd-e0db-49c...|   Lindsay Ingham|       1348|
|Accountant|00e69c60-f34d-44b...|    Javier Bolton|       1868|
|Accountant|010a2d77-c6ac-4c5...|       Mark James|       2928|
|Accountant|01361364-2de0-482...|  Crystal Ashwell|       2542|
|Accountant|01638366-d495-445...|       Leroy Ross|       1570|
|Accountant|01d087b7-1705-4d9...|Kimberly Matthews|       1583|
|Accountant|01d2cece-1d8f-466...|  Rosal

In [14]:
# Append the computed rows to test.days_worked_by_previous_employees_by_job_title
# through the Cassandra data source.
(
    calcDF.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .options(keyspace="test", table="days_worked_by_previous_employees_by_job_title")
    .save()
)

In [15]:
# Stop the Spark session; this is the final cell of the notebook.
spark.stop()