In [1]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col
import os
import subprocess


In [2]:
# Build (or reuse) the local Spark session for this notebook: four local
# worker threads, application name "agent", block updates included in the
# event log.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("agent")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Handle to the underlying SparkContext, used later for RDD reads and shutdown.
sc = spark.sparkContext

In [3]:
# Set the REPL eager-evaluation column width option to 1000 on the session.
# NOTE(review): this option presumably only matters when
# spark.sql.repl.eagerEval.enabled is true, which no visible cell sets — confirm.
spark.conf.set("spark.sql.repl.eagerEval.maxColWidth", 1000)

In [4]:
# Locate the most recently modified file directly under /user/sales_agent
# using the Hadoop FileSystem API exposed through the Spark JVM gateway.
hadoop_fs = spark._jvm.org.apache.hadoop.fs
fs = hadoop_fs.FileSystem.get(spark._jsc.hadoopConfiguration())
list_status = fs.listStatus(hadoop_fs.Path("/user/sales_agent"))

# (path, modification time) pairs for plain files only, newest first.
files = sorted(
    (
        (status.getPath().toString(), status.getModificationTime())
        for status in list_status
        if status.isFile()
    ),
    key=lambda entry: entry[1],
    reverse=True,
)

# None signals that the directory contained no files.
if files:
    latest_file = files[0][0]
else:
    latest_file = None


In [5]:
# Load the newest sales-agent CSV; the first row is the header and column
# types are inferred from the data.
df_sales_agents = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(latest_file)
)

In [6]:
# Preview the rows loaded from latest_file in the cell output.
df_sales_agents.show()

+---------------+------------------+-------------------+
|sales_person_id|              name|          hire_date|
+---------------+------------------+-------------------+
|              1|          John Doe|2020-06-03 00:00:00|
|              2|        Jane Smith|2018-05-13 00:00:00|
|              3|   Michael Johnson|2021-10-03 00:00:00|
|              4|       Emily Brown|2020-10-25 00:00:00|
|              5|      David Wilson|2021-04-08 00:00:00|
|              6|       Emma Taylor|2019-03-28 00:00:00|
|              7|Christopher Miller|2020-01-11 00:00:00|
|              8|      Olivia Davis|2021-10-24 00:00:00|
|              9|   Daniel Martinez|2018-10-08 00:00:00|
|             10|      Sophia Moore|2019-05-25 00:00:00|
+---------------+------------------+-------------------+



In [7]:
# Resolve the file recorded by the previous run from the HDFS checkpoint.
# latest_processed_file ends up as "" when there is no usable checkpoint.
checkpoint_path = "/user/checkpoint/checkpoint_sales_agent.txt"

# A non-zero exit status from `hdfs dfs -test -e` is treated as
# "checkpoint file does not exist".
result = subprocess.run(['hdfs', 'dfs', '-test', '-e', checkpoint_path])
if result.returncode != 0:
    latest_processed_file = ""
else:
    # Reuse checkpoint_path so the existence test and the read cannot drift apart.
    rdd = sc.textFile(checkpoint_path)
    # A single collect() replaces count() + take(count()): one Spark job
    # instead of two, and an empty checkpoint file yields [] here instead of
    # raising IndexError on [-1].
    checkpoint_lines = rdd.collect()
    latest_processed_file = checkpoint_lines[-1] if checkpoint_lines else ""
    print(latest_processed_file)

In [8]:
# Echo the value resolved by the checkpoint lookup (an empty string when no
# checkpoint file exists).
print(latest_processed_file)




In [9]:
def write_checkpoint(checkpoint_path, latest_file):
    """Record ``latest_file`` as the content of the HDFS checkpoint file.

    The value is written to a uniquely named local scratch file, which is
    then uploaded with ``hdfs dfs -put -f`` (overwriting any existing
    checkpoint). The scratch file is always removed afterwards.

    Args:
        checkpoint_path: HDFS path of the checkpoint file to (over)write.
        latest_file: Text to store in the checkpoint (the processed file path).

    Raises:
        subprocess.CalledProcessError: If ``hdfs dfs -mkdir`` or
            ``hdfs dfs -put`` exits with a non-zero status, so a checkpoint
            that was not written is never silently ignored.
    """
    # Local import keeps this cell self-contained without touching the import cell.
    import tempfile

    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        # `-mkdir -p` also succeeds when the directory already exists, so no
        # separate existence test is needed.
        subprocess.run(['hdfs', 'dfs', '-mkdir', '-p', checkpoint_dir], check=True)

    # A unique scratch file avoids collisions between concurrent runs that
    # would otherwise share one fixed /tmp path.
    tmp_file = tempfile.NamedTemporaryFile(
        mode='w', prefix='checkpoint_', suffix='.txt', delete=False, encoding='utf-8'
    )
    try:
        # Close the file before uploading so its content is flushed to disk.
        with tmp_file:
            tmp_file.write(latest_file)
        subprocess.run(
            ['hdfs', 'dfs', '-put', '-f', tmp_file.name, checkpoint_path],
            check=True,
        )
    finally:
        # Clean up even when the write or the upload fails.
        os.remove(tmp_file.name)

In [10]:
# Persist the newest sales-agent file to the silver layer as Parquet, unless
# the checkpoint shows that this exact file was already processed.
processed_path = "/user/silver/sales_agent/sales_agent.parquet"
processed_dir = "/user/silver/sales_agent"
checkpoint_path = "/user/checkpoint/checkpoint_sales_agent.txt"

if latest_processed_file == latest_file:
    # The checkpoint already ends with this path, so rewriting it is unnecessary.
    print(f"File already processed before: {latest_processed_file}")
else:
    # The existence test is informational only; the write below handles both cases.
    result = subprocess.run(['hdfs', 'dfs', '-test', '-e', processed_dir])
    if result.returncode == 0:
        print(f"File already exists in HDFS: {processed_dir}")

    # Overwrite mode replaces an existing dataset and also works when the
    # target does not exist yet, so no manual `hdfs dfs -mkdir` is needed.
    df_sales_agents.write.parquet(processed_path, mode="overwrite")

    # Advance the checkpoint only after the Parquet write has succeeded;
    # otherwise a failed write would mark the file as processed and the next
    # run would skip it.
    write_checkpoint(checkpoint_path, latest_file)

In [None]:
# Shut down the SparkContext now that the run is finished.
sc.stop()