In [None]:
import os
import pyspark
import datetime

from pyspark.sql.functions import *
from datetime import *
from delta.tables import *
from pyspark.sql.window import Window
from notebookutils import mssparkutils
from pyspark.sql import functions as F, SparkSession

# Main Class

## Class Construct Path

In [None]:
class PathConstructor:
    """Build an ``abfss://`` URI from positional path components.

    The first positional argument fills the part before ``@``, the second
    fills the host prefix in front of ``.dfs.core.windows.net`` and every
    further argument is appended as one ``/``-separated path segment.
    """

    def __init__(self,*args):
        # Raw tuple of the positional components, in the order given.
        self.filepath = args

    def pathconstructor(self):
        """Return ``abfss://<arg0>@<arg1>.dfs.core.windows.net[/<arg2>/...]``.

        Components are rendered with ``%s`` formatting, so non-string values
        (e.g. integers) are converted with ``str()``.

        Raises:
            TypeError: if ``self.filepath`` is not a tuple/list, or if it
                holds fewer than two components.
        """
        parts = self.filepath
        if not isinstance(parts, (tuple, list)):
            raise TypeError(
                "PathConstructor.filepath must be a tuple or list of path components"
            )
        # Unpacking instead of len(): this file star-imports
        # pyspark.sql.functions, which can shadow builtins such as len/min/max
        # depending on the installed PySpark version.
        try:
            container, account, *segments = parts
        except ValueError:
            raise TypeError(
                "PathConstructor needs at least two components "
                "(the part before '@' and the account host prefix)"
            ) from None

        root = "abfss://%s@%s.dfs.core.windows.net" % (container, account)
        # (segment,) keeps a tuple-valued segment from being unpacked by '%'.
        return root + "".join("/%s" % (segment,) for segment in segments)

#

## Class Reading File

In [None]:
class ReadFile:
    """Read a path into a DataFrame through the notebook's ``spark`` session.

    ``spark`` is not defined in this class; it is looked up as a global at
    call time (presumably the session injected by the notebook runtime).
    """

    def __init__(self, path, file_format, delimiter, with_header):
        self.path, self.file_format = path, file_format
        self.delimiter, self.with_header = delimiter, with_header

    def _reader(self, source_format):
        """Start a reader for ``source_format`` with the ``header`` option set."""
        return spark.read.format(source_format).option("header", self.with_header)

    def readfrompath(self):
        """Load ``self.path`` as ``self.file_format`` with schema inference on."""
        reader = self._reader(self.file_format)
        reader = reader.option('delimiter', self.delimiter)
        reader = reader.option("inferSchema", "true")
        return reader.load(self.path)

    def readfrompath_withschema(self, schema):
        """Load ``self.path`` as ``self.file_format`` using the given ``schema``."""
        reader = self._reader(self.file_format)
        reader = reader.option('delimiter', self.delimiter)
        reader = reader.schema(schema)
        return reader.load(self.path)

    def readfrompath_excel(self, sheetInformation):
        """Load ``self.path`` via ``com.crealytics.spark.excel``.

        ``sheetInformation`` is passed as the ``dataAddress`` option and the
        schema is inferred.
        """
        reader = self._reader("com.crealytics.spark.excel")
        reader = reader.option("inferSchema", "true")
        reader = reader.option("dataAddress", sheetInformation)
        return reader.load(self.path)

    def readfrompath_excel_withschema(self, sheetInformation, schema):
        """Load ``self.path`` via ``com.crealytics.spark.excel`` with ``schema``.

        ``inferSchema`` is still set to ``"true"`` before the explicit schema
        is applied, exactly as the reader chain has always been built.
        """
        reader = self._reader("com.crealytics.spark.excel")
        reader = reader.option("inferSchema", "true")
        reader = reader.option("dataAddress", sheetInformation)
        reader = reader.schema(schema)
        return reader.load(self.path)

## Class Filtering Data

In [None]:
class GetDelta:
    """Keep the top-ranked flagged row(s) per partition key of a DataFrame.

    Attributes:
        data: input DataFrame.
        partiotionCol: iterable of column names used as the window partition
            (the attribute name is spelled this way throughout; kept for
            callers).
        dropCol: iterable of column names dropped right after filtering.
        flagCol: name of the flag column; it is renamed to ``change_flag``.
        orderByCol: column used (descending) to rank rows inside a partition.
        changeFlag: collection of flag values to keep.
    """

    def __init__(self,data,partiotionCol,dropCol,flagCol,orderByCol,changeFlag):
        self.data = data
        self.partiotionCol = partiotionCol
        self.dropCol = dropCol
        self.flagCol = flagCol
        self.orderByCol = orderByCol
        self.changeFlag = changeFlag

    def GetDelta(self):
        """Return the filtered, de-duplicated DataFrame.

        Steps: keep rows whose ``flagCol`` is in ``changeFlag``; drop
        ``dropCol``; rename ``flagCol`` to ``change_flag``; rank rows per
        partition by ``orderByCol`` descending with ``dense_rank`` and keep
        rank 1; finally drop ``orderByCol`` and the helper ``rank`` column.
        Because ``dense_rank`` is used, rows tied on ``orderByCol`` within a
        partition are all kept.
        """
        flagged = (
            self.data
            .where(F.col(self.flagCol).isin(self.changeFlag))
            .drop(*self.dropCol)
            .withColumnRenamed(self.flagCol, 'change_flag')
        )

        # The ordering must be part of the window specification. Previously
        # .orderBy() was chained onto the Column returned by .over(), which is
        # not a valid call and left dense_rank without an ordered window.
        latest_first = (
            Window
            .partitionBy(*self.partiotionCol)
            .orderBy(F.desc(self.orderByCol))
        )

        return (
            flagged
            .withColumn('rank', F.dense_rank().over(latest_first))
            .where('rank == 1')
            .drop(self.orderByCol, 'rank')
        )

## Class Merging Data

In [None]:
class MergingConstructor:
    """Build ``MERGE INTO old_data USING new_data`` SQL statements.

    Attributes:
        keyList: column names joined (with ``AND``) in the ``ON`` clause.
        updateList: column names copied from ``new_data`` to ``old_data`` in
            the ``WHEN MATCHED THEN UPDATE SET`` clause of the upsert. May be
            empty or ``None``, in which case the upsert only inserts.

    Column names are interpolated into the SQL text as-is; they are expected
    to come from trusted configuration, not from user input.
    """

    def __init__(self,keyList,updateList):
        self.keyList = keyList
        self.updateList = updateList

    def _merge_head(self):
        """Return ``MERGE INTO old_data USING new_data ON <k1> AND <k2> ...``."""
        keys = list(self.keyList or [])
        if not keys:
            # An empty ON clause can never be valid SQL; fail early and clearly.
            raise ValueError('keyList must contain at least one join column')
        on_clause = ' AND '.join(
            'old_data.%s = new_data.%s' % (key, key) for key in keys
        )
        return 'MERGE INTO old_data USING new_data ON ' + on_clause

    def Construct_Upsert(self):
        """Return the upsert statement.

        Matched rows are updated for the columns in ``updateList`` (the clause
        is omitted when ``updateList`` is empty); unmatched rows are inserted
        with ``INSERT *``.

        Raises:
            ValueError: if ``keyList`` is empty.
        """
        statement = self._merge_head()
        update_cols = list(self.updateList or [])
        if update_cols:
            assignments = ', '.join(
                'old_data.%s = new_data.%s' % (col, col) for col in update_cols
            )
            statement += ' WHEN MATCHED THEN UPDATE SET ' + assignments
        return statement + ' WHEN NOT MATCHED THEN INSERT *'

    def Construct_Delete(self):
        """Return the statement that deletes rows of ``old_data`` matched by key.

        Raises:
            ValueError: if ``keyList`` is empty.
        """
        return self._merge_head() + ' WHEN MATCHED THEN DELETE'


## Merge Process Class

In [None]:
class MergeClass:
    """Run a merge statement against two DataFrames exposed as temp views.

    ``df_old`` is registered as ``old_data`` and ``df_new`` as ``new_data``;
    the statement is executed through the global ``spark`` object, which this
    class does not define (presumably the notebook's session).
    """

    def __init__(self,df_old,df_new):
        self.df_old = df_old
        self.df_new = df_new

    def merge(self,mergeString):
        """Execute ``mergeString`` if ``df_new`` has at least one row.

        Errors raised by ``spark.sql`` are caught and printed rather than
        propagated (best-effort behaviour kept from the original).

        Returns:
            bool: ``True`` when the statement ran without raising; ``False``
            when there was no new data or the statement failed.
        """
        # limit(1) keeps the emptiness probe from counting every row.
        if self.df_new.limit(1).count() == 0:
            print('No new data')
            return False

        # createOrReplaceTempView is the supported replacement for the
        # deprecated registerTempTable and registers the same kind of view.
        self.df_old.createOrReplaceTempView('old_data')
        self.df_new.createOrReplaceTempView('new_data')
        try:
            print(mergeString)
            spark.sql(mergeString)
        except Exception as e:
            print('Error')
            print('Error Happened: ',e)
            return False
        print('Merge Success')
        return True

## Class Directory

In [None]:
class DirectoryClass:
    """File-system helpers bound to one path, delegating to ``mssparkutils.fs``."""

    def __init__(self,path):
        self.path = path

    def isDirExist(self):
        """Return ``True`` if listing ``self.path`` succeeds, ``False`` on any exception."""
        try:
            mssparkutils.fs.ls(self.path)
        except Exception:
            return False
        else:
            return True

    def CreateDir(self):
        """Call ``mssparkutils.fs.mkdirs`` on ``self.path``."""
        mssparkutils.fs.mkdirs(self.path)

    def RemoveDir(self,additional_path):
        """Call ``mssparkutils.fs.rm`` on ``self.path`` or on a child of it.

        An empty ``additional_path`` targets ``self.path`` itself; anything
        else is appended after a ``/``.
        """
        target = self.path if additional_path == '' else self.path + '/' + additional_path
        mssparkutils.fs.rm(target)

    def CopyDirectory(self,sourcePath,isRecurse:bool):
        """Copy ``sourcePath`` to ``self.path`` via ``mssparkutils.fs.cp``."""
        destination = self.path
        mssparkutils.fs.cp(sourcePath,destination,isRecurse)

    def MoveDirectory(self,sourcePath,isRecurse:bool):
        """Move ``sourcePath`` to ``self.path`` via ``mssparkutils.fs.mv``."""
        destination = self.path
        mssparkutils.fs.mv(sourcePath,destination,isRecurse)

## Config Date Class

In [None]:
class Config_Date:
    """Format the current date/time (``datetime.now()``) as compact strings.

    ``datetime`` here is the class made available by the file's
    ``from datetime import *`` line, not the module.
    """

    def date_YYYYMM(self):
        """Return the current year and month as a ``'%Y%m'`` string, e.g. ``'202401'``."""
        return datetime.now().strftime('%Y%m')

    def date_YYYYMMDD(self):
        """Return the current date as a ``'%Y%m%d'`` string, e.g. ``'20240131'``."""
        return datetime.now().strftime('%Y%m%d')

    # The method was originally published under this misspelled name (three
    # Y's); keep it as an alias so existing callers continue to work.
    date_YYYMMDD = date_YYYYMMDD


## Class Partition

In [None]:
class Partition:
    """Produce recent year / year-month partition values.

    Both methods de-duplicate through a Spark DataFrame and therefore return
    whatever ``DataFrame.collect()`` yields (a list of rows). ``spark`` is a
    global not defined in this class (presumably the notebook's session).
    """

    def partition_YYYYMM(self):
        """Return the distinct ``(year, month)`` pairs of the last 30 days.

        The 30 days are today and the 29 days before it.
        """
        # Take "now" once so every offset is measured from the same instant.
        today = datetime.now()
        days = (today - timedelta(days=offset) for offset in range(30))
        range_list = [[day.year, day.month] for day in days]
        schema_lst = ['year','month']
        return spark.createDataFrame(range_list,schema_lst).dropDuplicates().collect()

    def partition_YYYY(self):
        """Return the distinct years of today and of the day 365 days ago."""
        today = datetime.now()
        year_ago = today - timedelta(days=365)
        # Store the integer year, not the datetime itself: the column is named
        # 'year', and de-duplication only makes sense on the year value (the
        # two dates can share a year, e.g. 31 Dec of a leap year).
        range_list = [[today.year], [year_ago.year]]
        schema_lst = ['year']
        return spark.createDataFrame(range_list,schema_lst).dropDuplicates().collect()