In [1]:
import sys, os, inspect, re
import functools
sys.path.append("/home/vbhargava/feature_test0/msaction_backend/common/BU3.0_core/util/Py_utils/taxonomy_utils")
import time, logging
import pandas as pd 
# Look up the numeric value of the 'INFO' level on the logging module (None if the attribute is missing).
numeric_level = getattr(logging, 'INFO', None)
# Emit log records on stdout rather than StreamHandler's default stream (stderr).
stdout_handler = logging.StreamHandler(sys.stdout)
# Configure the root logger once with a timestamped format and the stdout handler.
logging.basicConfig(level=numeric_level,
                        format='%(asctime)s %(levelname)s %(name)s: %(message)s',
                        handlers=[stdout_handler])

In [2]:
from libs.s3_ops import S3_OPs
from libs.s3_stream import S3Stream
from libs.configs import Config
from libs.nio_executor import NIO
from libs import utils
from libs import xml_writer 
from libs import decorator
from collections import defaultdict
#from model.models import Taxonomy_Grp

In [3]:
class Taxonomy_Grp:
    """One taxonomy group: its name, key/target column names and data location.

    Attributes:
        tg_name: Name of the taxonomy group.
        key_cols: List of key column names.
        target_cols: List of target column names.
        location: ``data_location`` joined with ``tg_name`` via ``os.path.join``.
    """

    def __init__(self, tg_name, key_cols=None, target_cols=None, data_location=''):
        """Build a group description.

        Args:
            tg_name: Name of the taxonomy group.
            key_cols: Key column names; defaults to a fresh empty list.
            target_cols: Target column names; defaults to a fresh empty list.
            data_location: Base location under which the group's data lives.
        """
        self.tg_name = tg_name
        # A new list per instance: a literal ``[]`` default would be shared by
        # every instance created without an explicit argument.
        self.key_cols = [] if key_cols is None else key_cols
        self.target_cols = [] if target_cols is None else target_cols
        self.location = os.path.join(data_location, tg_name)

    def get_dict(self):
        """Return the group as a dict; only the name when there are no target columns."""
        if len(self.target_cols) == 0:
            return {'tg_name': self.tg_name}

        return {'tg_name': self.tg_name,
                'key_cols': self.key_cols,
                'target_cols': self.target_cols,
                'location': self.location}

    def __str__(self):
        """Return a one-line summary; only the name when there are no target columns."""
        if len(self.target_cols) == 0:
            return 'tg_name: {}'.format(self.tg_name)
        return 'tg_name: {}, key_cols: {}, target_cols: {}, location: {}'.format(self.tg_name, self.key_cols, self.target_cols,self.location)


In [4]:
# Local path of the platform config XML; it is read below to obtain the S3 credentials.
# NOTE(review): absolute, user-specific path — consider making it configurable.
config = '/home/vbhargava/feature_test0/msaction_backend/customers/raj_ford_test/common/config/inputs/platform_config.xml'
# S3 prefix that is scanned for incoming (source) taxonomy CSV files.
lmt_src = 's3://qubole-ford/taxonomy_cs/test1/src/'
# S3 prefix of the current taxonomy data; its sub-directories are listed as taxonomy groups.
# The code concatenates onto this value directly, so the trailing '/' is required.
lmt_data = 's3://qubole-ford/taxonomy_cs/test1/data/'
# Directory and file name handed to xml_writer.generate_output_config.
config_input_loc = '/home/vbhargava/feature_test0/temp/taxo_config_xmls/'
config_file_name = 'test.xml'
# Version string handed to xml_writer.generate_output_config.
dn_version = '12.1'


In [5]:
# Load the platform config and pull the S3 credentials out of it.
# These two names are used as module-level globals by the S3 helpers below.
config_data = Config.get_qubole_config(config)
ACCESS_KEY=config_data['access_key']
SECRET_KEY=config_data['secret_key']

In [6]:
# All patterns are raw strings so that regex escapes such as `\-` and `\.` are
# not interpreted (and warned about) as Python string escapes.

# Captures the last path segment of a '/'-terminated prefix (the taxonomy group name).
TG_EXTRACT_REGEX = r'^.*?/([a-zA-Z]+\-?[0-9]*)/$'
# Captures the file name (last path segment) of a key ending in a literal '.csv'.
FILE_EXTRACT_REGEX = r'^.*/([a-zA-Z0-9.\-_]{0,255}\.csv)$'
# Captures a 'target_*' column name from a schema string (whole-string form).
TARGET_EXTRACT_REGEX = r'^.*,?(target_[A-Za-z0-9_-]+).*$'
# Captures every 'target_*' column name when used with re.findall.
TARGET_EXTRACT_2_REGEX = r'(target_[A-Za-z0-9_-]+)'
# Groups, in order: directory path, parent dir, file name, file group, date, client name.
# The extension must be a literal '.csv' (the dot is escaped and the 'v' is mandatory).
VALID_FILE_KEY_REGEX = r'^(.*/([a-zA-Z]+\-?[0-9]*)?/)?(([a-zA-Z]+\-?[0-9]*?)_([0-9]{4}-[0-9]{2}-[0-9]{2})_([a-zA-Z0-9.\-_]+?)\.csv)$'
# Column-name patterns: 'key_' / 'target_' prefix followed by 2-30 word characters.
KEY_REGEX = r'^[Kk]ey_[A-Za-z0-9_]{2,30}$'
TARGET_REGEX = r'^[Tt]arget_[A-Za-z0-9_]{2,30}$'

In [7]:
# Shared S3 operations object built from the loaded credentials; the functions below use it as a global.
s3_ops = S3_OPs(ACCESS_KEY, SECRET_KEY)

def filename_by_key(key):
    """Return the CSV file name captured from ``key`` by FILE_EXTRACT_REGEX.

    Delegates to get_val_by_regex, which raises when the key does not match.
    """
    return get_val_by_regex(key, FILE_EXTRACT_REGEX, error_msg="Not a valid key for taxonomy data csv file")

def find_by_data_tg(key, regex):
    """Return the value captured from ``key`` by ``regex`` (used for taxonomy data dirs).

    Delegates to get_val_by_regex, which raises when the key does not match.
    """
    return get_val_by_regex(key, regex, error_msg="Not a valid taxonomy data dir")

        
def get_val_by_regex(key, regex, error_msg="can't be extract a val."):
    """Return the first ``re.findall`` result of ``regex`` applied to ``key``.

    Args:
        key: String to search.
        regex: Pattern passed to ``re.findall``. With one capture group the
            result is a string; with several groups it is a tuple of strings.
        error_msg: Text used as the start of the error message on no match.

    Returns:
        The first element of ``re.findall(regex, key)``.

    Raises:
        ValueError: If the pattern does not match. ``ValueError`` subclasses
            ``Exception``, so ``except Exception`` handlers still catch it.
    """
    matched = re.findall(regex, key)
    if not matched:
        # Include the offending input and the pattern so the failure is
        # diagnosable from the log alone.
        raise ValueError('{} (input: {!r}, pattern: {!r})'.format(error_msg, key, regex))
    return matched[0]
        
def get_data_n_schema(tg, data_files_loc):
    """List the files under one taxonomy group location and fetch the first file's header.

    Args:
        tg: Taxonomy group name, used as the key in the returned dicts.
        data_files_loc: Location string handed to ``s3_ops.get_bucket_name``.

    Returns:
        ``{'schema': {tg: header}, 'files': {tg: files}}`` when files exist.
        When the location has no files both inner dicts are empty, so callers
        can always index ``'schema'`` and ``'files'`` without a KeyError.
    """
    loc_detail = s3_ops.get_bucket_name(data_files_loc)
    files = s3_ops.list_complete(loc_detail['bucket'], loc_detail['key'])
    res = {'schema': {}, 'files': {}}
    if len(files) > 0:
        s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
        # Only the first listed file is inspected for the header.
        first_file = s3_ops.get_full_s3_path(loc_detail['bucket'], files[0]['Key'])
        res['schema'][tg] = s3_stream.get_header(first_file)
        res['files'][tg] = files
    return res

def extract_schema(schema):
    """Normalise a schema string: remove every space character and lower-case it."""
    without_spaces = "".join(schema.split(" "))
    return without_spaces.lower()

def validate_schema(schema):
    """Validate a comma-separated header of key/target column names.

    Each column (after stripping whitespace) must match KEY_REGEX or
    TARGET_REGEX. At least one of each kind is required, and no name may
    appear twice when compared case-insensitively.

    Args:
        schema: Comma-separated column names.

    Returns:
        On success: ``{'IsValid': True, 'Schema': <spaces removed, lower-cased>,
        'TargetColsSet': set, 'KeyColsSet': set}``.
        On failure: ``{'IsValid': False, 'schema': schema, 'errors': str}``.
        The two early failures (empty schema, fewer than two columns) also
        keep the legacy ``'message'`` key, but every failure now carries
        ``'errors'`` because callers read that key.
    """
    if schema == '':
        msg = "Schema shouldn't be empty"
        return {'IsValid': False, 'schema': schema, 'message': msg, 'errors': msg}
    tokens = schema.split(',')
    if len(tokens) < 2:
        msg = "Schema should have at least 2 columns"
        return {'IsValid': False, 'schema': schema, 'message': msg, 'errors': msg}

    key_cnt = 0
    target_cnt = 0
    invalid_headers = []
    # Occurrences per lower-cased column name, in first-seen order.
    name_counts = {}
    target_cols_set = set()
    key_cols_set = set()
    for raw in tokens:
        col = raw.strip()
        if re.match(TARGET_REGEX, col):
            target_cnt += 1
            target_cols_set.add(col)
        elif re.match(KEY_REGEX, col):
            key_cnt += 1
            key_cols_set.add(col)
        else:
            invalid_headers.append(col)
        lowered = col.lower()
        name_counts[lowered] = name_counts.get(lowered, 0) + 1

    error_msgs = []
    if target_cnt < 1:
        error_msgs.append("At least one Target column is required!")
    if key_cnt < 1:
        error_msgs.append("At least one Key column is required!")
    if len(invalid_headers) > 0:
        error_msgs.append("All given columns should Key or Target!")
    for name, count in name_counts.items():
        if count > 1:
            error_msgs.append("Same name: {} should not represent more than one column in schema! cols names are case insensitive. ".format(name))

    if len(error_msgs) > 0:
        return {'IsValid': False, 'schema': schema, 'errors': " \n".join(error_msgs)}
    return {'IsValid': True, 'Schema': schema.replace(" ", "").lower(),
            'TargetColsSet': target_cols_set, 'KeyColsSet': key_cols_set}

In [8]:
@decorator.elapsed_time(func_name='extract_data_detail')
def extract_data_detail(lmt_src, lmt_data, access_key, secret_key):
    """Collect schema and file listings for every taxonomy group under ``lmt_data``.

    Args:
        lmt_src: Unused here; kept for interface compatibility.
        lmt_data: Data location; its sub-directories are treated as taxonomy
            groups. Must end with '/' because group paths are built by
            direct concatenation.
        access_key: Unused here; kept for interface compatibility.
        secret_key: Unused here; kept for interface compatibility.

    Returns:
        A 3-tuple ``(tg_data_schema_dict, tg_data_files_dict, target_data_tg_dict)``:
        group -> normalised schema string, group -> {file name -> listing entry},
        and target column name -> group.
    """
    data_loc = s3_ops.get_bucket_name(lmt_data)
    subdirs = s3_ops.list_subdirs(data_loc['bucket'], data_loc['key'],)

    # One [group name, group location] argument list per sub-directory; the
    # group name is extracted once and reused for both elements.
    tg_args = []
    for item in subdirs:
        tg_name = find_by_data_tg(item['Prefix'], TG_EXTRACT_REGEX)
        tg_args.append([tg_name, '{}{}/'.format(lmt_data, tg_name)])

    collected = NIO.decorated_run_io(task=get_data_n_schema, task_n_args_list=tg_args, max_workers=25,)

    tg_data_schema_dict = {}
    tg_data_files_dict = {}
    for item in collected:
        result = item['result']
        # .get(): a group directory without files may yield a result with no
        # 'schema'/'files' entries; such groups are simply skipped.
        for tg, schema in result.get('schema', {}).items():
            tg_data_schema_dict[tg] = extract_schema(schema)
        for tg, files in result.get('files', {}).items():
            tg_data_files_dict[tg] = {filename_by_key(entry['Key']): entry for entry in files}

    # If several groups declare the same target column, the last one iterated wins.
    target_data_tg_dict = {target: tg
                           for tg, schema in tg_data_schema_dict.items()
                           for target in re.findall(TARGET_EXTRACT_2_REGEX, schema)}

    return tg_data_schema_dict, tg_data_files_dict, target_data_tg_dict

In [9]:
def is_valid_file(key:str='', regex = VALID_FILE_KEY_REGEX):
    """Return True when ``key`` matches ``regex`` from its start, else False."""
    return re.match(regex, key) is not None

def extract_info(key:str='', regex = VALID_FILE_KEY_REGEX):
    """Split ``key`` into named parts using the capture groups of ``regex``.

    The first ``re.findall`` result is indexed positionally (0..5) and mapped
    to KeyDirPath, ParentDir, FileName, FileGrp, Date and ClientName.
    Raises IndexError when ``key`` does not match.
    """
    first_match = re.findall(regex, key)[0]
    field_names = ('KeyDirPath', 'ParentDir', 'FileName', 'FileGrp', 'Date', 'ClientName')
    return {name: first_match[pos] for pos, name in enumerate(field_names)}
def extract_info_with_bucket(key:str='', bucket = ''):
    """Return ``extract_info(key)`` with an extra ``'Bucket'`` entry set to ``bucket``."""
    info = extract_info(key)
    info['Bucket'] = bucket
    return info

In [10]:
def grouped_tg(collected, tg_files_dict_type='new_tg_files_dict'):
    """Merge per-task ``{group: {file name: file detail}}`` dicts into one dict.

    Reads ``item['result'][tg_files_dict_type]`` from every item of
    ``collected``, skipping empty ones. Later entries overwrite earlier ones
    for the same group and file name.
    """
    merged = defaultdict(dict)
    for item in collected:
        per_task = item['result'][tg_files_dict_type]
        if len(per_task) == 0:
            continue
        for group, files in per_task.items():
            for filename, file_dict in files.items():
                merged[group][filename] = file_dict
    return dict(merged)

def grouped_flag_dict(collected, flag_dict_type='schema_tg_dict'):
    """Merge per-task ``{key: value}`` dicts into ``{key: set of values}``.

    Reads ``item['result'][flag_dict_type]`` from every item of ``collected``,
    skipping empty ones, and adds each value to the set kept for its key.
    """
    merged = defaultdict(set)
    for item in collected:
        flags = item['result'][flag_dict_type]
        if len(flags) == 0:
            continue
        for name, value in flags.items():
            merged[name].add(value)
    return dict(merged)

def grouped_set_of_flags_dict(collected, flag_dict_type='schema_tg_dict'):
    """Merge per-task ``{key: iterable of values}`` dicts into ``{key: set of values}``.

    Reads ``item['result'][flag_dict_type]`` from every item of ``collected``,
    skipping empty ones, and unions each value collection into the set kept
    for its key.
    """
    merged = defaultdict(set)
    for item in collected:
        flags = item['result'][flag_dict_type]
        if len(flags) == 0:
            continue
        for name, values in flags.items():
            merged[name].update(values)
    return dict(merged)

def grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'):
    """Union the collections found at ``item['result'][flag_dict_type]`` into one set.

    Empty collections are skipped.
    """
    combined = set()
    for item in collected:
        flags = item['result'][flag_dict_type]
        if len(flags) > 0:
            combined.update(flags)
    return combined

In [11]:

def file_process_task(src_file_details):
    """Validate one source file's header and classify the file against existing data.

    The header is fetched with ``S3Stream.get_header`` and checked by
    ``validate_schema``. A valid file is classified as "existing" when its
    normalised schema equals the one stored for its group in the global
    ``tg_data_schema_dict``, otherwise as "new".

    Args:
        src_file_details: Dict with at least 'Bucket', 'Key', 'FileGrp' and
            'FileName'. A 'Schema' entry is added in place for valid files.

    Returns:
        Dict with the keys 'invalid_schema_files', 'schema_tg_dict',
        'target_tg_dict', 'new_tg_schema_dict', 'new_tg_files_dict' and
        'existing_tg_files_dict'. Entries that do not apply stay empty.
    """
    # {(file location, raw schema, error text), ...}
    invalid_schema_files = set()
    # {'key_evt_advertiser_key,target_evt_advertiser_name': 'tg'}
    schema_tg_dict = {}
    # {'target_evt_advertiser_name': 'tg'}
    target_tg_dict = {}
    # {'tg': 'key_evt_advertiser_key,target_evt_advertiser_name'}
    new_tg_schema_dict = {}
    # {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detail dict}}}
    new_tg_files_dict = {}
    # {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detail dict}}}
    existing_tg_files_dict = {}

    src_file_loc = s3_ops.get_full_s3_path(src_file_details['Bucket'], src_file_details['Key'])

    s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
    schema = s3_stream.get_header(src_file_loc)
    validate_res = validate_schema(schema)
    if validate_res['IsValid']:
        tg = src_file_details['FileGrp']
        file_name = src_file_details['FileName']
        normalised = validate_res['Schema']

        # A group that is unknown (None) or stored with a different schema is "new".
        if tg_data_schema_dict.get(tg) != normalised:
            new_tg_schema_dict[tg] = normalised
            new_tg_files_dict[tg] = {file_name: src_file_details}
        else:
            existing_tg_files_dict[tg] = {file_name: src_file_details}

        # Same dict object as stored above, so the stored entry gains 'Schema' too.
        src_file_details['Schema'] = normalised
        schema_tg_dict[normalised] = tg
        target_tg_dict = {target: tg for target in validate_res['TargetColsSet']}
    else:
        # validate_schema reports some failures under 'message' instead of
        # 'errors'; accept either so an empty or single-column header is
        # recorded as invalid rather than raising KeyError.
        errors = validate_res.get('errors', validate_res.get('message', ''))
        invalid_schema_files.add((src_file_loc, schema, errors))

    return {'invalid_schema_files': invalid_schema_files,
            'schema_tg_dict': schema_tg_dict,
            'target_tg_dict': target_tg_dict,
            'new_tg_schema_dict': new_tg_schema_dict,
            'new_tg_files_dict': new_tg_files_dict,
            'existing_tg_files_dict': existing_tg_files_dict
           }



def src_list_page_process_task(list_page):
    """Process one page of source listing entries and aggregate per-file results.

    Entries whose 'Key' fails ``is_valid_file`` are reported by full S3 path.
    Every other entry is enriched with the parts parsed from its key and
    handed to ``file_process_task`` through ``NIO.decorated_run_io``.

    Args:
        list_page: Iterable of listing entries, each a dict with a 'Key'.

    Returns:
        Dict with 'invalid_files_set' plus the grouped outputs of
        ``file_process_task`` ('invalid_schema_files', 'schema_tg_dict',
        'target_tg_dict', 'new_tg_schema_dict', 'new_tg_files_dict',
        'existing_tg_files_dict').
    """
    bucket = s3_ops.get_bucket_name(lmt_src)['bucket']

    # Evaluate the key pattern once per entry; this also means the page is
    # iterated only once, so a one-shot iterator works as well.
    checked = [(item, is_valid_file(key=item['Key'])) for item in list_page]

    invalid_files_set = {s3_ops.get_full_s3_path(bucket, item['Key'])
                         for item, is_valid in checked if not is_valid}
    # Each task receives a single positional argument: the enriched entry.
    valid_file_set = [[utils.dict_append(extract_info_with_bucket(item['Key'], bucket), item)]
                      for item, is_valid in checked if is_valid]

    collected = NIO.decorated_run_io(task=file_process_task, task_n_args_list=valid_file_set, max_workers=25,)

    return {'invalid_files_set': invalid_files_set,
            'invalid_schema_files': grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'),
            'schema_tg_dict': grouped_flag_dict(collected, flag_dict_type='schema_tg_dict'),
            'target_tg_dict': grouped_flag_dict(collected, flag_dict_type='target_tg_dict'),
            'new_tg_schema_dict': grouped_flag_dict(collected, flag_dict_type='new_tg_schema_dict'),
            'new_tg_files_dict': grouped_tg(collected, 'new_tg_files_dict'),
            'existing_tg_files_dict': grouped_tg(collected, 'existing_tg_files_dict')
           }

In [12]:
def extract_src_detail(maxKeysPerReq=3):
    """Scan the source location page by page and merge the per-page results.

    Pages come from ``s3_ops.list_gen`` for the global ``lmt_src``; each page
    is processed by ``src_list_page_process_task`` through
    ``NIO.decorated_run_with_args_generator``.

    Args:
        maxKeysPerReq: Forwarded to ``s3_ops.list_gen``.

    Returns:
        Dict with 'invalid_files_set', 'invalid_schema_files',
        'schema_tg_dict', 'target_tg_dict', 'new_tg_schema_dict',
        'new_tg_files_dict' and 'existing_tg_files_dict'.
    """
    src_loc = s3_ops.get_bucket_name(lmt_src)
    pages = s3_ops.list_gen(src_loc['bucket'], src_loc['key'], maxKeysPerReq=maxKeysPerReq, )
    # Every task takes exactly one positional argument: the page itself.
    page_args_generator = ([page] for page in pages)
    collected = NIO.decorated_run_with_args_generator(task=src_list_page_process_task, args_generator=page_args_generator, is_kernal_thread=True,)

    summary = {}
    for name in ('invalid_files_set', 'invalid_schema_files'):
        summary[name] = grouped_set_of_flags(collected, flag_dict_type=name)
    for name in ('schema_tg_dict', 'target_tg_dict', 'new_tg_schema_dict'):
        summary[name] = grouped_set_of_flags_dict(collected, flag_dict_type=name)
    for name in ('new_tg_files_dict', 'existing_tg_files_dict'):
        summary[name] = grouped_tg(collected, name)
    return summary

In [13]:
def s3_copy_into_data_loc_task(tg, file_name, src_file, src_size, dry_run=True):
    """Describe (dry run) the copy of one source file into its group's data directory.

    Args:
        tg: Taxonomy group name; becomes the destination sub-directory.
        file_name: Destination file name.
        src_file: Source object key inside the bucket of the global ``lmt_src``.
        src_size: Source size; only referenced by the disabled copy call.
        dry_run: When True, print the intended copy.

    Returns:
        The fixed status string, regardless of ``dry_run``.
    """
    bucket = s3_ops.get_bucket_name(lmt_src)['bucket']
    src_s3 = 's3://{}/{}'.format(bucket, src_file)
    dest_s3 = '{}{}/{}'.format(lmt_data,tg, file_name)
    if dry_run:
        print("[dry_run]: S3 copy from {} to {}".format(src_s3, dest_s3))
    # NOTE: the real copy is currently disabled; with dry_run=False nothing happens.
    #s3_ops.copy(src=src_s3, dest = dest_s3, src_size=src_size)
    return 'Copied Successfully! by task'


def s3_remove_at_data_loc_task(file,  dry_run=True):
    """Describe (dry run) the removal of one object from the data location.

    Args:
        file: Object key inside the bucket of the global ``lmt_data``.
        dry_run: When True, print the intended delete.

    Returns:
        The fixed status string, regardless of ``dry_run``.
    """
    bucket_detail = s3_ops.get_bucket_name(lmt_data)
    if dry_run:
        target = 's3://{}/{}'.format(bucket_detail['bucket'], file)
        print("[dry_run]: S3 delete from {} ".format(target))
    # NOTE: the real delete is currently disabled; with dry_run=False nothing happens.
    #s3_ops.delete_file(bucket_detail['bucket'], file)
    return 'Deleted Successfully! by task'

''' E.g. '''
# s3_copy_into_data_loc_task('tg5', 'tg5_2020-11-01_ford.csv', 'taxonomy_cs/test1/src/tg5_2020-11-01_ford.csv', 48 )
# s3_remove_at_data_loc_task('taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv')

' E.g. '

### Delta

In [14]:
# Get the existing state of the system: schemas, file listings and
# target-column ownership for every taxonomy group in the data location.
tg_data = extract_data_detail(lmt_src, lmt_data, ACCESS_KEY, SECRET_KEY)
tg_data_schema_dict, tg_data_files_dict, target_data_tg_dict = tg_data

target_data_tg_dict

extract_data_detail **Start Time = 2020-12-15 04:49:16.944679

2020-12-15 04:49:16,994:102410 MainThread run_blocking_tasks: starting

2020-12-15 04:49:16,995:102410 MainThread run_blocking_tasks: creating executor tasks

2020-12-15 04:49:16,996:102410 ThreadPoolExecutor-0_0 (task-0): passed args :['tg1', 's3://qubole-ford/taxonomy_cs/test1/data/tg1/']

2020-12-15 04:49:16,997:102410 ThreadPoolExecutor-0_1 (task-1): passed args :['tg15', 's3://qubole-ford/taxonomy_cs/test1/data/tg15/']

2020-12-15 04:49:16,997:102410 ThreadPoolExecutor-0_2 (task-2): passed args :['tg2', 's3://qubole-ford/taxonomy_cs/test1/data/tg2/']

2020-12-15 04:49:16,998:102410 ThreadPoolExecutor-0_3 (task-3): passed args :['tg3', 's3://qubole-ford/taxonomy_cs/test1/data/tg3/']

2020-12-15 04:49:16,999:102410 ThreadPoolExecutor-0_4 (task-4): passed args :['tg5', 's3://qubole-ford/taxonomy_cs/test1/data/tg5/']

2020-12-15 04:49:16,999:102410 ThreadPoolExecutor-0_0 (task-0): running

2020-12-15 04:49:16,999:102410 Th

{'target_a3': 'tg3',
 'target_a7': 'tg7',
 'target_a5': 'tg5',
 'target_a2': 'tg2',
 'target_a1': 'tg1',
 'target_a6': 'tg6',
 'target_a15': 'tg15'}

In [15]:
# Display the existing data-location files, keyed by taxonomy group and then by file name.
tg_data_files_dict

{'tg3': {'tg3_2020-11-01_ford.csv': {'Key': 'taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv',
   'LastModified': datetime.datetime(2020, 11, 9, 17, 26, 2, tzinfo=tzlocal()),
   'ETag': '"ca7090e60660707cd776ede900dc0a1a"',
   'Size': 48,
   'StorageClass': 'STANDARD'},
  'tg3_2020-11-02_ford.csv': {'Key': 'taxonomy_cs/test1/data/tg3/tg3_2020-11-02_ford.csv',
   'LastModified': datetime.datetime(2020, 11, 9, 17, 26, 2, tzinfo=tzlocal()),
   'ETag': '"ca7090e60660707cd776ede900dc0a1a"',
   'Size': 48,
   'StorageClass': 'STANDARD'}},
 'tg7': {'tg7_2020-11-01_ford.csv': {'Key': 'taxonomy_cs/test1/data/tg7/tg7_2020-11-01_ford.csv',
   'LastModified': datetime.datetime(2020, 11, 7, 23, 2, 38, tzinfo=tzlocal()),
   'ETag': '"653eec710cdbf86149efb89f21912022"',
   'Size': 48,
   'StorageClass': 'STANDARD'},
  'tg7_2020-11-02_ford.csv': {'Key': 'taxonomy_cs/test1/data/tg7/tg7_2020-11-02_ford.csv',
   'LastModified': datetime.datetime(2020, 11, 7, 23, 2, 38, tzinfo=tzlocal()),
   'ETag': '"

In [16]:
# Scan the source location, validate every file and display the aggregated result.
src_delta = extract_src_detail()
src_delta

2020-12-15 04:49:17,164   process-id:102410 run_blocking_tasks: starting

2020-12-15 04:49:17,164   process-id:102410 run_blocking_tasks: creating executor tasks

2020-12-15 04:49:17,241   process-id:102458   (task-0): passed args :[[{'Key': 'taxonomy_cs/test1/src/tg0_202-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"535b60451f6d20c2826b045438a50fb9"', 'Size': 48, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg0_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"7a5d08cbb4c718d16851d1f2b57ffc50"', 'Size': 27, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 21, 21, 36, tzinfo=tzlocal()), 'ETag': '"b19c288a2ef5e2ec6739cac3674391a6"', 'Size': 28, 'StorageClass': 'STANDARD'}]]

2020-12-15 04:49:17,244   process-id:102458   (task-0): running

2020-12-15 04:49:17,248:102458 

2020-12-15 04:49:17,316   process-id:102461   (task-3): running

2020-12-15 04:49:17,303:102460 MainThread run_blocking_tasks: waiting for executor tasks

2020-12-15 04:49:17,303:102460 ThreadPoolExecutor-1_1 (task-1): running

2020-12-15 04:49:17,320:102461 MainThread run_blocking_tasks: starting

2020-12-15 04:49:17,321:102461 MainThread run_blocking_tasks: creating executor tasks

2020-12-15 04:49:17,322:102461 ThreadPoolExecutor-1_0 (task-0): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg15_2020-11-02_ford.csv', 'FileGrp': 'tg15', 'Date': '2020-11-02', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg15_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 11, 4, 19, 6, 33, tzinfo=tzlocal()), 'ETag': '"4b12a5bf8a7aef2127357db957429ddd"', 'Size': 54, 'StorageClass': 'STANDARD'}]

2020-12-15 04:49:17,323:102461 ThreadPoolExecutor-1_1 (task-1): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'Pare

2020-12-15 04:49:17,388:102462 ThreadPoolExecutor-1_2 (task-2): running

2020-12-15 04:49:17,397   process-id:102461   (task-3): done

2020-12-15 04:49:17,395:102463 ThreadPoolExecutor-1_0 (task-0): running

2020-12-15 04:49:17,405   process-id:102464   (task-6): passed args :[[{'Key': 'taxonomy_cs/test1/src/tg2_2020-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"dde15e0dffb34575c1aa95f81c8867c0"', 'Size': 50, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg2_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"dde15e0dffb34575c1aa95f81c8867c0"', 'Size': 50, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg2_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"dde15e0dffb34575c1aa95f81c8867c0"', 'Size': 50, 'StorageClass': 'STANDARD'}]]

2020-12-15 04:49:17,408   process-id:102464   (tas

2020-12-15 04:49:17,456:102463 ThreadPoolExecutor-1_0 (task-0): done

2020-12-15 04:49:17,437:102465 ThreadPoolExecutor-1_2 (task-2): running

2020-12-15 04:49:17,456:102466 ThreadPoolExecutor-1_2 (task-2): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg5_2020-11-03_ford.csv', 'FileGrp': 'tg5', 'Date': '2020-11-03', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg5_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"471c56a20b21f692659b2f5c68c0b713"', 'Size': 48, 'StorageClass': 'STANDARD'}]

2020-12-15 04:49:17,465   process-id:102467   (task-9): passed args :[[{'Key': 'taxonomy_cs/test1/src/tg5_2020-11-04_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"f87ad6041aa111ac6b6d0776be1c774f"', 'Size': 50, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg6_2020-11-01_ford.csv', 'LastModified': da

2020-12-15 04:49:17,540:102467 ThreadPoolExecutor-1_0 (task-0): done

2020-12-15 04:49:17,544   process-id:102459  (task-11): running

2020-12-15 04:49:17,544:102467 ThreadPoolExecutor-1_1 (task-1): done

2020-12-15 04:49:17,545:102459 MainThread run_blocking_tasks: starting

2020-12-15 04:49:17,546:102467 MainThread run_blocking_tasks: exiting

2020-12-15 04:49:17,546:102459 MainThread run_blocking_tasks: creating executor tasks

2020-12-15 04:49:17,547   process-id:102467   (task-9): done

2020-12-15 04:49:17,548:102459 ThreadPoolExecutor-2_0 (task-0): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg8_2020-11-01_ford.csv', 'FileGrp': 'tg8', 'Date': '2020-11-01', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg8_2020-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"8d575874cb97b2d601ae8542aaf11431"', 'Size': 48, 'StorageClass': 'STANDARD'}]

2020-12-15 04:4

{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/test1/src/tg0_202-11-01_ford.csv'},
 'invalid_schema_files': {('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv',
   'key_a0, targe_a0',
   'At least one Target column is required! \nAll given columns should Key or Target!')},
 'schema_tg_dict': {'key_a9,target_a9': {'tg11', 'tg9'},
  'key_a8,target_a7': {'tg8'},
  'key_a152,target_a152': {'tg15'},
  'key_a151,target_a151': {'tg15'},
  'key_a5,target_a5': {'tg5'},
  'key_a21,target_a21': {'tg2'},
  'key_a6,target_a61': {'tg6'},
  'key_a51,target_a51': {'tg5'},
  'key_a7,target_a7': {'tg14', 'tg7'},
  'key_a16,target_a16,target_a162': {'tg16'},
  'key_a1,target_a1': {'tg1'},
  'key_a16,target_a16,target_a9': {'tg17'},
  'key_a10,target_a10': {'tg10'},
  'key_a10,target_a9': {'tg10'},
  'key_a4,target_a4': {'tg4'},
  'key_a7,target_a71': {'tg7'},
  'key_a0,target_a0': {'tg0'}},
 'target_tg_dict': {'target_a9': {'tg10', 'tg11', 'tg17', 'tg9'},
  'target_a7': {'tg14', 'tg7

In [17]:
def key_target_splitter(schema = ''):
    """Split a comma-separated schema into key and target column lists.

    Each column is stripped and matched against TARGET_REGEX first, then
    KEY_REGEX. A column matching neither raises ``Exception``.

    Returns:
        A one-element list ``[{'target_cols': [...], 'key_cols': [...]}]``.
    """
    key_cols, target_cols = [], []
    for raw in schema.split(','):
        col = raw.strip()
        if re.match(TARGET_REGEX, col):
            target_cols.append(col)
            continue
        if re.match(KEY_REGEX, col):
            key_cols.append(col)
            continue
        raise Exception("Not a valid schema")
    return [{'target_cols': target_cols, 'key_cols' : key_cols}]


@decorator.box_logged
def log_report(list_of_row_dict=[], columns:list=[], header_align = 'left', sort_by= None, ascending = True, report_title='', ):
    """Log a list of row dicts as a text table.

    Args:
        list_of_row_dict: Rows to show, passed to ``pd.DataFrame``.
        columns: Column names (and order) passed to ``pd.DataFrame``.
        header_align: Value for pandas' ``display.colheader_justify`` option.
            This is set globally and not restored afterwards.
        sort_by: Optional column name(s) to sort by.
        ascending: Sort direction used with ``sort_by``.
        report_title: When non-empty, logged first through ``report_titled``.
    """
    pd.set_option("display.colheader_justify", header_align)
    df = pd.DataFrame(list_of_row_dict, columns=columns)
    if sort_by is not None:
        df = df.sort_values(by=sort_by, ascending=ascending)
    # Renumber rows 0..n-1 after sorting. drop=True discards the old index
    # instead of inserting it as a column, so a data column that happens to
    # be called 'index' is no longer removed by mistake.
    df = df.reset_index(drop=True)
    # Prefix every row label for display.
    df = df.rename(' **      {}'.format)

    if report_title != '':
        report_titled(report_title)
    for line in ("", "", str(df), "", ""):
        logging.info(line)


def left_justified(df):
    """Render ``df`` as text without the index, left-padding each column's values.

    Every column is formatted with ``'{:<N s}'`` where N is the longest
    string length in that column, so columns are expected to hold strings.
    """
    formatters = {
        col: functools.partial(str.format, "{{:<{}s}}".format(df[col].str.len().max()))
        for col in list(df.columns)
    }
    return df.to_string(formatters=formatters, index=False)
    
@decorator.box_titled
def report_titled(title:str=''):
    """Log ``title`` indented by four spaces, with an empty log line before and after."""
    for line in ("", "    "+title, ""):
        logging.info(line)
    
    
# class Taxonomy_Grp:
    
#     def __init__(self, tg_name, key_cols=[], target_col='', data_location=''):
#         self.tg_name = tg_name
#         self.key_cols = key_cols
#         self.target_col = target_col
#         self.location =os.path.join(data_location, tg_name)
        
#     def get_dict(self):
#         if self.target_col == '':
#             return {'tg_name': self.tg_name}
        
#         return {'tg_name': self.tg_name, 
#                 'key_cols': self.key_cols, 
#                 'target_col': self.target_col, 
#                 'location': self.location}

#     def __str__(self):
#         if self.target_col == '':
#             return 'tg_name: {}'.format(self.tg_name)
#         return 'tg_name: {}, key_cols: {}, target_col: {}, location: {}'.format(self.tg_name, self.key_cols, self.target_col,self.location)


In [18]:
''' Extract Info needed to expose configs and show in logs and reports'''

# Files rejected by key pattern and by schema validation, as reported by the source scan.
invalid_files_set = src_delta['invalid_files_set']
invalid_schema_files = src_delta['invalid_schema_files']

# Groups currently present in the data location.
# NOTE(review): this rebinds `tg_data`, which earlier held the tuple returned by extract_data_detail.
tg_data = { k for k in tg_data_files_dict.keys()}
# Groups having at least one source file whose schema equals the stored schema.
tg_existing = { k for k in src_delta['existing_tg_files_dict'].keys()}
# Groups having at least one source file whose schema is unknown or differs from the stored one.
tg_new ={ k for k in src_delta['new_tg_files_dict'].keys()}
# Every group seen in the source location with a valid file.
tg_all = tg_new.union(tg_existing)

# Schemas / target columns that are claimed by more than one group.
many_tg4schema_check_gen = (v for k, v in src_delta['schema_tg_dict'].items() if len(v) > 1)
many_tg4target_check_gen = (v for k, v in src_delta['target_tg_dict'].items() if len(v) > 1)
# Groups whose new source files carry more than one distinct schema.
newTg4schema = {k for k, v in src_delta['new_tg_schema_dict'].items() if len(v) > 1}


# Flatten the generators above into sets of group names.
# These two sets are only referenced by the commented-out checks below.
tg4schema = set()
[tg4schema.update(i) for i in many_tg4schema_check_gen]
tg4target = set()
[tg4target.update(i) for i in many_tg4target_check_gen]


# invalid_tg_with_dup_schema = (tg4schema.union(newTg4schema)).difference(tg_existing)

# invalid_tg_with_dup_target = tg4target.difference(tg_existing)

# invalid_tg_all = invalid_tg_with_dup_schema.union(invalid_tg_with_dup_target)

# Groups with conflicting new schemas, unless some file of the group still matches the stored schema.
invalid_tg_all = newTg4schema.difference(tg_existing)

# New groups that remain after removing the invalid ones.
tg_delta = tg_new.difference(invalid_tg_all)

# Groups not yet in the data location: plain create.
tg_delta_create = tg_delta.difference(tg_data)

# Groups already in the data location whose source files all changed schema: drop, then re-create.
tg_delta_drop_n_create = (tg_delta.intersection(tg_data)).difference(tg_existing)

# Groups in the data location that no longer appear in the source at all.
tg_dropped = tg_data.difference(tg_all)

# Final drop and create sets; drop-and-create groups belong to both.
tg_dropped_all = tg_dropped.union(tg_delta_drop_n_create)
tg_create_all = tg_delta_create.union(tg_delta_drop_n_create)

''' File Sync'''
# One single-element argument list ([object key]) per data file of every group to be dropped.
files_to_be_dropped = [[f['Key']] for i in  tg_dropped_all 
                       for fn, f in tg_data_files_dict.get(i).items()]
# For retained groups: data files whose name is no longer among the group's matching source files.
files_not_retained_existing_tg =[[tg_data_files_dict.get(tg).get(fn)['Key']]
                                 for tg in tg_existing 
                                 for fn in set(tg_data_files_dict.get(tg).keys()).difference(set(src_delta['existing_tg_files_dict'].get(tg).keys()))]

# Argument lists for s3_remove_at_data_loc_task.
file_drop_args = []
file_drop_args.extend(files_to_be_dropped )
file_drop_args.extend(files_not_retained_existing_tg )


# [group, file name, source key, size] for every source file of the groups to be created.
files_to_be_created = [[i, f['FileName'], f['Key'], f['Size']] 
                       for i in  tg_create_all 
                       for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
# Same shape for every source file of the retained groups; these are listed for copy
# whether or not a file of the same name already exists in the data location.
files_to_be_copied = [[k, f_dict['FileName'], f_dict['Key'], f_dict['Size']] 
                      for k, v in src_delta['existing_tg_files_dict'].items() 
                      for f, f_dict in v.items()] #['Key']]
# Argument lists for s3_copy_into_data_loc_task.
file_copy_args = []
file_copy_args.extend(files_to_be_created )
file_copy_args.extend(files_to_be_copied )

# Run the removals first, then the copies. Both tasks default to dry_run=True, and no
# dry_run value is supplied in the argument lists built above.
collected = NIO.decorated_run_io(task=s3_remove_at_data_loc_task, task_n_args_list=file_drop_args, 
                                 is_kernal_thread=False,)

# NOTE(review): `collected` is overwritten here, so the removal results are discarded.
collected = NIO.decorated_run_io(task=s3_copy_into_data_loc_task, task_n_args_list=file_copy_args, 
                                 is_kernal_thread=False,)



''' Expose details to generate configs'''
# Group -> new schema for every group to be created. new_tg_schema_dict maps a group to a
# set of schemas; if a set held several, the last one iterated would be kept.
tg_create_all_n_schema = {tg: schema 
                          for tg in tg_create_all 
                          for schema in src_delta['new_tg_schema_dict'].get(tg)}
# Group -> schema currently stored in the data location, for retained groups.
tg_retain_all_n_schema = {tg: tg_data_schema_dict.get(tg) 
                          for tg in tg_existing}
# Group -> schema currently stored in the data location, for groups to be dropped.
tg_dropped_all_n_schema = {tg: tg_data_schema_dict.get(tg) 
                           for tg in tg_dropped_all }


'''New and Drop_n_create(With new attributes like schema) Taxonomy Grps'''
# key_target_splitter returns a one-element list, so the inner loop binds schema_dict exactly once.
exposed_tg_all = [Taxonomy_Grp(tg,schema_dict['key_cols'], schema_dict['target_cols'], lmt_data) 
                  for tg ,schema in tg_create_all_n_schema.items() 
                  for schema_dict in key_target_splitter(schema)]
'''Retaining Taxonomy Grps with either NO CHANGES or Create and Drop some files in a retained group'''
exposed_tg_all.extend([Taxonomy_Grp(tg,schema_dict['key_cols'], schema_dict['target_cols'], lmt_data) 
                       for tg ,schema in tg_retain_all_n_schema.items() 
                       for schema_dict in key_target_splitter(schema)])


'''Dropped and Drop_n_create(With old attributes like schema) Taxonomy Grps'''
exposed_dropped_tg_all = [Taxonomy_Grp(tg,schema_dict['key_cols'], schema_dict['target_cols'], lmt_data) 
                          for tg ,schema in tg_dropped_all_n_schema.items() 
                          for schema_dict in key_target_splitter(schema)]

'''Exposed Tg grp with precedence order'''
exposed_tg_name_list = [i.tg_name for i in exposed_tg_all]

# Case-insensitive alphabetical order of the exposed group names.
exposed_tg_name_ordered_list = sorted(exposed_tg_name_list, key=str.lower)


''' Generating output config xml'''
# Hand the exposed groups, the dropped groups and the ordered names to the XML writer.
xml_writer.generate_output_config(exposed_tg_all, exposed_dropped_tg_all, exposed_tg_name_ordered_list, 
                                  dn_version, config_input_loc, config_file_name)





# ''' Report '''
# # invalid files Schema delails
# # To Be

# # TG level
# tg_create_n_schema = [(i, src_delta['new_tg_schema_dict'].get(i)) for i in tg_delta_create]
# tg_drop_create_n_schema = [(tg, tg_data_schema_dict.get(tg), schema_new) for tg in tg_delta_drop_n_create for schema_new in src_delta['new_tg_schema_dict'].get(tg)]
# tg_drop_n_schema = [(i, tg_data_schema_dict.get(i)) for i in tg_dropped]
# tg_retain_n_schema = [(i, tg_data_schema_dict.get(i)) for i in tg_existing]

# # File Level
# files_to_be_dropped = [ f['Key'] for i in  tg_dropped for fn, f in tg_data_files_dict.get(i).items()]
# files_to_be_dropped_schema_change = [ f['Key'] for i in  tg_delta_drop_n_create for fn, f in tg_data_files_dict.get(i).items()]
# files_to_be_created = {f['Key'] :f['Schema'] for i in  tg_delta_create for fn, f in src_delta['new_tg_files_dict'].get(i).items()}
# files_to_be_created_schema_change = {f['Key'] :f['Schema'] for i in  tg_delta_drop_n_create for fn, f in src_delta['new_tg_files_dict'].get(i).items()}



2020-12-15 04:49:17,658:102410 MainThread run_blocking_tasks: starting

2020-12-15 04:49:17,659:102410 MainThread run_blocking_tasks: creating executor tasks

2020-12-15 04:49:17,660:102410 ThreadPoolExecutor-1_0 (task-0): passed args :['taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv']

2020-12-15 04:49:17,660:102410 ThreadPoolExecutor-1_1 (task-1): passed args :['taxonomy_cs/test1/data/tg3/tg3_2020-11-02_ford.csv']

2020-12-15 04:49:17,660:102410 ThreadPoolExecutor-1_2 (task-2): passed args :['taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv']

2020-12-15 04:49:17,661:102410 ThreadPoolExecutor-1_3 (task-3): passed args :['taxonomy_cs/test1/data/tg2/tg2_2020-11-02_ford.csv']

2020-12-15 04:49:17,661:102410 ThreadPoolExecutor-1_0 (task-0): running

[dry_run]: S3 delete from s3://qubole-ford/taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv 
2020-12-15 04:49:17,661:102410 ThreadPoolExecutor-1_4 (task-4): passed args :['taxonomy_cs/test1/data/tg2/tg2_2020-11-04_ford.csv']

2020-12-

2020-12-15 04:49:17,712:102410 ThreadPoolExecutor-2_7 (task-7): done

2020-12-15 04:49:17,713:102410 ThreadPoolExecutor-2_8 (task-8): done

2020-12-15 04:49:17,713:102410 ThreadPoolExecutor-2_9 (task-9): done

2020-12-15 04:49:17,714:102410 ThreadPoolExecutor-2_2 (task-12): passed args :['tg4', 'tg4_2020-11-02_ford.csv', 'taxonomy_cs/test1/src/tg4_2020-11-02_ford.csv', 48]

2020-12-15 04:49:17,715:102410 ThreadPoolExecutor-2_3 (task-13): passed args :['tg4', 'tg4_2020-11-01_ford.csv', 'taxonomy_cs/test1/src/tg4_2020-11-01_ford.csv', 48]

2020-12-15 04:49:17,718:102410 ThreadPoolExecutor-2_0 (task-10): running

[dry_run]: S3 copy from s3://qubole-ford/taxonomy_cs/test1/src/tg2_2020-11-01_ford.csv to s3://qubole-ford/taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv2020-12-15 04:49:17,719:102410 ThreadPoolExecutor-2_4 (task-14): passed args :['tg11', 'tg11_2020-11-01_ford.csv', 'taxonomy_cs/test1/src/tg11_2020-11-01_ford.csv', 48]


2020-12-15 04:49:17,720:102410 ThreadPoolExecutor-2_5 

2020-12-15 04:49:17,778 INFO libs.xml_writer: test.xml has been generated.





True

In [19]:
# Collect every group that shares a target column with at least one other group:
# src_delta['target_tg_dict'] maps a target column to the collection of groups using it.
many_tg4target_check_gen = (tgs for tgs in src_delta['target_tg_dict'].values() if len(tgs) > 1)
# Union in one step instead of a list comprehension run only for its side effects.
tg4target = set().union(*many_tg4target_check_gen)
# Cell output: (groups, target column) pairs for every target column.
[(tgs, target) for target, tgs in src_delta['target_tg_dict'].items()]

[({'tg10', 'tg11', 'tg17', 'tg9'}, 'target_a9'),
 ({'tg14', 'tg7', 'tg8'}, 'target_a7'),
 ({'tg15'}, 'target_a152'),
 ({'tg15'}, 'target_a151'),
 ({'tg5'}, 'target_a5'),
 ({'tg2'}, 'target_a21'),
 ({'tg6'}, 'target_a61'),
 ({'tg5'}, 'target_a51'),
 ({'tg16', 'tg17'}, 'target_a16'),
 ({'tg16'}, 'target_a162'),
 ({'tg1'}, 'target_a1'),
 ({'tg10'}, 'target_a10'),
 ({'tg4'}, 'target_a4'),
 ({'tg7'}, 'target_a71'),
 ({'tg0'}, 'target_a0')]

In [20]:
# Inspect name, key columns and target columns of every exposed group.
[(grp.tg_name, grp.key_cols, grp.target_cols) for grp in exposed_tg_all]

[('tg14', ['key_a7'], ['target_a7']),
 ('tg17', ['key_a16'], ['target_a16', 'target_a9']),
 ('tg9', ['key_a9'], ['target_a9']),
 ('tg8', ['key_a8'], ['target_a7']),
 ('tg6', ['key_a6'], ['target_a61']),
 ('tg0', ['key_a0'], ['target_a0']),
 ('tg2', ['key_a21'], ['target_a21']),
 ('tg4', ['key_a4'], ['target_a4']),
 ('tg11', ['key_a9'], ['target_a9']),
 ('tg16', ['key_a16'], ['target_a16', 'target_a162']),
 ('tg1', ['key_a1'], ['target_a1']),
 ('tg7', ['key_a7'], ['target_a7']),
 ('tg5', ['key_a5'], ['target_a5'])]

In [21]:
# Inspect the invalid groups as strings.
list(map(str, invalid_tg_all))

['tg15', 'tg10']

In [22]:
# Scratch cell: the only live statement displays tg_create_all; every commented
# line below is an abandoned experiment and is not executed.
#tg_create_all
# tg_create_all_n_dates = {tg: file_attr.get('Date') 
#                           for tg in tg_create_all 
#                           for f, file_attr in src_delta['new_tg_files_dict'].get(tg).items()}
# tg_create_all_n_dates
# src_delta['new_tg_files_dict']
tg_create_all
# a_list = None
# a_list = {'d','b', 'c', 'a', 'e'}
# b_list = list(a_list)
# t = sorted(b_list, key=str.lower)
# [i a_list


{'tg0', 'tg11', 'tg14', 'tg16', 'tg17', 'tg2', 'tg4', 'tg6', 'tg8', 'tg9'}

In [23]:
# exposed_tg_name_list = [i.tg_name for i in exposed_tg_all]

# exposed_tg_name_ordered_list = sorted(exposed_tg_name_list, key=str.lower)
# Only the final expression of a cell is rendered: the output below is the unsorted
# exposed_tg_name_list, while the ordered list is evaluated and discarded.
exposed_tg_name_ordered_list
exposed_tg_name_list

['tg14',
 'tg17',
 'tg9',
 'tg8',
 'tg6',
 'tg0',
 'tg2',
 'tg4',
 'tg11',
 'tg16',
 'tg1',
 'tg7',
 'tg5']

### REPORT

In [24]:
# ''' Report '''
# Script form of the delta report. Every section builds a list of row dicts and
# passes it to log_report together with the columns to show. log_report,
# extract_info and all tg_* / src_delta / invalid_* inputs come from cells
# outside this one.
# NOTE(review): Gen_Report further down repeats these sections as a function.


# Process-wide pandas display options; they stay in effect after this cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)



'''  Exposed TG Report  '''

# One row per exposed group, taken from Taxonomy_Grp.get_dict().
exposed_tg_report_data = [i.get_dict() for i in exposed_tg_all]
log_report(exposed_tg_report_data,  columns=['tg_name', 'key_cols','target_cols', 'location'], sort_by='tg_name')


'''  Exposed Dropped TG Report '''

exposed_tg_dropped_report_data = [i.get_dict() for i in exposed_dropped_tg_all]
log_report(exposed_tg_dropped_report_data, columns=['tg_name', 'key_cols','target_cols', 'location'], sort_by='tg_name')


# '''  Invalid TG due to schema conflict/already used Report '''

# invalid_tg_with_dup_schema_rep = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'] }
#                     for i in  invalid_tg_with_dup_schema for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
# log_report(invalid_tg_with_dup_schema_rep, columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 


# '''  Invalid TG due to Target Column conflict/already used Report '''

# invalid_tg_with_dup_target_rep = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'] }
#                     for i in  invalid_tg_with_dup_target for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
# log_report(invalid_tg_with_dup_target_rep, columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 


'''  Invalid New TG due to Schema conflict among files Report '''

# One row per source file of every group listed in invalid_tg_all.
invalid_tg_with_schema_conflict_rep = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'] }
                    for i in  invalid_tg_all for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
log_report(invalid_tg_with_schema_conflict_rep, columns=['Taxonomy_Grp','File_Name','Date', 'Schema'])


'''  Invalid files from retained grp due to schema or target mismatch with previously delivered files for same'''

# Groups present in both tg_new and tg_existing; their new files are listed next to
# the group schema stored in tg_data_schema_dict.
partially_invalid_tg_set = tg_new.intersection(tg_existing)
partially_invalid_tg_report = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Grp_Schema': tg_data_schema_dict[i] } 
                               for i in partially_invalid_tg_set  
                               for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
log_report(partially_invalid_tg_report, columns=['Taxonomy_Grp','File_Name','Date', 'Schema','Grp_Schema']) 


'''   Invalid file not match with required file pattern '''

# NOTE(review): Gen_Report passes each entry through filename_by_key(); here the raw
# entries of invalid_files_set are reported - confirm which form is intended.
invalid_files_report_data = [{'File_Name' : i} for i in invalid_files_set]
log_report(invalid_files_report_data,  columns=['File_Name'])


'''   Invalid file not match with required schema pattern '''

# Each entry of invalid_schema_files is indexed as (file, schema, reason).
invalid_schema_files_rep_data = [{'File_Name' : i[0], 'Schema': i[1], 'Reason' : i[2]} for i in invalid_schema_files]
log_report(invalid_schema_files_rep_data,  columns=['File_Name', 'Schema','Reason'], header_align='left')


'''   Dropped TG Completely '''

# Lazily run extract_info on the 'Key' of every delivered file of the dropped groups;
# the schema column is looked up per group in tg_data_schema_dict.
tg_dropped_rep_gen =(extract_info(f['Key']) for i in  tg_dropped for fn, f in tg_data_files_dict.get(i).items())
tg_dropped_report_dict =  [{'Taxonomy_Grp':i['FileGrp'], 'File_Name':i['FileName'], 'Date':i['Date'], 'Schema': tg_data_schema_dict[i['FileGrp']] }
                           for i in tg_dropped_rep_gen] 
log_report(list_of_row_dict=tg_dropped_report_dict,columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 


'''   Dropped TG to change schema '''

# One tuple per (group, new schema, delivered file) combination:
# (group, old schema, new schema, extract_info result).
tg_drop_schema_change_rep = ((tg, tg_data_schema_dict.get(tg), schema_new, extract_info(f['Key'])) 
                             for tg in tg_delta_drop_n_create 
                             for schema_new in src_delta['new_tg_schema_dict'].get(tg)
                             for fn, f in tg_data_files_dict.get(tg).items())
tg_drop_schema_change_report_dict = [{'Grp' : i[0], 'File_Name': i[3]['FileName'], 'Date': i[3]['Date'], 'Old_Schema' : i[1], 'New_Schema' : i[2]} 
                                     for i in tg_drop_schema_change_rep]
log_report(list_of_row_dict=tg_drop_schema_change_report_dict,columns=['Grp','File_Name','Date', 'Old_Schema', 'New_Schema']) 


'''   Created TG Absolute New '''

tg_newly_created_report_data = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'] }
                    for i in  tg_delta_create for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
log_report(tg_newly_created_report_data, columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 


'''   Created TG to change schema(with new Schema) '''

# A new file whose name already exists in tg_data_files_dict for the group is tagged
# 'Re-delivered' (with the old group schema); otherwise 'New File' with old schema 'NAN'.
tg_re_created_schema_change_rep = ((tg, tg_data_schema_dict.get(tg), f['Schema'], extract_info(f['Key']), 'Re-delivered') 
                                   if tg_data_files_dict.get(tg).get(fn) is not None 
                                   else (tg, 'NAN', f['Schema'], extract_info(f['Key']), 'New File')
                                    
                                   for tg in tg_delta_drop_n_create 
                                   #for schema_new in src_delta['new_tg_schema_dict'].get(tg) 
                                   
                                   for fn, f in src_delta['new_tg_files_dict'].get(tg).items() 
                                   )

tg_recreated_schema_change_report_dict = [{'Grp' : i[0], 'File_Name': i[3]['FileName'], 'Date': i[3]['Date'], 'Old_Schema' : i[1], 'New_Schema' : i[2], 'Desc': i[4]} 
                                          for i in tg_re_created_schema_change_rep]

log_report(list_of_row_dict=tg_recreated_schema_change_report_dict,columns=['Grp','File_Name','Date', 'Old_Schema', 'New_Schema', 'Desc'])


'''   Retained TG with retained files, new files and dropped files '''

# Files listed in src_delta['existing_tg_files_dict']: 'Retained' when the same file
# name is present in tg_data_files_dict for the group, 'New File' otherwise.
tg_retained_report_data = [{'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Desc' : 'Retained' }
                           
                            if tg_data_files_dict.get(tg).get(fn) is not None 
                            else {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Desc' : 'New File' }
                            for tg in tg_existing 
                            for fn, f in src_delta['existing_tg_files_dict'].get(tg).items()]

# Files present in tg_data_files_dict but missing from the existing-files dict of the
# same group are reported as 'Dropped'.
tg_retained_dropped_files = [extract_info(tg_data_files_dict.get(tg).get(fn)['Key']) 
                             for tg in tg_existing 
                             for fn in set(tg_data_files_dict.get(tg).keys()).difference(set(src_delta['existing_tg_files_dict'].get(tg).keys()))]

                             
tg_retained_dropped_files_report_data = [{'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': tg_data_schema_dict[f['FileGrp']], 'Desc' : 'Dropped' }
                                          for f in tg_retained_dropped_files]

# Retained / new rows and dropped rows go into one table, sorted by group and date.
tg_retained_report_data.extend(tg_retained_dropped_files_report_data)
log_report(tg_retained_report_data, columns=['Taxonomy_Grp','File_Name','Date', 'Schema', 'Desc'], sort_by = ['Taxonomy_Grp','Date']) 

#''' End '''


 ******************************************************************************************************
 **          
 **          
 **            tg_name key_cols   target_cols                location                                     
 **      0    tg0     [key_a0]                [target_a0]   s3://qubole-ford/taxonomy_cs/test1/data/tg0
 **      1    tg1     [key_a1]                [target_a1]   s3://qubole-ford/taxonomy_cs/test1/data/tg1
 **      2   tg11     [key_a9]                [target_a9]  s3://qubole-ford/taxonomy_cs/test1/data/tg11
 **      3   tg14     [key_a7]                [target_a7]  s3://qubole-ford/taxonomy_cs/test1/data/tg14
 **      4   tg16    [key_a16]  [target_a16, target_a162]  s3://qubole-ford/taxonomy_cs/test1/data/tg16
 **      5   tg17    [key_a16]    [target_a16, target_a9]  s3://qubole-ford/taxonomy_cs/test1/data/tg17
 **      6    tg2    [key_a21]               [target_a21]   s3://qubole-ford/taxonomy_cs/test1/data/tg2
 **      7    tg4     [key_a4]   

 **          
 **          
 ******************************************************************************************************



 ******************************************************************************************************
 **          
 **          
 **           Taxonomy_Grp File_Name                Date        Schema            Desc     
 **      0  tg1          tg1_2020-11-01_ford.csv  2020-11-01  key_a1,target_a1  Retained
 **      1  tg1          tg1_2020-11-02_ford.csv  2020-11-02  key_a1,target_a1  New File
 **      2  tg5          tg5_2020-11-01_ford.csv  2020-11-01  key_a5,target_a5  Retained
 **      3  tg5          tg5_2020-11-02_ford.csv  2020-11-02  key_a5,target_a5  Retained
 **      4  tg5          tg5_2020-11-03_ford.csv  2020-11-03  key_a5,target_a5  New File
 **      5  tg5          tg5_2020-11-05_ford.csv  2020-11-05  key_a5,target_a5   Dropped
 **      6  tg7          tg7_2020-11-01_ford.csv  2020-11-01  key_a7,target_a7  Retained
 **      7  tg7   

In [25]:
# Called without a pattern, this prints the description of every registered pandas
# option (the long listing below).
pd.describe_option()

compute.use_bottleneck : bool
    Use the bottleneck library to accelerate if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]

compute.use_numexpr : bool
    Use the numexpr library to accelerate computation if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]

display.chop_threshold : float or None
    if set to a float value, all float values smaller then the given threshold
    will be displayed as exactly 0 by repr and friends.
    [default: None] [currently: None]

display.colheader_justify : 'left'/'right'
    Controls the justification of column headers. used by DataFrameFormatter.
    [default: right] [currently: left]

display.column_space No description available.
    [default: 12] [currently: 12]

display.date_dayfirst : boolean
    When True, prints and parses dates with the day first, eg 20/01/2005
    [default: False] [currently: False]

display.date_yearf

In [26]:
def Gen_Report(exposed_tg_all = None, 
               exposed_dropped_tg_all = None,
               invalid_new_tg_schema_conflict = None,
               src_delta  = None,
               tg_new  = None, 
               tg_existing = None,
               invalid_files_set = None,
               invalid_schema_files = None,
               tg_data_files_dict = None, 
               tg_data_schema_dict = None, 
               tg_delta_drop_n_create = None, 
               tg_delta_create = None,
               tg_dropped = None):
    """Log every section of the taxonomy delta report through ``log_report``.

    Sections, in order: exposed groups, exposed dropped groups, completely
    dropped groups, groups dropped for a schema change, brand-new groups,
    groups re-created with a new schema, retained groups (retained / new /
    dropped files), invalid new groups, invalid files of retained groups,
    files with an invalid name and files with an invalid schema.

    Parameters
    ----------
    exposed_tg_all, exposed_dropped_tg_all : iterable
        Objects providing ``get_dict()``; one report row per object.
    invalid_new_tg_schema_conflict : iterable
        Group names looked up in ``src_delta['new_tg_files_dict']``.
    src_delta : dict
        Read through the keys ``'new_tg_schema_dict'``, ``'new_tg_files_dict'``
        and ``'existing_tg_files_dict'`` (each maps a group name to its data).
    tg_new : set
        Only ``tg_new.intersection(tg_existing)`` is used.
    tg_existing, tg_delta_drop_n_create, tg_delta_create : iterable
        Group names for the retained, re-created and newly created sections.
    invalid_files_set : iterable
        Entries passed to ``filename_by_key`` for the invalid-name section.
    invalid_schema_files : iterable
        Entries indexed as ``(file key, schema, reason)``.
    tg_data_files_dict : dict
        Group name -> {file name -> file attributes containing ``'Key'``}.
    tg_data_schema_dict : dict
        Group name -> schema of the group.
    tg_dropped : iterable, optional
        Group names for the "Dropped TG Completely" section. When omitted the
        notebook-level ``tg_dropped`` variable is used, which is what this
        function always read before the parameter existed.

    Raises
    ------
    NameError
        If ``tg_dropped`` is neither passed nor defined at notebook level.

    Notes
    -----
    The pandas display options set here are process-wide and stay in effect
    after the call. ``log_report``, ``extract_info`` and ``filename_by_key``
    must already be defined in the notebook.
    """
    if tg_dropped is None:
        # Backward compatibility: fall back to the notebook global, but fail up front
        # with a clear message instead of half-way through the report.
        if 'tg_dropped' not in globals():
            raise NameError("name 'tg_dropped' is not defined; pass tg_dropped= to Gen_Report")
        tg_dropped = globals()['tg_dropped']

    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_colwidth', 1000)

    # --- Exposed TG Report ---
    exposed_tg_report_data = [grp.get_dict() for grp in exposed_tg_all]
    report_title = 'Exposed TG Report'
    log_report(exposed_tg_report_data,  columns=['tg_name', 'key_cols', 'target_cols', 'location'], 
               sort_by='tg_name', report_title=report_title)

    # --- Exposed Dropped TG Report ---
    exposed_tg_dropped_report_data = [grp.get_dict() for grp in exposed_dropped_tg_all]
    report_title = 'Exposed Dropped TG Report'
    log_report(exposed_tg_dropped_report_data, columns=['tg_name', 'key_cols', 'target_cols', 'location'], 
               sort_by='tg_name', report_title=report_title)

    # --- Dropped TG Completely ---
    # Every delivered file of a dropped group; the schema is the stored group schema.
    tg_dropped_report_dict = []
    for tg in tg_dropped:
        for f in tg_data_files_dict.get(tg).values():
            info = extract_info(f['Key'])
            tg_dropped_report_dict.append({'Taxonomy_Grp': info['FileGrp'],
                                           'File_Name': info['FileName'],
                                           'Date': info['Date'],
                                           'Schema': tg_data_schema_dict[info['FileGrp']]})
    report_title = 'Dropped TG Completely'
    log_report(list_of_row_dict=tg_dropped_report_dict, columns=['Taxonomy_Grp','File_Name','Date', 'Schema'], 
               sort_by = ['Taxonomy_Grp','Date'], report_title=report_title) 

    # --- Dropped TG to change schema ---
    # One row per (group, new schema, delivered file) combination.
    tg_drop_schema_change_report_dict = []
    for tg in tg_delta_drop_n_create:
        for schema_new in src_delta['new_tg_schema_dict'].get(tg):
            for f in tg_data_files_dict.get(tg).values():
                info = extract_info(f['Key'])
                tg_drop_schema_change_report_dict.append({'Grp': tg,
                                                          'File_Name': info['FileName'],
                                                          'Date': info['Date'],
                                                          'Old_Schema': tg_data_schema_dict.get(tg),
                                                          'New_Schema': schema_new})
    report_title = 'Dropped TG to change schema'
    log_report(list_of_row_dict=tg_drop_schema_change_report_dict, 
               columns=['Grp','File_Name','Date', 'Old_Schema', 'New_Schema'], 
               sort_by = ['Grp','Date'], report_title=report_title) 

    # --- Created TG Absolute New ---
    tg_newly_created_report_data = [
        {'Taxonomy_Grp': f['FileGrp'], 'File_Name': f['FileName'], 'Date': f['Date'], 'Schema': f['Schema']}
        for tg in tg_delta_create
        for f in src_delta['new_tg_files_dict'].get(tg).values()]
    report_title = 'Created TG Absolute New'
    log_report(tg_newly_created_report_data, columns=['Taxonomy_Grp','File_Name','Date', 'Schema'], 
               sort_by = ['Taxonomy_Grp','Date'], report_title=report_title) 

    # --- Created TG to change schema (with new schema) ---
    # A new file whose name was already delivered for the group is 'Re-delivered' and
    # shows the old group schema; any other file is a 'New File' with old schema 'NAN'.
    tg_recreated_schema_change_report_dict = []
    for tg in tg_delta_drop_n_create:
        for fn, f in src_delta['new_tg_files_dict'].get(tg).items():
            if tg_data_files_dict.get(tg).get(fn) is not None:
                old_schema, desc = tg_data_schema_dict.get(tg), 'Re-delivered'
            else:
                old_schema, desc = 'NAN', 'New File'
            info = extract_info(f['Key'])
            tg_recreated_schema_change_report_dict.append({'Grp': tg,
                                                           'File_Name': info['FileName'],
                                                           'Date': info['Date'],
                                                           'Old_Schema': old_schema,
                                                           'New_Schema': f['Schema'],
                                                           'Desc': desc})
    report_title = 'Created TG to change schema(with new Schema)'
    log_report(list_of_row_dict=tg_recreated_schema_change_report_dict, 
               columns=['Grp','File_Name','Date', 'Old_Schema', 'New_Schema', 'Desc'], 
               sort_by = ['Grp','Date'], report_title=report_title)

    # --- Retained TG with retained files, new files and dropped files ---
    # Files in the source delta: 'Retained' if already delivered, otherwise 'New File'.
    tg_retained_report_data = []
    for tg in tg_existing:
        for fn, f in src_delta['existing_tg_files_dict'].get(tg).items():
            desc = 'Retained' if tg_data_files_dict.get(tg).get(fn) is not None else 'New File'
            tg_retained_report_data.append({'Taxonomy_Grp': f['FileGrp'],
                                            'File_Name': f['FileName'],
                                            'Date': f['Date'],
                                            'Schema': f['Schema'],
                                            'Desc': desc})

    # Delivered files that no longer appear in the source delta: 'Dropped'.
    for tg in tg_existing:
        delivered_files = tg_data_files_dict.get(tg)
        dropped_names = set(delivered_files.keys()).difference(
            set(src_delta['existing_tg_files_dict'].get(tg).keys()))
        for fn in dropped_names:
            info = extract_info(delivered_files.get(fn)['Key'])
            tg_retained_report_data.append({'Taxonomy_Grp': info['FileGrp'],
                                            'File_Name': info['FileName'],
                                            'Date': info['Date'],
                                            'Schema': tg_data_schema_dict[info['FileGrp']],
                                            'Desc': 'Dropped'})
    report_title = 'Retained TG with retained files, new files and dropped files'
    log_report(tg_retained_report_data, columns=['Taxonomy_Grp','File_Name','Date', 'Schema', 'Desc'], 
               header_align='left', sort_by = ['Taxonomy_Grp','Date'], report_title=report_title) 

    # The two "already used schema / target column" reports
    # (invalid_tg_with_dup_schema, invalid_tg_with_dup_target) are disabled.

    # --- Invalid New TG due to Schema conflict among files Report ---
    invalid_tg_with_schema_conflict_rep = [
        {'Taxonomy_Grp': f['FileGrp'], 'File_Name': f['FileName'], 'Date': f['Date'], 'Schema': f['Schema']}
        for tg in invalid_new_tg_schema_conflict
        for f in src_delta['new_tg_files_dict'].get(tg).values()]
    report_title = 'Invalid New TG due to Schema conflict among files Report'
    log_report(invalid_tg_with_schema_conflict_rep, columns=['Taxonomy_Grp','File_Name','Date', 'Schema'], 
               sort_by = ['Taxonomy_Grp','Date'], report_title=report_title)

    # --- Invalid files from retained grp due to schema or target mismatch ---
    # Groups that are both new and existing: list their new files next to the stored
    # group schema.
    partially_invalid_tg_set = tg_new.intersection(tg_existing)
    partially_invalid_tg_report = [
        {'Taxonomy_Grp': f['FileGrp'], 'File_Name': f['FileName'], 'Date': f['Date'],
         'Schema': f['Schema'], 'Grp_Schema': tg_data_schema_dict[tg]}
        for tg in partially_invalid_tg_set
        for f in src_delta['new_tg_files_dict'].get(tg).values()]
    report_title = 'Invalid files from retained grp due to schema or target mismatch'
    log_report(partially_invalid_tg_report, columns=['Taxonomy_Grp','File_Name','Date', 'Schema','Grp_Schema'], 
               sort_by = ['Taxonomy_Grp','Date'], report_title=report_title) 

    # --- Invalid file not match with required file pattern ---
    invalid_files_report_data = [{'File_Name': filename_by_key(key)} for key in invalid_files_set]
    report_title = 'Invalid file not match with required file pattern'
    log_report(invalid_files_report_data,  columns=['File_Name'], 
               sort_by = ['File_Name'], report_title=report_title)

    # --- Invalid file not match with required schema pattern ---
    invalid_schema_files_rep_data = [
        {'File_Name': filename_by_key(entry[0]), 'Schema': entry[1], 'Reason': entry[2]}
        for entry in invalid_schema_files]
    report_title = 'Invalid file not match with required schema pattern'
    log_report(invalid_schema_files_rep_data,  columns=['File_Name', 'Schema','Reason'], 
               header_align='left', sort_by = ['File_Name'], report_title=report_title)


In [27]:
# Render the full delta report from the state built by the cells above.
Gen_Report(
    exposed_tg_all=exposed_tg_all,
    exposed_dropped_tg_all=exposed_dropped_tg_all,
    invalid_new_tg_schema_conflict=invalid_tg_all,
    src_delta=src_delta,
    tg_new=tg_new,
    tg_existing=tg_existing,
    invalid_files_set=invalid_files_set,
    invalid_schema_files=invalid_schema_files,
    tg_data_files_dict=tg_data_files_dict,
    tg_data_schema_dict=tg_data_schema_dict,
    tg_delta_drop_n_create=tg_delta_drop_n_create,
    tg_delta_create=tg_delta_create,
)

 ******************************************************************************************************
 ******************************************************************************************************
 ****          
 ****    Exposed TG Report
 ****          
 ******************************************************************************************************
 **          
 **          
 **            tg_name key_cols   target_cols                location                                     
 **      0    tg0     [key_a0]                [target_a0]   s3://qubole-ford/taxonomy_cs/test1/data/tg0
 **      1    tg1     [key_a1]                [target_a1]   s3://qubole-ford/taxonomy_cs/test1/data/tg1
 **      2   tg11     [key_a9]                [target_a9]  s3://qubole-ford/taxonomy_cs/test1/data/tg11
 **      3   tg14     [key_a7]                [target_a7]  s3://qubole-ford/taxonomy_cs/test1/data/tg14
 **      4   tg16    [key_a16]  [target_a16, target_a162]  s3://qubole-ford/ta

 **          
 **          
 **           Taxonomy_Grp File_Name                Date        Schema            Desc     
 **      0  tg1          tg1_2020-11-01_ford.csv  2020-11-01  key_a1,target_a1  Retained
 **      1  tg1          tg1_2020-11-02_ford.csv  2020-11-02  key_a1,target_a1  New File
 **      2  tg5          tg5_2020-11-01_ford.csv  2020-11-01  key_a5,target_a5  Retained
 **      3  tg5          tg5_2020-11-02_ford.csv  2020-11-02  key_a5,target_a5  Retained
 **      4  tg5          tg5_2020-11-03_ford.csv  2020-11-03  key_a5,target_a5  New File
 **      5  tg5          tg5_2020-11-05_ford.csv  2020-11-05  key_a5,target_a5   Dropped
 **      6  tg7          tg7_2020-11-01_ford.csv  2020-11-01  key_a7,target_a7  Retained
 **      7  tg7          tg7_2020-11-02_ford.csv  2020-11-02  key_a7,target_a7  Retained
 **          
 **          
 ******************************************************************************************************



 ********************************

In [28]:
#df = pd.DataFrame({})

In [29]:
#df.__str__??

In [30]:
#!cat ~/raj_fb/anaconda3/envs/aws_util/lib/python3.7/site-packages/pandas/core/base.py

In [33]:
!cat /home/vbhargava/feature_test0/temp/taxo_config_xmls/test1.xml


<?xml version='1.0' encoding='UTF-8'?>
<configroot version="12.1">
	<set>
		<name>CS_TAXONOMY_LMT_SCHEMA_SET</name>
        
		<elements>
			<subsource_name>
				<!-- name of the subsource, has to be unique across all datasource -->
				<val>tg9</val>
			</subsource_name>
			<key-columns>
                
				<attr datatype="STRING">key_a9</attr>
                
			</key-columns>
			<target-columns>
                
				<attr datatype="STRING">target_a9</attr>
                
			</target-columns>
			<partitionby_columns>
			</partitionby_columns>
			<row_delimiter>
				<!-- Row separator -->
				<val>'\n'</val>
			</row_delimiter>
			<column_delimiter>
				<!-- Column separator -->
				<val>','</val>
			</column_delimiter>
			<serde>
				<val>'org.apache.hadoop.hive.serde2.OpenCSVSerde'</val>
			</serde>
			<serde_properties>
				<val/>
			</serde_properties>
			<table_properties>
				<val/>
			</table_properties>
			<storage_type>
				<!-- Sto

In [35]:
# Show the 's3_location' entry of the loaded platform config.
config_data['s3_location']

's3://qubole-ford/warehouse/'

In [108]:

def path_resolve(parent_path, relative_path):
    """Join ``relative_path`` onto ``parent_path`` and collapse ``.`` / ``..`` segments.

    Works on plain strings, so it can be used for URL-style locations
    (``scheme://...``, e.g. ``s3://bucket/prefix``) as well as local paths.

    Parameters
    ----------
    parent_path : str
        Base location. Only a ``'://'`` marks a scheme; a plain ``'//'`` inside a
        local path is kept as part of the path.
    relative_path : str
        Path appended with ``os.path.join``. As with ``os.path.join``, a value that
        starts with ``'/'`` replaces the base path instead of extending it.

    Returns
    -------
    str
        The resolved location. A trailing ``'/'`` and empty segments are preserved.
        ``..`` never climbs above the start of a URL location or above the root of an
        absolute local path (extra ``..`` are ignored); for a relative local path the
        unresolved leading ``..`` segments are kept.
    """
    scheme, marker, remainder = parent_path.partition('://')
    if marker:
        prefix, base = scheme + marker, remainder
    else:
        prefix, base = '', parent_path

    joined = os.path.join(base, relative_path)
    tokens = joined.split('/')

    # An absolute local path starts with an empty token that stands for the root;
    # it must never be popped by a '..'.
    rooted = (not marker) and joined.startswith('/')
    anchored = bool(marker) or rooted
    floor = 1 if rooted else 0

    resolved = []
    for tk in tokens:
        if tk == '.':
            continue
        if tk != '..':
            resolved.append(tk)
            continue
        if len(resolved) > floor and resolved[-1] != '..':
            resolved.pop()
        elif not anchored:
            # Relative local path: nothing left to cancel, keep the '..'.
            resolved.append('..')
        # Anchored location: already at the top, ignore the extra '..'.

    result = '/'.join(resolved)
    if rooted and result == '':
        result = '/'
    return prefix + result
# Demo calls: a URL with more '..' segments than path levels, the configured S3
# location, and an absolute local path. Only the value of the last call is rendered.
path_resolve("s3://qubole-ford/warehouse/yts/uiy/tac", '../../../../../../kv')
path_resolve(config_data['s3_location'], '../taxonomy_cs/lmt/input/')
path_resolve('/home/abc/ddec', '../taxonomy_cs/lmt/input/')

'/home/abc/taxonomy_cs/lmt/input/'

In [41]:
# Experiment: build a pathlib.Path from the S3 location joined with a relative path.
from pathlib import Path
file_dir = Path( os.path.join(config_data['s3_location'], '../../abc'))

In [43]:
# NOTE(review): pathlib handles the URL as a relative local path - the output below is
# anchored to the notebook directory and 's3://' has collapsed to 's3:/'.
(file_dir).resolve()

PosixPath('/home/vbhargava/feature_test0/msaction_backend/common/BU3.0_core/util/Py_utils/taxonomy_utils/notebooks/s3:/abc')