In [1]:
import sys, os, inspect, re
sys.path.append("/home/vbhargava/feature_test0/msaction_backend/common/BU3.0_core/util/Py_utils/taxonomy_utils")
import time, logging
numeric_level = getattr(logging, 'INFO', None)
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(level=numeric_level,
                        format='%(asctime)s %(levelname)s %(name)s: %(message)s',
                        handlers=[stdout_handler])

In [2]:
from libs.s3_ops import S3_OPs
from libs.s3_stream import S3Stream
from libs.configs import Config
from libs.nio_executor import NIO
from libs import utils
from collections import defaultdict

In [3]:
config = '/home/vbhargava/feature_test0/msaction_backend/customers/raj_ford_test/common/config/inputs/platform_config.xml'
lmt_src = 's3://qubole-ford/taxonomy_cs/test1/src/'
lmt_data = 's3://qubole-ford/taxonomy_cs/test1/data/'

In [4]:
config_data = Config.get_qubole_config(config)
ACCESS_KEY=config_data['access_key']
SECRET_KEY=config_data['secret_key']

In [5]:
TG_EXTRACT_REGEX = '^.*?/([a-zA-Z]+\-?[0-9]*)/$' 
FILE_EXTRACT_REGEX = '^.*/([a-zA-Z0-9.\-_]{0,255}.csv)$' #'^.*/([a-zA-Z0-9.\-_]{0,255}.csv)$'
TARGET_EXTRACT_REGEX ='^.*,?(target_[A-Za-z0-9_-]+).*$'
VALID_FILE_KEY_REGEX = '^(.*/([a-zA-Z]+\-?[0-9]*)?/)?(([a-zA-Z]+\-?[0-9]*?)_([0-9]{4}-[0-9]{2}-[0-9]{2}?)_([a-zA-Z0-9.\-_]+?).csv?)$'
KEY_REGEX = '^[Kk]ey_[A-Za-z0-9_]{2,30}$'
TARGET_REGEX = '^[Tt]arget_[A-Za-z0-9_]{2,30}$'

In [6]:
s3_ops = S3_OPs(ACCESS_KEY, SECRET_KEY)

def filename_by_key(key):
    return get_val_by_regex(key, FILE_EXTRACT_REGEX, error_msg="Not vaild key for taxonomy data csv file")

def find_by_data_tg(key, regex):
    return get_val_by_regex(key, regex, error_msg="Not vaild taxonomy data dir")

        
def get_val_by_regex(key, regex, error_msg="can't be extract a val."):
    matched = re.findall(regex, key)
    if len(matched) > 0:
        return matched[0]
    else:
        raise Exception(error_msg)
        
def get_data_n_schema(tg, data_files_loc):
    data_file_lock_detail = s3_ops.get_bucket_name(data_files_loc)
    files = s3_ops.list_complete(data_file_lock_detail['bucket'], data_file_lock_detail['key'])
    res = {}
    if len(files)>0:
        s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
        schema = s3_stream.get_header(s3_ops.get_full_s3_path(data_file_lock_detail['bucket'],files[0]['Key']))
        #res[tg]={'schema':schema, 'files': files}
        res['schema'] = {tg:schema}
        res['files'] = {tg:files}
    return res

def extract_schema(schema):
    return schema.replace(" ","").lower()

def validate_schema(schema):
    if schema=='': 
        return {'IsValid' : False, 'schema': schema, 'message' : "Schema shouldn't be empty"}
    tokens = schema.split(',')
    if len(tokens) < 2:
         return {'IsValid' : False, 'schema': schema, 'message' : "Schema should have at least 2 columns"}
    KEY_REGEX = '^[Kk]ey_[A-Za-z0-9_]{2,30}$'
    TARGET_REGEX = '^[Tt]arget_[A-Za-z0-9_]{2,30}$'
    key_cnt = 0
    target_cnt = 0
    invalid_headers = []
    columns = defaultdict(list)
    res = {}
    target_col = None
    key_cols_set = set()
    for t in tokens:
        t = t.strip()
        if re.match(TARGET_REGEX, t):
            target_cnt = target_cnt + 1
            target_col = t
        elif re.match(KEY_REGEX, t):
            key_cnt = key_cnt + 1
            key_cols_set.add(t)
        else:
            invalid_headers.append(t)
        columns[t.lower()].append(1)

    error_msgs=[]
    if target_cnt != 1 :
        error_msgs.append("Exact one Target column is required!")
    if key_cnt < 1 :
        error_msgs.append("At least one Key column is required!")
    if len(invalid_headers) > 0 :
        error_msgs.append("All given columns should Key or Target!")
    for k, v in columns.items():

        if len(v) > 1:
            print("--")
            error_msgs.append("Same name: {} should not represent more than one column in schema! cols names are case insensitive. ".format(k))

    if len(error_msgs) > 0:
        return {'IsValid' : False, 'schema': schema, 'errors' : " \n".join(error_msgs)}
    #print(str(key_cnt)+":"+str(target_cnt)+":"+str(invalid_headers)+":"+str(columns))
    return {'IsValid' : True, 'Schema': schema.replace(" ","").lower(), 
            'TargetCol' : target_col, 'KeyColsSet' : key_cols_set}

In [7]:
def extract_data_detail(lmt_src, lmt_data, access_key, secret_key):
#     Valid data Taxonomy Grps
    
    #
    lmt_data_loc_detail = s3_ops.get_bucket_name(lmt_data)
    lmt_data_loc_bucket = lmt_data_loc_detail['bucket']
    lmt_data_loc_key = lmt_data_loc_detail['key']
    valid_tg_list_res = s3_ops.list_subdirs(lmt_data_loc_detail['bucket'],lmt_data_loc_detail['key'],)
    
    valid_tgrp_loc_list = [ [find_by_data_tg(item['Prefix'], TG_EXTRACT_REGEX), 
                         '{}{}'.format(lmt_data, find_by_data_tg(item['Prefix'], TG_EXTRACT_REGEX))] 
                       for item in valid_tg_list_res]
    
    collected = NIO.decorated_run_io(task=get_data_n_schema, task_n_args_list=valid_tgrp_loc_list, max_workers=25,)
#     return collected
    tg_data_schema_dict = {k:extract_schema(v)  for item in collected for k, v in item['result']['schema'].items()}
    tg_data_files_dict = {k:{filename_by_key(u['Key']):u for u in v } for item in collected for k, v in item['result']['files'].items()}
    target_data_tg_dict = {re.findall(TARGET_EXTRACT_REGEX,V)[0]: K for K, V in tg_data_schema_dict.items()}
    
    return tg_data_schema_dict, tg_data_files_dict,target_data_tg_dict

In [8]:
extract_data_detail(lmt_src, lmt_data, ACCESS_KEY, SECRET_KEY)

2020-11-01 21:20:34,751:81025 MainThread run_blocking_tasks: starting

2020-11-01 21:20:34,751:81025 MainThread run_blocking_tasks: creating executor tasks

2020-11-01 21:20:34,752:81025 ThreadPoolExecutor-0_0 (task-0): passed args :['tg1', 's3://qubole-ford/taxonomy_cs/test1/data/tg1']

2020-11-01 21:20:34,753:81025 ThreadPoolExecutor-0_1 (task-1): passed args :['tg2', 's3://qubole-ford/taxonomy_cs/test1/data/tg2']

2020-11-01 21:20:34,753:81025 ThreadPoolExecutor-0_0 (task-0): running

2020-11-01 21:20:34,754:81025 ThreadPoolExecutor-0_1 (task-1): running

2020-11-01 21:20:34,754:81025 ThreadPoolExecutor-0_2 (task-2): passed args :['tg3', 's3://qubole-ford/taxonomy_cs/test1/data/tg3']

2020-11-01 21:20:34,755:81025 ThreadPoolExecutor-0_3 (task-3): passed args :['tg5', 's3://qubole-ford/taxonomy_cs/test1/data/tg5']

2020-11-01 21:20:34,756:81025 ThreadPoolExecutor-0_4 (task-4): passed args :['tg6', 's3://qubole-ford/taxonomy_cs/test1/data/tg6']

2020-11-01 21:20:34,758:81025 ThreadPoo

({'tg2': 'key_a2,target_a2',
  'tg7': 'key_a7,target_a7',
  'tg6': 'key_a6,target_a6',
  'tg5': 'key_a5,target_a5',
  'tg3': 'key_a3,target_a3',
  'tg1': 'key_a1,target_a1'},
 {'tg2': {'tg2_2020-11-01_ford.csv': {'Key': 'taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 29, 19, 54, 36, tzinfo=tzlocal()),
    'ETag': '"df46527d151cbac56cf5d648af64f146"',
    'Size': 48,
    'StorageClass': 'STANDARD'},
   'tg2_2020-11-02_ford.csv': {'Key': 'taxonomy_cs/test1/data/tg2/tg2_2020-11-02_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 29, 19, 54, 36, tzinfo=tzlocal()),
    'ETag': '"df46527d151cbac56cf5d648af64f146"',
    'Size': 48,
    'StorageClass': 'STANDARD'}},
  'tg7': {'tg7_2020-11-01_ford.csv': {'Key': 'taxonomy_cs/test1/data/tg7/tg7_2020-11-01_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 29, 19, 54, 36, tzinfo=tzlocal()),
    'ETag': '"653eec710cdbf86149efb89f21912022"',
    'Size': 48,
    'StorageClass': '

In [9]:
from service.taxonomy_cs_api import Taxonomy_CS_API

In [10]:
tcAPI = Taxonomy_CS_API()

In [11]:
tcAPI.extract_data_detail()

2020-11-01 21:20:34,983:81025 MainThread run_blocking_tasks: starting

2020-11-01 21:20:34,984:81025 MainThread run_blocking_tasks: creating executor tasks

2020-11-01 21:20:34,985:81025 ThreadPoolExecutor-1_0 (task-0): passed args :['AdvertiserReporting', 's3://qubole-ford/taxonomy_cs/lmt/data/AdvertiserReporting']

2020-11-01 21:20:34,985:81025 ThreadPoolExecutor-1_1 (task-1): passed args :['ChannelGrouping', 's3://qubole-ford/taxonomy_cs/lmt/data/ChannelGrouping']

2020-11-01 21:20:34,986:81025 MainThread run_blocking_tasks: waiting for executor tasks

2020-11-01 21:20:34,986:81025 ThreadPoolExecutor-1_0 (task-0): running

2020-11-01 21:20:34,987:81025 ThreadPoolExecutor-1_1 (task-1): running

2020-11-01 21:20:35,044:81025 ThreadPoolExecutor-1_0 (task-0): done

2020-11-01 21:20:35,066:81025 ThreadPoolExecutor-1_1 (task-1): done

2020-11-01 21:20:35,067:81025 MainThread run_blocking_tasks: exiting



{'tg_data_schema_dict': {'AdvertiserReporting': 'key_evt_advertiser_key,target_evt_advertiser_name',
  'ChannelGrouping': 'key_evt_source,target_channel'},
 'tg_data_files_dict': {'AdvertiserReporting': {'AdvertiserReporting_2020-06-01_ford.csv': {'Key': 'taxonomy_cs/lmt/data/AdvertiserReporting/AdvertiserReporting_2020-06-01_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 20, 21, 52, 9, tzinfo=tzlocal()),
    'ETag': '"04234605e8b4354998074abae5c74ae9"',
    'Size': 71,
    'StorageClass': 'STANDARD'},
   'AdvertiserReporting_2020-06-02_ford.csv': {'Key': 'taxonomy_cs/lmt/data/AdvertiserReporting/AdvertiserReporting_2020-06-02_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 20, 21, 50, 55, tzinfo=tzlocal()),
    'ETag': '"f33832d48d54ec287f1b486526c197bf"',
    'Size': 57,
    'StorageClass': 'STANDARD'}},
  'ChannelGrouping': {'ChannelGrouping_2020-06-01_ford.csv': {'Key': 'taxonomy_cs/lmt/data/ChannelGrouping/ChannelGrouping_2020-06-01_ford.csv',
    'LastModif

## SRC File processed

In [12]:
tg_data = extract_data_detail(lmt_src, lmt_data, ACCESS_KEY, SECRET_KEY)
tg_data_schema_dict = tg_data[0]
tg_data_files_dict = tg_data[1]
target_data_tg_dict = tg_data[2]

2020-11-01 21:20:35,092:81025 MainThread run_blocking_tasks: starting

2020-11-01 21:20:35,092:81025 MainThread run_blocking_tasks: creating executor tasks

2020-11-01 21:20:35,093:81025 ThreadPoolExecutor-2_0 (task-0): passed args :['tg1', 's3://qubole-ford/taxonomy_cs/test1/data/tg1']

2020-11-01 21:20:35,093:81025 ThreadPoolExecutor-2_1 (task-1): passed args :['tg2', 's3://qubole-ford/taxonomy_cs/test1/data/tg2']

2020-11-01 21:20:35,094:81025 ThreadPoolExecutor-2_2 (task-2): passed args :['tg3', 's3://qubole-ford/taxonomy_cs/test1/data/tg3']

2020-11-01 21:20:35,094:81025 ThreadPoolExecutor-2_3 (task-3): passed args :['tg5', 's3://qubole-ford/taxonomy_cs/test1/data/tg5']

2020-11-01 21:20:35,094:81025 ThreadPoolExecutor-2_0 (task-0): running

2020-11-01 21:20:35,095:81025 ThreadPoolExecutor-2_4 (task-4): passed args :['tg6', 's3://qubole-ford/taxonomy_cs/test1/data/tg6']

2020-11-01 21:20:35,095:81025 ThreadPoolExecutor-2_1 (task-1): running

2020-11-01 21:20:35,095:81025 ThreadPoo

In [13]:
target_data_tg_dict

{'target_a7': 'tg7',
 'target_a6': 'tg6',
 'target_a1': 'tg1',
 'target_a2': 'tg2',
 'target_a5': 'tg5',
 'target_a3': 'tg3'}

In [14]:
def is_valid_file(key:str='', regex = VALID_FILE_KEY_REGEX):
    if re.match(regex, key) is None:
        return False
    return True

def extract_info(key:str='', regex = VALID_FILE_KEY_REGEX):
    matched = re.findall(regex, key)
    return {
            'KeyDirPath' : matched[0][0],
            'ParentDir' : matched[0][1],
            'FileName' : matched[0][2],
            'FileGrp' :  matched[0][3],
            'Date' :  matched[0][4],
            'ClientName' : matched[0][5]
           }
def extract_info_with_bucket(key:str='', bucket = ''):
    res = extract_info(key)
    res.update({'Bucket' : bucket})
    return res

In [15]:
def grouped_tg(collected, tg_files_dict_type='new_tg_files_dict'):
    collect = defaultdict(dict)
    tg_f_gen = (item['result'][tg_files_dict_type] for item in collected if len(item['result'][tg_files_dict_type]) > 0)
    tg_f_gen2 = (collect[tg].update({filename: file_dict})  for item in tg_f_gen for tg, file_detail_dict in item.items() for filename, file_dict in file_detail_dict.items())
    [ i for i in tg_f_gen2]
    tg = dict(collect)
    return tg

def grouped_flag_dict(collected, flag_dict_type='schema_tg_dict'):
    f_gen = (item['result'][flag_dict_type] for item in collected if len(item['result'][flag_dict_type]) > 0)
    collect = defaultdict(set)
    f_gen2 = (collect[K].add(V)  for item in f_gen for K, V in item.items())
    [ i for i in f_gen2]
    res = dict(collect)
    return res

def grouped_set_of_flags_dict(collected, flag_dict_type='schema_tg_dict'):
    f_gen = (item['result'][flag_dict_type] for item in collected if len(item['result'][flag_dict_type]) > 0)
    collect = defaultdict(set)
    f_gen2 = (collect[K].update(V)  for item in f_gen for K, V in item.items())
    [ i for i in f_gen2]
    res = dict(collect)
    return res

def grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'):
    res_set=set()
    f_gen = (res_set.update(item['result'][flag_dict_type]) for item in collected if len(item['result'][flag_dict_type]) > 0)
    [ i for i in f_gen]
    return res_set

In [16]:
# def file_process_task(src_file_details):
#     invalid_schema_files = set()

#     ''' {'key_evt_advertiser_key,target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
#     schema_tg_dict = {}
    
#     ''' {'target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
#     target_tg_dict = {}

#     ''' {'tg': {'key_evt_advertiser_key,target_evt_advertiser_name', '',...}}'''
#     new_tg_schema_dict = {}
#     ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
#     new_tg_files_dict = {}
#     ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
#     existing_tg_files_dict = {}


# #     tg_data_schema_dict = {}
# #     tg_data_files_dict = {}

# #     src_file_details = valid_file_arg[0]
#     src_file_loc = s3_ops.get_full_s3_path(src_file_details['Bucket'], src_file_details['Key'])

#     s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
#     schema =  s3_stream.get_header(src_file_loc)
#     #schema = 'key_evt_advertiser_key, targe_evt_advertiser_name'
#     validate_res = validate_schema(schema)
#     if validate_res['IsValid']:
#         src_file_details['Schema'] = validate_res['Schema']
#         tg = src_file_details['FileGrp']
#         file_name = src_file_details['FileName']
#         schema_tg_dict[validate_res['Schema']] = tg
#         target_tg_dict[validate_res['TargetCol']] = tg
#         if tg_data_schema_dict.get(tg) is None:
#             new_tg_schema_dict[tg] = validate_res['Schema']
#             new_tg_files_dict[tg] = {file_name: src_file_details}
#         else:
#             existing_tg_files_dict[tg] = {file_name: src_file_details}

#     else:
#         invalid_schema_files.add((src_file_loc, schema, validate_res['errors']))

#     return {'invalid_schema_files': invalid_schema_files,
#             'schema_tg_dict': schema_tg_dict,
#             'target_tg_dict':target_tg_dict,
#             'new_tg_schema_dict': new_tg_schema_dict,
#             'new_tg_files_dict' : new_tg_files_dict,
#             'existing_tg_files_dict' : existing_tg_files_dict
#            }


def file_process_task(src_file_details):
    
    invalid_schema_files = set()

    target_already_exist_files = set()
    
    ''' {'key_evt_advertiser_key,target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
    schema_tg_dict = {}
    
    ''' {'target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
    target_tg_dict = {}

    ''' {'tg': {'key_evt_advertiser_key,target_evt_advertiser_name', '',...}}'''
    new_tg_schema_dict = {}
    ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
    new_tg_files_dict = {}
    ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
    existing_tg_files_dict = {}


    # tg_data_schema_dict = 
    # tg_data_files_dict = 
    # target_data_tg_dict = 

#     src_file_details = valid_file_arg[0]
    src_file_loc = s3_ops.get_full_s3_path(src_file_details['Bucket'], src_file_details['Key'])

    s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
    schema =  s3_stream.get_header(src_file_loc)
    #schema = 'key_evt_advertiser_key, targe_evt_advertiser_name'
    validate_res = validate_schema(schema)
    if validate_res['IsValid']:
        
        
        
        tg = src_file_details['FileGrp']
        file_name = src_file_details['FileName']
        
        if tg_data_schema_dict.get(tg) is None or tg_data_schema_dict.get(tg) != validate_res['Schema']:
                
#             data_tg_for_target = target_data_tg_dict.get(validate_res['TargetCol'])
#             if  data_tg_for_target is not None:# and data_tg_for_target != tg:
#                 target_already_exist_files.add((src_file_loc, data_tg_for_target))
#             else:
            new_tg_schema_dict[tg] = validate_res['Schema']
            new_tg_files_dict[tg] = {file_name: src_file_details}
        else:
            existing_tg_files_dict[tg] = {file_name: src_file_details}
        
        src_file_details['Schema'] = validate_res['Schema']
        schema_tg_dict[validate_res['Schema']] = tg
        target_tg_dict[validate_res['TargetCol']] = tg

    else:
        invalid_schema_files.add((src_file_loc, schema, validate_res['errors']))

    return {'invalid_schema_files': invalid_schema_files,
            'target_already_exist_files':target_already_exist_files,
            'schema_tg_dict': schema_tg_dict,
            'target_tg_dict':target_tg_dict,
            'new_tg_schema_dict': new_tg_schema_dict,
            'new_tg_files_dict' : new_tg_files_dict,
            'existing_tg_files_dict' : existing_tg_files_dict
           }



def src_list_page_process_task(list_page):
    
    lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
    lmt_src_loc_bucket = lmt_src_loc_detail['bucket']
    lmt_src_loc_key = lmt_src_loc_detail['key']
    
    invalid_files_set = { s3_ops.get_full_s3_path(lmt_src_loc_detail['bucket'], item['Key']) for item in list_page if  not is_valid_file(key=item['Key'])}
    valid_file_set = [[utils.dict_append(extract_info_with_bucket(item['Key'], lmt_src_loc_detail['bucket']),item)] for item in list_page if  is_valid_file(key=item['Key']) ]
    collected = NIO.decorated_run_io(task=file_process_task, task_n_args_list=valid_file_set, max_workers=25,)
#     return collected
    return {'invalid_files_set' : invalid_files_set,
            'invalid_schema_files': grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'),
            'target_already_exist_files' : grouped_set_of_flags(collected, flag_dict_type='target_already_exist_files'),
            'schema_tg_dict': grouped_flag_dict(collected, flag_dict_type='schema_tg_dict'),
            'target_tg_dict': grouped_flag_dict(collected, flag_dict_type='target_tg_dict'),
            'new_tg_schema_dict': grouped_flag_dict(collected, flag_dict_type='new_tg_schema_dict'),
            'new_tg_files_dict' : grouped_tg(collected, 'new_tg_files_dict'),
            'existing_tg_files_dict' : grouped_tg(collected, 'existing_tg_files_dict')
           }

In [17]:
lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
page_generator = s3_ops.list_gen(lmt_src_loc_detail['bucket'],lmt_src_loc_detail['key'], maxKeysPerReq=12, )
list_page = [i for i in page_generator][0]

src_list_page_process_task(list_page)

2020-11-01 21:20:35,298:81025 MainThread run_blocking_tasks: starting

2020-11-01 21:20:35,299:81025 MainThread run_blocking_tasks: creating executor tasks

2020-11-01 21:20:35,300:81025 ThreadPoolExecutor-3_0 (task-0): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg0_2020-11-02_ford.csv', 'FileGrp': 'tg0', 'Date': '2020-11-02', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg0_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"7a5d08cbb4c718d16851d1f2b57ffc50"', 'Size': 27, 'StorageClass': 'STANDARD'}]

2020-11-01 21:20:35,300:81025 ThreadPoolExecutor-3_1 (task-1): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg0_2020-11-03_ford.csv', 'FileGrp': 'tg0', 'Date': '2020-11-03', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv', 'LastModified': datetime.datetime(20

{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/test1/src/tg0_202-11-01_ford.csv'},
 'invalid_schema_files': {('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv',
   'key_a0, targe_a0',
   'Exact one Target column is required! \nAll given columns should Key or Target!')},
 'target_already_exist_files': set(),
 'schema_tg_dict': {'key_a1,target_a1': {'tg1'},
  'key_a10,target_a9': {'tg10'},
  'key_a4,target_a4': {'tg4'},
  'key_a0,target_a0': {'tg0'},
  'key_a21,target_a21': {'tg2'},
  'key_a9,target_a9': {'tg11'}},
 'target_tg_dict': {'target_a1': {'tg1'},
  'target_a9': {'tg10', 'tg11'},
  'target_a4': {'tg4'},
  'target_a0': {'tg0'},
  'target_a21': {'tg2'}},
 'new_tg_schema_dict': {'tg10': {'key_a10,target_a9'},
  'tg4': {'key_a4,target_a4'},
  'tg0': {'key_a0,target_a0'},
  'tg2': {'key_a21,target_a21'},
  'tg11': {'key_a9,target_a9'}},
 'new_tg_files_dict': {'tg10': {'tg10_2020-11-01_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
    'ParentDir': 'src',
   

In [18]:
def extract_src_detail1():
    lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
    lmt_src_loc_bucket = lmt_src_loc_detail['bucket']
    lmt_src_loc_key = lmt_src_loc_detail['key']
    page_generator = s3_ops.list_gen(lmt_src_loc_bucket, lmt_src_loc_key, maxKeysPerReq=3, )
    page_args_generator = ([page] for page in page_generator)
    #list_page = [i for i in page_generator][0]
    collected = NIO.decorated_run_with_args_generator(task=src_list_page_process_task, args_generator=page_args_generator, is_kernal_thread=False,)
    return collected


def extract_src_detail(maxKeysPerReq=3):
    lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
    lmt_src_loc_bucket = lmt_src_loc_detail['bucket']
    lmt_src_loc_key = lmt_src_loc_detail['key']
    page_generator = s3_ops.list_gen(lmt_src_loc_bucket, lmt_src_loc_key, maxKeysPerReq=maxKeysPerReq, )
    page_args_generator = ([page] for page in page_generator)
    #list_page = [i for i in page_generator][0]
    collected = NIO.decorated_run_with_args_generator(task=src_list_page_process_task, args_generator=page_args_generator, is_kernal_thread=True,)
    
    return {'invalid_files_set' : grouped_set_of_flags(collected, flag_dict_type='invalid_files_set'),
            'invalid_schema_files': grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'),
            'target_already_exist_files' : grouped_set_of_flags(collected, flag_dict_type='target_already_exist_files'),
            'schema_tg_dict': grouped_set_of_flags_dict(collected, flag_dict_type='schema_tg_dict'),
            'target_tg_dict': grouped_set_of_flags_dict(collected, flag_dict_type='target_tg_dict'),
            'new_tg_schema_dict': grouped_set_of_flags_dict(collected, flag_dict_type='new_tg_schema_dict'),
            'new_tg_files_dict' : grouped_tg(collected, 'new_tg_files_dict'),
            'existing_tg_files_dict' : grouped_tg(collected, 'existing_tg_files_dict')
               }

In [19]:
res432 = extract_src_detail1()
res432

2020-11-01 21:20:35,467:81025 MainThread run_blocking_tasks: starting

2020-11-01 21:20:35,467:81025 MainThread run_blocking_tasks: creating executor tasks

2020-11-01 21:20:35,487:81025 ThreadPoolExecutor-4_0 (task-0): passed args :[[{'Key': 'taxonomy_cs/test1/src/tg0_202-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"535b60451f6d20c2826b045438a50fb9"', 'Size': 48, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg0_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"7a5d08cbb4c718d16851d1f2b57ffc50"', 'Size': 27, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 21, 21, 36, tzinfo=tzlocal()), 'ETag': '"b19c288a2ef5e2ec6739cac3674391a6"', 'Size': 28, 'StorageClass': 'STANDARD'}]]

2020-11-01 21:20:35,490:81025 ThreadPoolExecutor-4_0 (task-0): running

2020-11-01 21:20:35,491

2020-11-01 21:20:35,586:81025 ThreadPoolExecutor-6_0 (task-0): done

2020-11-01 21:20:35,590:81025 ThreadPoolExecutor-6_1 (task-1): done

2020-11-01 21:20:35,590:81025 ThreadPoolExecutor-4_3 (task-3): running

2020-11-01 21:20:35,591:81025 ThreadPoolExecutor-6_2 (task-2): done

2020-11-01 21:20:35,592:81025 ThreadPoolExecutor-5_1 (task-1): done

2020-11-01 21:20:35,595:81025 ThreadPoolExecutor-4_4 (task-4): passed args :[[{'Key': 'taxonomy_cs/test1/src/tg4_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"ee6aeaee97c71bc6e4c3cea71ad78e35"', 'Size': 48, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg5_2020-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"471c56a20b21f692659b2f5c68c0b713"', 'Size': 48, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg5_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlo

2020-11-01 21:20:35,664:81025 ThreadPoolExecutor-9_2 (task-2): running

2020-11-01 21:20:35,666:81025 ThreadPoolExecutor-4_6 (task-6): running

2020-11-01 21:20:35,667:81025 ThreadPoolExecutor-4_0 (task-7): running

2020-11-01 21:20:35,681:81025 ThreadPoolExecutor-8_0 (task-0): done

2020-11-01 21:20:35,689:81025 ThreadPoolExecutor-8_1 (task-1): done

2020-11-01 21:20:35,689:81025 ThreadPoolExecutor-10_0 (task-0): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg5_2020-11-03_ford.csv', 'FileGrp': 'tg5', 'Date': '2020-11-03', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg5_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"471c56a20b21f692659b2f5c68c0b713"', 'Size': 48, 'StorageClass': 'STANDARD'}]

2020-11-01 21:20:35,690:81025 ThreadPoolExecutor-10_1 (task-1): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg5_20

[{'id': 'task-7',
  'task': <function __main__.src_list_page_process_task(list_page)>,
  'args': [[{'Key': 'taxonomy_cs/test1/src/tg7_2020-11-03_ford.csv',
     'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
     'ETag': '"d54b283d90621ca6b19c85f4c96d4b8f"',
     'Size': 49,
     'StorageClass': 'STANDARD'},
    {'Key': 'taxonomy_cs/test1/src/tg8_2020-11-01_ford.csv',
     'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
     'ETag': '"8d575874cb97b2d601ae8542aaf11431"',
     'Size': 48,
     'StorageClass': 'STANDARD'},
    {'Key': 'taxonomy_cs/test1/src/tg9_2020-11-01_ford.csv',
     'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
     'ETag': '"b83eaf4009dc42dd2a744fad592339f9"',
     'Size': 48,
     'StorageClass': 'STANDARD'}]],
  'result': {'invalid_files_set': set(),
   'invalid_schema_files': set(),
   'target_already_exist_files': set(),
   'schema_tg_dict': {'key_a8,target_a7': {'tg8

In [20]:
[i['result'] for i in res432 ]

# def grouped_set_of_flags_dict1(collected, flag_dict_type='schema_tg_dict'):
#     f_gen = (item['result'][flag_dict_type] for item in collected if len(item['result'][flag_dict_type]) > 0)
#     collect = defaultdict(set)
#     f_gen2 = (collect[K].update(V)  for item in f_gen for K, V in item.items())
#     [ i for i in f_gen2]
#     res = dict(collect)
#     return res

[{'invalid_files_set': set(),
  'invalid_schema_files': set(),
  'target_already_exist_files': set(),
  'schema_tg_dict': {'key_a8,target_a7': {'tg8'},
   'key_a7,target_a71': {'tg7'},
   'key_a9,target_a9': {'tg9'}},
  'target_tg_dict': {'target_a7': {'tg8'},
   'target_a71': {'tg7'},
   'target_a9': {'tg9'}},
  'new_tg_schema_dict': {'tg8': {'key_a8,target_a7'},
   'tg7': {'key_a7,target_a71'},
   'tg9': {'key_a9,target_a9'}},
  'new_tg_files_dict': {'tg8': {'tg8_2020-11-01_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
     'ParentDir': 'src',
     'FileName': 'tg8_2020-11-01_ford.csv',
     'FileGrp': 'tg8',
     'Date': '2020-11-01',
     'ClientName': 'ford',
     'Bucket': 'qubole-ford',
     'Key': 'taxonomy_cs/test1/src/tg8_2020-11-01_ford.csv',
     'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
     'ETag': '"8d575874cb97b2d601ae8542aaf11431"',
     'Size': 48,
     'StorageClass': 'STANDARD',
     'Schema': 'key_a8,target_a7'}},
   'tg7'

In [21]:
# collect = defaultdict(set)
# f_gen = (item['result']['target_tg_dict'] for item in res432 if len(item['result']['target_tg_dict']) > 0)
# [( K, V)  for item in f_gen for K, V in item.items()]
# #collect

In [22]:
len(res432)

8

In [23]:
# [ i['result'] for i in res432]

In [24]:
grouped_set_of_flags_dict(res432, flag_dict_type='schema_tg_dict')


{'key_a8,target_a7': {'tg8'},
 'key_a7,target_a71': {'tg7'},
 'key_a9,target_a9': {'tg11', 'tg9'},
 'key_a1,target_a1': {'tg1'},
 'key_a10,target_a9': {'tg10'},
 'key_a21,target_a21': {'tg2'},
 'key_a5,target_a5': {'tg5'},
 'key_a4,target_a4': {'tg4'},
 'key_a7,target_a7': {'tg7'},
 'key_a6,target_a61': {'tg6'},
 'key_a51,target_a51': {'tg5'},
 'key_a0,target_a0': {'tg0'}}

In [25]:
grouped_set_of_flags_dict(res432, flag_dict_type='target_tg_dict')

{'target_a7': {'tg7', 'tg8'},
 'target_a71': {'tg7'},
 'target_a9': {'tg10', 'tg11', 'tg9'},
 'target_a1': {'tg1'},
 'target_a21': {'tg2'},
 'target_a5': {'tg5'},
 'target_a4': {'tg4'},
 'target_a61': {'tg6'},
 'target_a51': {'tg5'},
 'target_a0': {'tg0'}}

In [26]:
grouped_set_of_flags_dict(res432, flag_dict_type='new_tg_schema_dict')

{'tg8': {'key_a8,target_a7'},
 'tg7': {'key_a7,target_a71'},
 'tg9': {'key_a9,target_a9'},
 'tg11': {'key_a9,target_a9'},
 'tg10': {'key_a10,target_a9'},
 'tg2': {'key_a21,target_a21'},
 'tg4': {'key_a4,target_a4'},
 'tg6': {'key_a6,target_a61'},
 'tg5': {'key_a51,target_a51'},
 'tg0': {'key_a0,target_a0'}}

In [27]:
grouped_tg(res432, 'new_tg_files_dict')

{'tg8': {'tg8_2020-11-01_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
   'ParentDir': 'src',
   'FileName': 'tg8_2020-11-01_ford.csv',
   'FileGrp': 'tg8',
   'Date': '2020-11-01',
   'ClientName': 'ford',
   'Bucket': 'qubole-ford',
   'Key': 'taxonomy_cs/test1/src/tg8_2020-11-01_ford.csv',
   'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
   'ETag': '"8d575874cb97b2d601ae8542aaf11431"',
   'Size': 48,
   'StorageClass': 'STANDARD',
   'Schema': 'key_a8,target_a7'}},
 'tg7': {'tg7_2020-11-03_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
   'ParentDir': 'src',
   'FileName': 'tg7_2020-11-03_ford.csv',
   'FileGrp': 'tg7',
   'Date': '2020-11-03',
   'ClientName': 'ford',
   'Bucket': 'qubole-ford',
   'Key': 'taxonomy_cs/test1/src/tg7_2020-11-03_ford.csv',
   'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
   'ETag': '"d54b283d90621ca6b19c85f4c96d4b8f"',
   'Size': 49,
   'StorageClass': 'STANDARD',
   'Schema'

In [28]:
grouped_tg(res432, 'existing_tg_files_dict')

{'tg1': {'tg1_2020-11-01_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
   'ParentDir': 'src',
   'FileName': 'tg1_2020-11-01_ford.csv',
   'FileGrp': 'tg1',
   'Date': '2020-11-01',
   'ClientName': 'ford',
   'Bucket': 'qubole-ford',
   'Key': 'taxonomy_cs/test1/src/tg1_2020-11-01_ford.csv',
   'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
   'ETag': '"e74387593f23233a61d30b719b79a381"',
   'Size': 48,
   'StorageClass': 'STANDARD',
   'Schema': 'key_a1,target_a1'},
  'tg1_2020-11-02_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
   'ParentDir': 'src',
   'FileName': 'tg1_2020-11-02_ford.csv',
   'FileGrp': 'tg1',
   'Date': '2020-11-02',
   'ClientName': 'ford',
   'Bucket': 'qubole-ford',
   'Key': 'taxonomy_cs/test1/src/tg1_2020-11-02_ford.csv',
   'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
   'ETag': '"e74387593f23233a61d30b719b79a381"',
   'Size': 48,
   'StorageClass': 'STANDARD',
   'Schema': 'key_a

In [29]:
grouped_set_of_flags(res432, 'invalid_schema_files')

{('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv',
  'key_a0, targe_a0',
  'Exact one Target column is required! \nAll given columns should Key or Target!')}

In [30]:
grouped_set_of_flags(res432, 'invalid_files_set')

{'s3://qubole-ford/taxonomy_cs/test1/src/tg0_202-11-01_ford.csv'}

In [31]:
res_final = extract_src_detail()

2020-11-01 21:20:35,979   process-id:81025 run_blocking_tasks: starting

2020-11-01 21:20:35,980   process-id:81025 run_blocking_tasks: creating executor tasks

2020-11-01 21:20:36,036   process-id:81095   (task-0): passed args :[[{'Key': 'taxonomy_cs/test1/src/tg0_202-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"535b60451f6d20c2826b045438a50fb9"', 'Size': 48, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg0_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"7a5d08cbb4c718d16851d1f2b57ffc50"', 'Size': 27, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 21, 21, 36, tzinfo=tzlocal()), 'ETag': '"b19c288a2ef5e2ec6739cac3674391a6"', 'Size': 28, 'StorageClass': 'STANDARD'}]]

2020-11-01 21:20:36,039   process-id:81095   (task-0): running

2020-11-01 21:20:36,042:81095 MainT

2020-11-01 21:20:36,098:81098 ThreadPoolExecutor-13_1 (task-1): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg4_2020-11-01_ford.csv', 'FileGrp': 'tg4', 'Date': '2020-11-01', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg4_2020-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"ee6aeaee97c71bc6e4c3cea71ad78e35"', 'Size': 48, 'StorageClass': 'STANDARD'}]

2020-11-01 21:20:36,099:81098 ThreadPoolExecutor-13_2 (task-2): passed args :[{'KeyDirPath': 'taxonomy_cs/test1/src/', 'ParentDir': 'src', 'FileName': 'tg4_2020-11-02_ford.csv', 'FileGrp': 'tg4', 'Date': '2020-11-02', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/test1/src/tg4_2020-11-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"ee6aeaee97c71bc6e4c3cea71ad78e35"', 'Size': 48, 'StorageClass': 'STANDARD'}]

2020-11-01 21:20:36,099:

2020-11-01 21:20:36,159:81095 ThreadPoolExecutor-13_0 (task-0): done

2020-11-01 21:20:36,162   process-id:81025 run_blocking_tasks: waiting for executor tasks

2020-11-01 21:20:36,163:81095 ThreadPoolExecutor-13_1 (task-1): done

2020-11-01 21:20:36,166:81095 MainThread run_blocking_tasks: exiting

2020-11-01 21:20:36,168   process-id:81095   (task-0): done

2020-11-01 21:20:36,164   process-id:81102   (task-7): passed args :[[{'Key': 'taxonomy_cs/test1/src/tg7_2020-11-03_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"d54b283d90621ca6b19c85f4c96d4b8f"', 'Size': 49, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg8_2020-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()), 'ETag': '"8d575874cb97b2d601ae8542aaf11431"', 'Size': 48, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/test1/src/tg9_2020-11-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinf

In [32]:
res_final

{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/test1/src/tg0_202-11-01_ford.csv'},
 'invalid_schema_files': {('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv',
   'key_a0, targe_a0',
   'Exact one Target column is required! \nAll given columns should Key or Target!')},
 'target_already_exist_files': set(),
 'schema_tg_dict': {'key_a6,target_a61': {'tg6'},
  'key_a51,target_a51': {'tg5'},
  'key_a5,target_a5': {'tg5'},
  'key_a21,target_a21': {'tg2'},
  'key_a1,target_a1': {'tg1'},
  'key_a4,target_a4': {'tg4'},
  'key_a9,target_a9': {'tg11', 'tg9'},
  'key_a10,target_a9': {'tg10'},
  'key_a7,target_a7': {'tg7'},
  'key_a8,target_a7': {'tg8'},
  'key_a7,target_a71': {'tg7'},
  'key_a0,target_a0': {'tg0'}},
 'target_tg_dict': {'target_a61': {'tg6'},
  'target_a51': {'tg5'},
  'target_a5': {'tg5'},
  'target_a21': {'tg2'},
  'target_a1': {'tg1'},
  'target_a4': {'tg4'},
  'target_a9': {'tg10', 'tg11', 'tg9'},
  'target_a7': {'tg7', 'tg8'},
  'target_a71': {'tg7'},
  

In [33]:
src_delta = res_final#tcAPI.extract_src_detail(is_kernal_thread = False)

In [34]:
src_delta

{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/test1/src/tg0_202-11-01_ford.csv'},
 'invalid_schema_files': {('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv',
   'key_a0, targe_a0',
   'Exact one Target column is required! \nAll given columns should Key or Target!')},
 'target_already_exist_files': set(),
 'schema_tg_dict': {'key_a6,target_a61': {'tg6'},
  'key_a51,target_a51': {'tg5'},
  'key_a5,target_a5': {'tg5'},
  'key_a21,target_a21': {'tg2'},
  'key_a1,target_a1': {'tg1'},
  'key_a4,target_a4': {'tg4'},
  'key_a9,target_a9': {'tg11', 'tg9'},
  'key_a10,target_a9': {'tg10'},
  'key_a7,target_a7': {'tg7'},
  'key_a8,target_a7': {'tg8'},
  'key_a7,target_a71': {'tg7'},
  'key_a0,target_a0': {'tg0'}},
 'target_tg_dict': {'target_a61': {'tg6'},
  'target_a51': {'tg5'},
  'target_a5': {'tg5'},
  'target_a21': {'tg2'},
  'target_a1': {'tg1'},
  'target_a4': {'tg4'},
  'target_a9': {'tg10', 'tg11', 'tg9'},
  'target_a7': {'tg7', 'tg8'},
  'target_a71': {'tg7'},
  

In [35]:
tg_data_schema_dict
#tg_data_files_dict
#target_data_tg_dict = {re.findall(TARGET_EXTRACT_REGEX,V)[0]: K for K, V in tg_data_schema_dict.items()}

{'tg7': 'key_a7,target_a7',
 'tg6': 'key_a6,target_a6',
 'tg1': 'key_a1,target_a1',
 'tg2': 'key_a2,target_a2',
 'tg5': 'key_a5,target_a5',
 'tg3': 'key_a3,target_a3'}

In [36]:
{ k for k in src_delta.keys()}

{'existing_tg_files_dict',
 'invalid_files_set',
 'invalid_schema_files',
 'new_tg_files_dict',
 'new_tg_schema_dict',
 'schema_tg_dict',
 'target_already_exist_files',
 'target_tg_dict'}

In [37]:
{ k for k in tg_data_files_dict.keys()}

{'tg1', 'tg2', 'tg3', 'tg5', 'tg6', 'tg7'}

In [38]:
{ k for k in src_delta['existing_tg_files_dict'].keys()}

{'tg1', 'tg5', 'tg7'}

In [39]:
{ k for k in src_delta['new_tg_files_dict'].keys()}

{'tg0', 'tg10', 'tg11', 'tg2', 'tg4', 'tg5', 'tg6', 'tg7', 'tg8', 'tg9'}

In [40]:
[v for k, v in src_delta['schema_tg_dict'].items() if len(v) > 1]

[{'tg11', 'tg9'}]

In [41]:
[v for k, v in src_delta['target_tg_dict'].items() if len(v) > 1]

[{'tg10', 'tg11', 'tg9'}, {'tg7', 'tg8'}]

In [42]:
[v for k, v in src_delta['new_tg_schema_dict'].items() if len(v) > 0]

[{'key_a6,target_a61'},
 {'key_a51,target_a51'},
 {'key_a21,target_a21'},
 {'key_a4,target_a4'},
 {'key_a9,target_a9'},
 {'key_a10,target_a9'},
 {'key_a9,target_a9'},
 {'key_a8,target_a7'},
 {'key_a7,target_a71'},
 {'key_a0,target_a0'}]

In [43]:
src_delta['invalid_schema_files']

{('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv',
  'key_a0, targe_a0',
  'Exact one Target column is required! \nAll given columns should Key or Target!')}

In [44]:
#test
#newtg4schema = {}#defaultdict(set)
#{k for k, v in src_delta['new_tg_schema_dict'].items() if len(v) > 0}
#newtg4schema

## Transpose dict
def transpose_dict(dict_arg):
    res = defaultdict(set)
    [res[t].add(k) for k, v in dict_arg.items() for t in v]
    return dict(res)

    
transpose_dict(src_delta['target_tg_dict'])
# tg_target_dict = defaultdict(set)
# [(t, v, k) for k, v in src_delta['target_tg_dict'].items() for t in v]

# [tg_target_dict[t].add(k) for k, v in src_delta['target_tg_dict'].items() for t in v]

# dict(tg_target_dict)



{'tg6': {'target_a61'},
 'tg5': {'target_a5', 'target_a51'},
 'tg2': {'target_a21'},
 'tg1': {'target_a1'},
 'tg4': {'target_a4'},
 'tg11': {'target_a9'},
 'tg10': {'target_a9'},
 'tg9': {'target_a9'},
 'tg7': {'target_a7', 'target_a71'},
 'tg8': {'target_a7'},
 'tg0': {'target_a0'}}

In [45]:
tg_data = { k for k in tg_data_files_dict.keys()}
tg_existing = { k for k in src_delta['existing_tg_files_dict'].keys()}
tg_new ={ k for k in src_delta['new_tg_files_dict'].keys()}
tg_delta = tg_new.union(tg_existing)
tg_dropped = tg_data.difference(tg_delta)
many_tg4schema_check_gen = (v for k, v in src_delta['schema_tg_dict'].items() if len(v) > 1)
many_tg4target_check_gen = (v for k, v in src_delta['target_tg_dict'].items() if len(v) > 1)
newTg4schema = {k for k, v in src_delta['new_tg_schema_dict'].items() if len(v) > 1}

In [46]:
tg4schema = set()
[tg4schema.update(i) for i in many_tg4schema_check_gen]
tg4target = set()
[tg4target.update(i) for i in many_tg4target_check_gen]

[None, None]

In [47]:
tg4schema

{'tg11', 'tg9'}

In [48]:
#newTg4schema = {tg for schema in schema4tg for tg in src_delta['schema_tg_dict'][schema]}

In [49]:
newTg4schema

set()

In [50]:
tg4target

{'tg10', 'tg11', 'tg7', 'tg8', 'tg9'}

In [51]:
#def extract_delta():
invalid_files_set = src_delta['invalid_files_set']
invalid_schema_files = src_delta['invalid_schema_files']

tg_data = { k for k in tg_data_files_dict.keys()}
tg_existing = { k for k in src_delta['existing_tg_files_dict'].keys()}
tg_new ={ k for k in src_delta['new_tg_files_dict'].keys()}
tg_all = tg_new.union(tg_existing)

many_tg4schema_check_gen = (v for k, v in src_delta['schema_tg_dict'].items() if len(v) > 1)
many_tg4target_check_gen = (v for k, v in src_delta['target_tg_dict'].items() if len(v) > 1)
newTg4schema = {k for k, v in src_delta['new_tg_schema_dict'].items() if len(v) > 1}


tg4schema = set()
[tg4schema.update(i) for i in many_tg4schema_check_gen]
tg4target = set()
[tg4target.update(i) for i in many_tg4target_check_gen]


invalid_tg_with_dup_schema = (tg4schema.union(newTg4schema)).difference(tg_existing)

invalid_tg_with_dup_target = tg4target.difference(tg_existing)

invalid_tg_all = invalid_tg_with_dup_schema.union(invalid_tg_with_dup_target)

tg_delta = tg_new.difference(invalid_tg_all)

tg_delta_create = tg_delta.difference(tg_data)

tg_delta_drop_n_create = (tg_delta.intersection(tg_data)).difference(tg_existing)

tg_dropped = tg_data.difference(tg_all)

tg_dropped_all = tg_dropped.union(tg_delta_drop_n_create)
tg_create_all = tg_delta_create.union(tg_delta_drop_n_create)
    

In [52]:
tg_new

{'tg0', 'tg10', 'tg11', 'tg2', 'tg4', 'tg5', 'tg6', 'tg7', 'tg8', 'tg9'}

In [53]:
tg_existing

{'tg1', 'tg5', 'tg7'}

In [54]:
tg_data

{'tg1', 'tg2', 'tg3', 'tg5', 'tg6', 'tg7'}

In [55]:
invalid_tg_with_dup_schema

{'tg11', 'tg9'}

In [56]:
tg4schema

{'tg11', 'tg9'}

In [57]:
invalid_tg_with_dup_target

{'tg10', 'tg11', 'tg8', 'tg9'}

In [58]:
invalid_tg_all

{'tg10', 'tg11', 'tg8', 'tg9'}

In [59]:
tg_delta

{'tg0', 'tg2', 'tg4', 'tg5', 'tg6', 'tg7'}

In [60]:
tg_delta_create

{'tg0', 'tg4'}

In [61]:
tg_delta_drop_n_create

{'tg2', 'tg6'}

In [62]:
tg_dropped

{'tg3'}

In [63]:
tg_existing

{'tg1', 'tg5', 'tg7'}

In [64]:
tg_dropped_all

{'tg2', 'tg3', 'tg6'}

In [65]:
tg_create_all

{'tg0', 'tg2', 'tg4', 'tg6'}

In [66]:
src_delta['existing_tg_files_dict']['tg1']

{'tg1_2020-11-02_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
  'ParentDir': 'src',
  'FileName': 'tg1_2020-11-02_ford.csv',
  'FileGrp': 'tg1',
  'Date': '2020-11-02',
  'ClientName': 'ford',
  'Bucket': 'qubole-ford',
  'Key': 'taxonomy_cs/test1/src/tg1_2020-11-02_ford.csv',
  'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
  'ETag': '"e74387593f23233a61d30b719b79a381"',
  'Size': 48,
  'StorageClass': 'STANDARD',
  'Schema': 'key_a1,target_a1'},
 'tg1_2020-11-01_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
  'ParentDir': 'src',
  'FileName': 'tg1_2020-11-01_ford.csv',
  'FileGrp': 'tg1',
  'Date': '2020-11-01',
  'ClientName': 'ford',
  'Bucket': 'qubole-ford',
  'Key': 'taxonomy_cs/test1/src/tg1_2020-11-01_ford.csv',
  'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
  'ETag': '"e74387593f23233a61d30b719b79a381"',
  'Size': 48,
  'StorageClass': 'STANDARD',
  'Schema': 'key_a1,target_a1'}}

In [67]:
src_delta['new_tg_files_dict']['tg6']

{'tg6_2020-11-01_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
  'ParentDir': 'src',
  'FileName': 'tg6_2020-11-01_ford.csv',
  'FileGrp': 'tg6',
  'Date': '2020-11-01',
  'ClientName': 'ford',
  'Bucket': 'qubole-ford',
  'Key': 'taxonomy_cs/test1/src/tg6_2020-11-01_ford.csv',
  'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
  'ETag': '"ae64f1e8ed00a66a125cfeee7223cfa2"',
  'Size': 49,
  'StorageClass': 'STANDARD',
  'Schema': 'key_a6,target_a61'},
 'tg6_2020-11-02_ford.csv': {'KeyDirPath': 'taxonomy_cs/test1/src/',
  'ParentDir': 'src',
  'FileName': 'tg6_2020-11-02_ford.csv',
  'FileGrp': 'tg6',
  'Date': '2020-11-02',
  'ClientName': 'ford',
  'Bucket': 'qubole-ford',
  'Key': 'taxonomy_cs/test1/src/tg6_2020-11-02_ford.csv',
  'LastModified': datetime.datetime(2020, 10, 29, 19, 53, 45, tzinfo=tzlocal()),
  'ETag': '"ae64f1e8ed00a66a125cfeee7223cfa2"',
  'Size': 49,
  'StorageClass': 'STANDARD',
  'Schema': 'key_a6,target_a61'}}

## Sync data files

In [262]:
#files_need_to_be_dropped = { f['Key']:i for i in  tg_dropped_all for fn, f in tg_data_files_dict.get(i).items()}
files_to_be_dropped = [ [f['Key']] for i in  tg_dropped_all for fn, f in tg_data_files_dict.get(i).items()]

files_to_be_dropped

[['taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv'],
 ['taxonomy_cs/test1/data/tg3/tg3_2020-11-02_ford.csv'],
 ['taxonomy_cs/test1/data/tg6/tg6_2020-11-01_ford.csv'],
 ['taxonomy_cs/test1/data/tg6/tg6_2020-11-02_ford.csv'],
 ['taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv'],
 ['taxonomy_cs/test1/data/tg2/tg2_2020-11-02_ford.csv']]

In [260]:
files_not_retained_existing_tg =  [[tg_data_files_dict.get(tg).get(fn)['Key']]
                                 for tg in tg_existing 
                                 for fn in set(tg_data_files_dict.get(tg).keys()).difference(set(src_delta['existing_tg_files_dict'].get(tg).keys()))]
files_not_retained_existing_tg

[['taxonomy_cs/test1/data/tg5/tg5_2020-11-05_ford.csv']]

In [265]:
file_drop_args = []
file_drop_args.extend(files_to_be_dropped )
file_drop_args.extend(files_not_retained_existing_tg )
file_drop_args

[['taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv'],
 ['taxonomy_cs/test1/data/tg3/tg3_2020-11-02_ford.csv'],
 ['taxonomy_cs/test1/data/tg6/tg6_2020-11-01_ford.csv'],
 ['taxonomy_cs/test1/data/tg6/tg6_2020-11-02_ford.csv'],
 ['taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv'],
 ['taxonomy_cs/test1/data/tg2/tg2_2020-11-02_ford.csv'],
 ['taxonomy_cs/test1/data/tg5/tg5_2020-11-05_ford.csv']]

In [69]:
#files_need_to_be_created = {f['Key'] : i for i in  tg_create_all for fn, f in src_delta['new_tg_files_dict'].get(i).items()}
files_need_to_be_created = [[i, f['FileName'], f['Key'], f['Size']] for i in  tg_create_all for fn, f in src_delta['new_tg_files_dict'].get(i).items()]

files_need_to_be_created

[['tg4',
  'tg4_2020-11-03_ford.csv',
  'taxonomy_cs/test1/src/tg4_2020-11-03_ford.csv',
  48],
 ['tg4',
  'tg4_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg4_2020-11-02_ford.csv',
  48],
 ['tg4',
  'tg4_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg4_2020-11-01_ford.csv',
  48],
 ['tg6',
  'tg6_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg6_2020-11-01_ford.csv',
  49],
 ['tg6',
  'tg6_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg6_2020-11-02_ford.csv',
  49],
 ['tg0',
  'tg0_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg0_2020-11-02_ford.csv',
  27],
 ['tg2',
  'tg2_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg2_2020-11-02_ford.csv',
  50],
 ['tg2',
  'tg2_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg2_2020-11-01_ford.csv',
  50],
 ['tg2',
  'tg2_2020-11-03_ford.csv',
  'taxonomy_cs/test1/src/tg2_2020-11-03_ford.csv',
  50]]

In [70]:
#files_need_to_be_copied = {f_dict['Key']:k for k, v in src_delta['existing_tg_files_dict'].items() for f, f_dict in v.items()} #['Key']]
files_need_to_be_copied = [[k, f_dict['FileName'], f_dict['Key'], f_dict['Size']] for k, v in src_delta['existing_tg_files_dict'].items() for f, f_dict in v.items()] #['Key']]

files_need_to_be_copied 

[['tg5',
  'tg5_2020-11-03_ford.csv',
  'taxonomy_cs/test1/src/tg5_2020-11-03_ford.csv',
  48],
 ['tg5',
  'tg5_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg5_2020-11-02_ford.csv',
  48],
 ['tg5',
  'tg5_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg5_2020-11-01_ford.csv',
  48],
 ['tg1',
  'tg1_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg1_2020-11-02_ford.csv',
  48],
 ['tg1',
  'tg1_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg1_2020-11-01_ford.csv',
  48],
 ['tg7',
  'tg7_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg7_2020-11-02_ford.csv',
  48],
 ['tg7',
  'tg7_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg7_2020-11-01_ford.csv',
  48]]

In [71]:
file_copy_args = []
file_copy_args.extend(files_need_to_be_created )
file_copy_args.extend(files_need_to_be_copied )
file_copy_args

[['tg4',
  'tg4_2020-11-03_ford.csv',
  'taxonomy_cs/test1/src/tg4_2020-11-03_ford.csv',
  48],
 ['tg4',
  'tg4_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg4_2020-11-02_ford.csv',
  48],
 ['tg4',
  'tg4_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg4_2020-11-01_ford.csv',
  48],
 ['tg6',
  'tg6_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg6_2020-11-01_ford.csv',
  49],
 ['tg6',
  'tg6_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg6_2020-11-02_ford.csv',
  49],
 ['tg0',
  'tg0_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg0_2020-11-02_ford.csv',
  27],
 ['tg2',
  'tg2_2020-11-02_ford.csv',
  'taxonomy_cs/test1/src/tg2_2020-11-02_ford.csv',
  50],
 ['tg2',
  'tg2_2020-11-01_ford.csv',
  'taxonomy_cs/test1/src/tg2_2020-11-01_ford.csv',
  50],
 ['tg2',
  'tg2_2020-11-03_ford.csv',
  'taxonomy_cs/test1/src/tg2_2020-11-03_ford.csv',
  50],
 ['tg5',
  'tg5_2020-11-03_ford.csv',
  'taxonomy_cs/test1/src/tg5_2020-11-03_ford.csv',
  48],
 ['tg5',
  'tg5_2020-11-02_ford.csv',
  

In [72]:
def s3_copy_into_data_loc_task(tg, file_name, src_file, src_size, dry_run=True):
#     data_file_loc_detail = s3_ops.get_bucket_name(lmt_data)
    src_file_loc_detail = s3_ops.get_bucket_name(lmt_src)
    src_s3 = 's3://{}/{}'.format(src_file_loc_detail['bucket'], src_file)
    dest_s3 = '{}{}/{}'.format(lmt_data,tg, file_name)
    if dry_run:
        print("[dry_run]: S3 copy from {} to {}".format(src_s3, dest_s3))
    else:
        pass
        #s3_ops.copy(src=src_s3, dest = dest_s3, src_size=src_size)
    return 'Copied Successfully! by task'
s3_copy_into_data_loc_task('tg5', 'tg5_2020-11-01_ford.csv', 'taxonomy_cs/test1/src/tg5_2020-11-01_ford.csv', 48 )

[dry_run]: S3 copy from s3://qubole-ford/taxonomy_cs/test1/src/tg5_2020-11-01_ford.csv to s3://qubole-ford/taxonomy_cs/test1/data/tg5/tg5_2020-11-01_ford.csv


'Copied Successfully! by task'

In [73]:
def s3_remove_at_data_loc_task(file,  dry_run=True):
    data_file_loc_detail = s3_ops.get_bucket_name(lmt_data)
#     src_file_loc_detail = s3_ops.get_bucket_name(lmt_src)
#     src_s3 = 's3://{}/{}'.format(lmt_src, src_file)
    
    if dry_run:
        file_loc = 's3://{}/{}'.format(data_file_loc_detail['bucket'], file)
        print("[dry_run]: S3 delete from {} ".format(file_loc))
    else:
        pass
        #s3_ops.delete_file(data_file_loc_detail['bucket'], file)
    return 'Deleted Successfully! by task'
s3_remove_at_data_loc_task('taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv')

[dry_run]: S3 delete from s3://qubole-ford/taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv 


'Deleted Successfully! by task'

In [266]:
collected = NIO.decorated_run_io(task=s3_remove_at_data_loc_task, task_n_args_list=file_drop_args, is_kernal_thread=False,)
collected

2020-11-02 00:13:03,652:81025 MainThread run_blocking_tasks: starting

2020-11-02 00:13:03,653:81025 MainThread run_blocking_tasks: creating executor tasks

2020-11-02 00:13:03,654:81025 ThreadPoolExecutor-15_0 (task-0): passed args :['taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv']

2020-11-02 00:13:03,654:81025 ThreadPoolExecutor-15_1 (task-1): passed args :['taxonomy_cs/test1/data/tg3/tg3_2020-11-02_ford.csv']

2020-11-02 00:13:03,654:81025 ThreadPoolExecutor-15_2 (task-2): passed args :['taxonomy_cs/test1/data/tg6/tg6_2020-11-01_ford.csv']

2020-11-02 00:13:03,655:81025 ThreadPoolExecutor-15_0 (task-0): running

[dry_run]: S3 delete from s3://qubole-ford/taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv 2020-11-02 00:13:03,656:81025 ThreadPoolExecutor-15_3 (task-3): passed args :['taxonomy_cs/test1/data/tg6/tg6_2020-11-02_ford.csv']


2020-11-02 00:13:03,656:81025 ThreadPoolExecutor-15_4 (task-4): passed args :['taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv']

2020-11-02

[{'id': 'task-5',
  'task': <function __main__.s3_remove_at_data_loc_task(file, dry_run=True)>,
  'args': ['taxonomy_cs/test1/data/tg2/tg2_2020-11-02_ford.csv'],
  'result': 'Deleted Successfully! by task',
  'error': None},
 {'id': 'task-2',
  'task': <function __main__.s3_remove_at_data_loc_task(file, dry_run=True)>,
  'args': ['taxonomy_cs/test1/data/tg6/tg6_2020-11-01_ford.csv'],
  'result': 'Deleted Successfully! by task',
  'error': None},
 {'id': 'task-6',
  'task': <function __main__.s3_remove_at_data_loc_task(file, dry_run=True)>,
  'args': ['taxonomy_cs/test1/data/tg5/tg5_2020-11-05_ford.csv'],
  'result': 'Deleted Successfully! by task',
  'error': None},
 {'id': 'task-0',
  'task': <function __main__.s3_remove_at_data_loc_task(file, dry_run=True)>,
  'args': ['taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv'],
  'result': 'Deleted Successfully! by task',
  'error': None},
 {'id': 'task-1',
  'task': <function __main__.s3_remove_at_data_loc_task(file, dry_run=True)>,
  'a

In [78]:
collected = NIO.decorated_run_io(task=s3_copy_into_data_loc_task, task_n_args_list=file_copy_args, is_kernal_thread=False,)
collected

2020-11-01 21:24:23,064:81025 MainThread run_blocking_tasks: starting

2020-11-01 21:24:23,065:81025 MainThread run_blocking_tasks: creating executor tasks

2020-11-01 21:24:23,066:81025 ThreadPoolExecutor-14_0 (task-0): passed args :['tg4', 'tg4_2020-11-03_ford.csv', 'taxonomy_cs/test1/src/tg4_2020-11-03_ford.csv', 48]

2020-11-01 21:24:23,067:81025 ThreadPoolExecutor-14_1 (task-1): passed args :['tg4', 'tg4_2020-11-02_ford.csv', 'taxonomy_cs/test1/src/tg4_2020-11-02_ford.csv', 48]

2020-11-01 21:24:23,067:81025 ThreadPoolExecutor-14_2 (task-2): passed args :['tg4', 'tg4_2020-11-01_ford.csv', 'taxonomy_cs/test1/src/tg4_2020-11-01_ford.csv', 48]

2020-11-01 21:24:23,068:81025 ThreadPoolExecutor-14_0 (task-0): running

[dry_run]: S3 copy from s3://qubole-ford/taxonomy_cs/test1/src/tg4_2020-11-03_ford.csv to s3://qubole-ford/taxonomy_cs/test1/data/tg4/tg4_2020-11-03_ford.csv
2020-11-01 21:24:23,068:81025 ThreadPoolExecutor-14_1 (task-1): running

[dry_run]: S3 copy from s3://qubole-ford/

[{'id': 'task-4',
  'task': <function __main__.s3_copy_into_data_loc_task(tg, file_name, src_file, src_size, dry_run=True)>,
  'args': ['tg6',
   'tg6_2020-11-02_ford.csv',
   'taxonomy_cs/test1/src/tg6_2020-11-02_ford.csv',
   49],
  'result': 'Copied Successfully! by task',
  'error': None},
 {'id': 'task-3',
  'task': <function __main__.s3_copy_into_data_loc_task(tg, file_name, src_file, src_size, dry_run=True)>,
  'args': ['tg6',
   'tg6_2020-11-01_ford.csv',
   'taxonomy_cs/test1/src/tg6_2020-11-01_ford.csv',
   49],
  'result': 'Copied Successfully! by task',
  'error': None},
 {'id': 'task-15',
  'task': <function __main__.s3_copy_into_data_loc_task(tg, file_name, src_file, src_size, dry_run=True)>,
  'args': ['tg7',
   'tg7_2020-11-01_ford.csv',
   'taxonomy_cs/test1/src/tg7_2020-11-01_ford.csv',
   48],
  'result': 'Copied Successfully! by task',
  'error': None},
 {'id': 'task-8',
  'task': <function __main__.s3_copy_into_data_loc_task(tg, file_name, src_file, src_size, dry_r

### Expose Schema

In [79]:
tg_create_all_n_schema = {tg: schema for tg in tg_create_all for schema in src_delta['new_tg_schema_dict'].get(tg)}
tg_create_all_n_schema

{'tg4': 'key_a4,target_a4',
 'tg6': 'key_a6,target_a61',
 'tg0': 'key_a0,target_a0',
 'tg2': 'key_a21,target_a21'}

In [80]:


tg_retain_all_n_schema = {tg: tg_data_schema_dict.get(tg) for tg in tg_existing}
tg_retain_all_n_schema

{'tg1': 'key_a1,target_a1',
 'tg5': 'key_a5,target_a5',
 'tg7': 'key_a7,target_a7'}

In [272]:
tg_dropped_all_n_schema = {tg: tg_data_schema_dict.get(tg) for tg in tg_dropped_all }
tg_dropped_all_n_schema

{'tg3': 'key_a3,target_a3',
 'tg6': 'key_a6,target_a6',
 'tg2': 'key_a2,target_a2'}

In [267]:
import os


class Taxonomy_Grp:
    
    def __init__(self, tg_name, key_cols=[], target_col='', data_location=''):
        self.tg_name = tg_name
        self.key_cols = key_cols
        self.target_col = target_col
        self.location =os.path.join(data_location, tg_name)
        
    def get_dict(self):
        if self.target_col == '':
            return {'tg_name': self.tg_name}
        
        return {'tg_name': self.tg_name, 
                'key_cols': self.key_cols, 
                'target_col': self.target_col, 
                'location': self.location}

    def __str__(self):
        if self.target_col == '':
            return 'tg_name: {}'.format(self.tg_name)
        return 'tg_name: {}, key_cols: {}, target_col: {}, location: {}'.format(self.tg_name, self.key_cols, self.target_col,self.location)

def key_target_splitter(schema = ''):
    tokens = schema.split(',')
    key_cols = []
    for t in tokens:
        t = t.strip()
        if re.match(TARGET_REGEX, t):
            target_col = t
        elif re.match(KEY_REGEX, t):
            key_cols.append(t)
        else:
            raise Exception("Not a valid schema")
    return [{'target': target_col, 'key_cols' : key_cols}]





In [268]:
exposed_tg_all = None
exposed_tg_all = [Taxonomy_Grp(tg,schema_dict['key_cols'], schema_dict['target'], lmt_data) for tg ,schema in tg_create_all_n_schema.items() for schema_dict in key_target_splitter(schema)]
exposed_tg_all.extend([Taxonomy_Grp(tg,schema_dict['key_cols'], schema_dict['target'], lmt_data) for tg ,schema in tg_retain_all_n_schema.items() for schema_dict in key_target_splitter(schema)])
#[ i.get_dict() for i in exposed_tg_all]
[ str(i) for i in exposed_tg_all]

["tg_name: tg4, key_cols: ['key_a4'], target_col: target_a4, location: s3://qubole-ford/taxonomy_cs/test1/data/tg4",
 "tg_name: tg6, key_cols: ['key_a6'], target_col: target_a61, location: s3://qubole-ford/taxonomy_cs/test1/data/tg6",
 "tg_name: tg0, key_cols: ['key_a0'], target_col: target_a0, location: s3://qubole-ford/taxonomy_cs/test1/data/tg0",
 "tg_name: tg2, key_cols: ['key_a21'], target_col: target_a21, location: s3://qubole-ford/taxonomy_cs/test1/data/tg2",
 "tg_name: tg1, key_cols: ['key_a1'], target_col: target_a1, location: s3://qubole-ford/taxonomy_cs/test1/data/tg1",
 "tg_name: tg5, key_cols: ['key_a5'], target_col: target_a5, location: s3://qubole-ford/taxonomy_cs/test1/data/tg5",
 "tg_name: tg7, key_cols: ['key_a7'], target_col: target_a7, location: s3://qubole-ford/taxonomy_cs/test1/data/tg7"]

In [269]:
exposed_dropped_tg_all = None
exposed_dropped_tg_all = [Taxonomy_Grp(tg) for tg in tg_dropped_all]
[ i.get_dict() for i in exposed_dropped_tg_all]
[ str(i) for i in exposed_dropped_tg_all]



['tg_name: tg3', 'tg_name: tg6', 'tg_name: tg2']

In [275]:
# tg_data_target_dict =  {v: k for k, v in target_data_tg_dict.items()}
# target_retained = 

exposed_dropped_tg_all2 = [Taxonomy_Grp(tg,schema_dict['key_cols'], schema_dict['target'], lmt_data) 
                           for tg ,schema in tg_dropped_all_n_schema.items() 
                           for schema_dict in key_target_splitter(schema)]
[ i.get_dict() for i in exposed_dropped_tg_all2]

[{'tg_name': 'tg3',
  'key_cols': ['key_a3'],
  'target_col': 'target_a3',
  'location': 's3://qubole-ford/taxonomy_cs/test1/data/tg3'},
 {'tg_name': 'tg6',
  'key_cols': ['key_a6'],
  'target_col': 'target_a6',
  'location': 's3://qubole-ford/taxonomy_cs/test1/data/tg6'},
 {'tg_name': 'tg2',
  'key_cols': ['key_a2'],
  'target_col': 'target_a2',
  'location': 's3://qubole-ford/taxonomy_cs/test1/data/tg2'}]

### To Report

In [105]:
tg_create_n_schema = [(i, src_delta['new_tg_schema_dict'].get(i)) for i in tg_delta_create]
tg_drop_create_n_schema = [(tg, tg_data_schema_dict.get(tg), schema_new) for tg in tg_delta_drop_n_create for schema_new in src_delta['new_tg_schema_dict'].get(tg)]
tg_drop_n_schema = [(i, tg_data_schema_dict.get(i)) for i in tg_dropped]
tg_retain_n_schema = [(i, tg_data_schema_dict.get(i)) for i in tg_existing]



In [106]:
tg_drop_create_n_schema

[('tg6', 'key_a6,target_a6', 'key_a6,target_a61'),
 ('tg2', 'key_a2,target_a2', 'key_a21,target_a21')]

#### Files Sync Report

In [107]:
files_to_be_dropped = [ f['Key'] for i in  tg_dropped for fn, f in tg_data_files_dict.get(i).items()]
files_to_be_dropped

['taxonomy_cs/test1/data/tg3/tg3_2020-11-01_ford.csv',
 'taxonomy_cs/test1/data/tg3/tg3_2020-11-02_ford.csv']

In [108]:
files_to_be_dropped_schema_change = [ f['Key'] for i in  tg_delta_drop_n_create for fn, f in tg_data_files_dict.get(i).items()]
files_to_be_dropped_schema_change

['taxonomy_cs/test1/data/tg6/tg6_2020-11-01_ford.csv',
 'taxonomy_cs/test1/data/tg6/tg6_2020-11-02_ford.csv',
 'taxonomy_cs/test1/data/tg2/tg2_2020-11-01_ford.csv',
 'taxonomy_cs/test1/data/tg2/tg2_2020-11-02_ford.csv']

In [109]:
files_to_be_created = {f['Key'] :f['Schema'] for i in  tg_delta_create for fn, f in src_delta['new_tg_files_dict'].get(i).items()}
files_to_be_created

{'taxonomy_cs/test1/src/tg4_2020-11-03_ford.csv': 'key_a4,target_a4',
 'taxonomy_cs/test1/src/tg4_2020-11-02_ford.csv': 'key_a4,target_a4',
 'taxonomy_cs/test1/src/tg4_2020-11-01_ford.csv': 'key_a4,target_a4',
 'taxonomy_cs/test1/src/tg0_2020-11-02_ford.csv': 'key_a0,target_a0'}

In [110]:
files_to_be_created_schema_change = {f['Key'] :f['Schema'] for i in  tg_delta_drop_n_create for fn, f in src_delta['new_tg_files_dict'].get(i).items()}
files_to_be_created_schema_change

{'taxonomy_cs/test1/src/tg6_2020-11-01_ford.csv': 'key_a6,target_a61',
 'taxonomy_cs/test1/src/tg6_2020-11-02_ford.csv': 'key_a6,target_a61',
 'taxonomy_cs/test1/src/tg2_2020-11-02_ford.csv': 'key_a21,target_a21',
 'taxonomy_cs/test1/src/tg2_2020-11-01_ford.csv': 'key_a21,target_a21',
 'taxonomy_cs/test1/src/tg2_2020-11-03_ford.csv': 'key_a21,target_a21'}

### Report Display

In [111]:
import pandas as pd 

In [112]:
details = {  
    0 : { 
        'Name' : 'Ankit', 
        'Age' : 22, 
        'University' : 'BHU'
        }, 
    1 : { 
        'Name' : 'Aishwarya', 
        'Age' : 21, 
        'University' : 'JNU'
        }, 
    2 : { 
        'Name' : 'Shaurya', 
        'Age' : 23, 
        'University' : 'DU'
        } 
} 

details = [ 
    { 
        'Name' : 'Ankit', 
        'Age' : 22, 
        'University' : 'BHU'
        }, 
     { 
        'Name' : 'Aishwarya', 
        'Age' : 21, 
        'University' : 'JNU'
        }, 
    { 
        'Name' : 'Shaurya', 
        'Age' : 23, 
        'University' : 'DU'
        } 
]

In [241]:
df = pd.DataFrame(details) 
#df.sort_values??

df1 = df.sort_values(by = 'Age')
df2 = df1.reset_index()
df2 = df2.drop(columns=['index'])
#df = df.transpose()
logging.info("\n\n"+str(df2))
type(df)

2020-11-01 23:35:59,542 INFO root: 

   Age       Name University
0   21  Aishwarya        JNU
1   22      Ankit        BHU
2   23    Shaurya         DU


pandas.core.frame.DataFrame

### LOG Report

In [242]:

def log_report(list_of_row_dict=[], columns:list=[], header_align = 'right', sort_by= None, ascending = True):
    pd.set_option("display.colheader_justify", header_align)
    df = pd.DataFrame(list_of_row_dict, columns=columns) 
    if sort_by is not None:
        df = df.sort_values(by=sort_by, ascending=ascending)
        df = df.reset_index()
        df = df.drop(columns=['index'])
#     df1 = df.reindex(columns=['Taxonomy_Grp','File','Date', 'Schema'])
    #df[df.columns[new_order]]
    #df = df.transpose()
    logging.info("\n\n"+str(df))
   

### tg_dropped_report

In [187]:
tg_dropped_rep_gen =(extract_info(f['Key']) for i in  tg_dropped for fn, f in tg_data_files_dict.get(i).items())
tg_dropped_report_dict =  [{'Taxonomy_Grp':i['FileGrp'], 'File_Name':i['FileName'], 'Date':i['Date'], 'Schema': tg_data_schema_dict[i['FileGrp']] }
                           for i in tg_dropped_rep_gen] 
log_report(list_of_row_dict=tg_dropped_report_dict,columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 

2020-11-01 22:56:34,953 INFO root: 

  Taxonomy_Grp                File_Name        Date            Schema
0          tg3  tg3_2020-11-01_ford.csv  2020-11-01  key_a3,target_a3
1          tg3  tg3_2020-11-02_ford.csv  2020-11-02  key_a3,target_a3


### tg_drop_schema_change_report

In [116]:
tg_drop_schema_change_rep = ((tg, tg_data_schema_dict.get(tg), schema_new, extract_info(f['Key'])) for tg in tg_delta_drop_n_create for schema_new in src_delta['new_tg_schema_dict'].get(tg) for fn, f in tg_data_files_dict.get(tg).items())
tg_drop_schema_change_report_dict = [{'Grp' : i[0], 'File_Name': i[3]['FileName'], 'Date': i[3]['Date'], 'Old_Schema' : i[1], 'New_Schema' : i[2]} for i in tg_drop_schema_change_rep]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
#pd.describe_option()
log_report(list_of_row_dict=tg_drop_schema_change_report_dict,columns=['Grp','File_Name','Date', 'Old_Schema', 'New_Schema']) 

2020-11-01 21:33:57,043 INFO root: 

   Grp                File_Name        Date        Old_Schema          New_Schema
0  tg6  tg6_2020-11-01_ford.csv  2020-11-01  key_a6,target_a6   key_a6,target_a61
1  tg6  tg6_2020-11-02_ford.csv  2020-11-02  key_a6,target_a6   key_a6,target_a61
2  tg2  tg2_2020-11-01_ford.csv  2020-11-01  key_a2,target_a2  key_a21,target_a21
3  tg2  tg2_2020-11-02_ford.csv  2020-11-02  key_a2,target_a2  key_a21,target_a21


### tg_re_created_schema_change

In [117]:

tg_re_created_schema_change_rep = ((tg, tg_data_schema_dict.get(tg), f['Schema'], extract_info(f['Key']), 'Re-delivered') 
                                   if tg_data_files_dict.get(tg).get(fn) is not None 
                                   else (tg, 'NAN', f['Schema'], extract_info(f['Key']), 'New File')
                                    
                                   for tg in tg_delta_drop_n_create 
                                   #for schema_new in src_delta['new_tg_schema_dict'].get(tg) 
                                   
                                   for fn, f in src_delta['new_tg_files_dict'].get(tg).items() 
                                   )

tg_recreated_schema_change_report_dict = [{'Grp' : i[0], 'File_Name': i[3]['FileName'], 'Date': i[3]['Date'], 'Old_Schema' : i[1], 'New_Schema' : i[2], 'Desc': i[4]} 
                                          for i in tg_re_created_schema_change_rep]

log_report(list_of_row_dict=tg_recreated_schema_change_report_dict,columns=['Grp','File_Name','Date', 'Old_Schema', 'New_Schema', 'Desc'])

2020-11-01 21:34:01,196 INFO root: 

   Grp                File_Name        Date        Old_Schema          New_Schema          Desc
0  tg6  tg6_2020-11-01_ford.csv  2020-11-01  key_a6,target_a6   key_a6,target_a61  Re-delivered
1  tg6  tg6_2020-11-02_ford.csv  2020-11-02  key_a6,target_a6   key_a6,target_a61  Re-delivered
2  tg2  tg2_2020-11-02_ford.csv  2020-11-02  key_a2,target_a2  key_a21,target_a21  Re-delivered
3  tg2  tg2_2020-11-01_ford.csv  2020-11-01  key_a2,target_a2  key_a21,target_a21  Re-delivered
4  tg2  tg2_2020-11-03_ford.csv  2020-11-03               NAN  key_a21,target_a21      New File


### tg_newly_created_report

In [118]:

tg_newly_created_report_data = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'] }
                    for i in  tg_delta_create for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
log_report(tg_newly_created_report_data, columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 

2020-11-01 21:34:04,018 INFO root: 

  Taxonomy_Grp                File_Name        Date            Schema
0          tg4  tg4_2020-11-03_ford.csv  2020-11-03  key_a4,target_a4
1          tg4  tg4_2020-11-02_ford.csv  2020-11-02  key_a4,target_a4
2          tg4  tg4_2020-11-01_ford.csv  2020-11-01  key_a4,target_a4
3          tg0  tg0_2020-11-02_ford.csv  2020-11-02  key_a0,target_a0


 ###   tg_retained_report    (Existing)

In [247]:
# tg_retained_report_data = [{'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Desc' : 'Retained' }
                           
#                             if tg_data_files_dict.get(tg).get(fn) is not None 
#                             else {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Desc' : 'New File' }
#                             for tg in tg_existing 
#                             for fn, f in src_delta['existing_tg_files_dict'].get(tg).items()]
# tg_retained_report_data
# log_report(tg_retained_report_data, columns=['Taxonomy_Grp','File_Name','Date', 'Schema', 'Desc']) 
# 

tg_retained_report_data = [{'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Desc' : 'Retained' }
                           
                            if tg_data_files_dict.get(tg).get(fn) is not None 
                            else {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Desc' : 'New File' }
                            for tg in tg_existing 
                            for fn, f in src_delta['existing_tg_files_dict'].get(tg).items()]

tg_retained_dropped_files = [extract_info(tg_data_files_dict.get(tg).get(fn)['Key']) 
                             for tg in tg_existing 
                             for fn in set(tg_data_files_dict.get(tg).keys()).difference(set(src_delta['existing_tg_files_dict'].get(tg).keys()))]

                             
tg_retained_dropped_files_report_data = [{'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': tg_data_schema_dict[f['FileGrp']], 'Desc' : 'Dropped' }
                                          for f in tg_retained_dropped_files]

tg_retained_report_data.extend(tg_retained_dropped_files_report_data)
log_report(tg_retained_report_data, columns=['Taxonomy_Grp','File_Name','Date', 'Schema', 'Desc'], sort_by = ['Taxonomy_Grp','Date']) 



2020-11-01 23:40:10,717 INFO root: 

  Taxonomy_Grp                File_Name        Date            Schema      Desc
0          tg1  tg1_2020-11-01_ford.csv  2020-11-01  key_a1,target_a1  Retained
1          tg1  tg1_2020-11-02_ford.csv  2020-11-02  key_a1,target_a1  New File
2          tg5  tg5_2020-11-01_ford.csv  2020-11-01  key_a5,target_a5  Retained
3          tg5  tg5_2020-11-02_ford.csv  2020-11-02  key_a5,target_a5  Retained
4          tg5  tg5_2020-11-03_ford.csv  2020-11-03  key_a5,target_a5  New File
5          tg5  tg5_2020-11-05_ford.csv  2020-11-05  key_a5,target_a5   Dropped
6          tg7  tg7_2020-11-01_ford.csv  2020-11-01  key_a7,target_a7  Retained
7          tg7  tg7_2020-11-02_ford.csv  2020-11-02  key_a7,target_a7  Retained


In [253]:
### Needs to be dropped during sync up

tg_retained_dropped_files =  [(tg,fn)#[extract_info(tg_data_files_dict.get(tg).get(fn)['Key']) 
                             for tg in tg_existing 
                             for fn in set(tg_data_files_dict.get(tg).keys()).difference(set(src_delta['existing_tg_files_dict'].get(tg).keys()))]
tg_retained_dropped_files

[('tg5', 'tg5_2020-11-05_ford.csv')]

In [254]:
#tg_retained_dropped_files_report_data

### Invalid_tg_schema_mismatch Report

In [120]:
invalid_tg_with_dup_schema_rep = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'] }
                    for i in  invalid_tg_with_dup_schema for fn, f in src_delta['new_tg_files_dict'].get(i).items()]


log_report(invalid_tg_with_dup_schema_rep, columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 


2020-11-01 21:34:09,716 INFO root: 

  Taxonomy_Grp                 File_Name        Date            Schema
0         tg11  tg11_2020-11-01_ford.csv  2020-11-01  key_a9,target_a9
1          tg9   tg9_2020-11-01_ford.csv  2020-11-01  key_a9,target_a9


### Invalid_tg_target_mismatch Report

In [121]:
invalid_tg_with_dup_target_rep = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'] }
                    for i in  invalid_tg_with_dup_target for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
log_report(invalid_tg_with_dup_target_rep, columns=['Taxonomy_Grp','File_Name','Date', 'Schema']) 

2020-11-01 21:34:12,620 INFO root: 

  Taxonomy_Grp                 File_Name        Date             Schema
0          tg8   tg8_2020-11-01_ford.csv  2020-11-01   key_a8,target_a7
1         tg11  tg11_2020-11-01_ford.csv  2020-11-01   key_a9,target_a9
2         tg10  tg10_2020-11-01_ford.csv  2020-11-01  key_a10,target_a9
3          tg9   tg9_2020-11-01_ford.csv  2020-11-01   key_a9,target_a9


### Some of the Newly Delivered Files  are invalid for existing grp

In [122]:
partially_invalid_tg_set = tg_new.intersection(tg_existing)
partially_invalid_tg_report = [ {'Taxonomy_Grp':f['FileGrp'], 'File_Name':f['FileName'], 'Date':f['Date'], 'Schema': f['Schema'], 'Grp_Schema': tg_data_schema_dict[i] } for i in partially_invalid_tg_set  for fn, f in src_delta['new_tg_files_dict'].get(i).items()]
log_report(partially_invalid_tg_report, columns=['Taxonomy_Grp','File_Name','Date', 'Schema','Grp_Schema']) 

2020-11-01 21:34:15,265 INFO root: 

  Taxonomy_Grp                File_Name        Date              Schema        Grp_Schema
0          tg7  tg7_2020-11-03_ford.csv  2020-11-03   key_a7,target_a71  key_a7,target_a7
1          tg5  tg5_2020-11-04_ford.csv  2020-11-04  key_a51,target_a51  key_a5,target_a5


### Invalid file not match with desired file pattern

In [123]:
invalid_files_report_data = [{'File_Name' : i} for i in invalid_files_set]
log_report(invalid_files_report_data,  columns=['File_Name'])

2020-11-01 21:34:17,595 INFO root: 

                                                       File_Name
0  s3://qubole-ford/taxonomy_cs/test1/src/tg0_202-11-01_ford.csv


### File Schemas Patterns are not valid

In [124]:
invalid_schema_files2 ={('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv',
  'key_a0, targe_a0',
  'Exact one Target column is required! \nAll given columns should Key or Target!'),
('s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-04_ford.csv',
  'key_a0, targe_a0',
  'Exact one Target column is required! \nAll given columns should Key or Target!')}

invalid_schema_files2 ={('tg0_2020-11-03_ford.csv',
  'key_a0, targe_a0',
  'Exact one Target column is required! \nAll given columns should Key or Target!'),
('tg0_2020-11-04_ford.csv',
  'key_a0, targe_a0',
  'Exact one Target column is required! \nAll given columns should Key or Target!')}

invalid_schema_files_rep_data = [{'File_Name' : i[0], 'Schema': i[1], 'Reason' : i[2]} for i in invalid_schema_files]
log_report(invalid_schema_files_rep_data,  columns=['File_Name', 'Schema','Reason'], header_align='left')
# df = pd.DataFrame(invalid_schema_files_rep_data, columns=['File_Name', 'Schema','Reason']) 
# #     df1 = df.reindex(columns=['Taxonomy_Grp','File','Date', 'Schema'])
#     #df[df.columns[new_order]]
# #df = df.transpose()
# pd.set_option("display.colheader_justify","left")
# logging.info("\n\n"+str(df))


2020-11-01 21:34:24,179 INFO root: 

  File_Name                                                       Schema            Reason                                                                         
0  s3://qubole-ford/taxonomy_cs/test1/src/tg0_2020-11-03_ford.csv  key_a0, targe_a0  Exact one Target column is required! \nAll given columns should Key or Target!


### Exposed TG Report

In [243]:
exposed_tg_report_data = [i.get_dict() for i in exposed_tg_all]
log_report(exposed_tg_report_data,  columns=['tg_name', 'key_cols','target_col', 'location'], sort_by='tg_name')

2020-11-01 23:37:03,339 INFO root: 

  tg_name   key_cols  target_col                                     location
0     tg0   [key_a0]   target_a0  s3://qubole-ford/taxonomy_cs/test1/data/tg0
1     tg1   [key_a1]   target_a1  s3://qubole-ford/taxonomy_cs/test1/data/tg1
2     tg2  [key_a21]  target_a21  s3://qubole-ford/taxonomy_cs/test1/data/tg2
3     tg4   [key_a4]   target_a4  s3://qubole-ford/taxonomy_cs/test1/data/tg4
4     tg5   [key_a5]   target_a5  s3://qubole-ford/taxonomy_cs/test1/data/tg5
5     tg6   [key_a6]  target_a61  s3://qubole-ford/taxonomy_cs/test1/data/tg6
6     tg7   [key_a7]   target_a7  s3://qubole-ford/taxonomy_cs/test1/data/tg7


### Exposed Dropped TG Report

In [188]:
exposed_tg_dropped_report_data = [i.get_dict() for i in exposed_dropped_tg_all]
log_report(exposed_tg_dropped_report_data,  columns=['tg_name'])

2020-11-01 22:57:42,161 INFO root: 

  tg_name
0     tg3
1     tg6
2     tg2


In [277]:
### Extended drop tg report
exposed_tg_dropped_report_data2 = [i.get_dict() for i in exposed_dropped_tg_all2]
log_report(exposed_tg_dropped_report_data2, columns=['tg_name', 'key_cols','target_col', 'location'], sort_by='tg_name')

2020-11-02 00:28:33,646 INFO root: 

  tg_name  key_cols target_col                                     location
0     tg2  [key_a2]  target_a2  s3://qubole-ford/taxonomy_cs/test1/data/tg2
1     tg3  [key_a3]  target_a3  s3://qubole-ford/taxonomy_cs/test1/data/tg3
2     tg6  [key_a6]  target_a6  s3://qubole-ford/taxonomy_cs/test1/data/tg6


### Gen Schema XML

In [282]:
from jinja2 import Environment, FileSystemLoader
from libs import path_resolver

In [283]:
def get_xml_template(template_file_name, **kwargs):

    path = os.path.join("templates", "xml") 
    templates_dir = path_resolver.resolve(path)
    env = Environment( loader = FileSystemLoader(templates_dir) )
    template = env.get_template(template_file_name)
    return template.render(**kwargs)

In [314]:
[str(i) for i in exposed_tg_all]

["tg_name: tg4, key_cols: ['key_a4'], target_col: target_a4, location: s3://qubole-ford/taxonomy_cs/test1/data/tg4",
 "tg_name: tg6, key_cols: ['key_a6'], target_col: target_a61, location: s3://qubole-ford/taxonomy_cs/test1/data/tg6",
 "tg_name: tg0, key_cols: ['key_a0'], target_col: target_a0, location: s3://qubole-ford/taxonomy_cs/test1/data/tg0",
 "tg_name: tg2, key_cols: ['key_a21'], target_col: target_a21, location: s3://qubole-ford/taxonomy_cs/test1/data/tg2",
 "tg_name: tg1, key_cols: ['key_a1'], target_col: target_a1, location: s3://qubole-ford/taxonomy_cs/test1/data/tg1",
 "tg_name: tg5, key_cols: ['key_a5'], target_col: target_a5, location: s3://qubole-ford/taxonomy_cs/test1/data/tg5",
 "tg_name: tg7, key_cols: ['key_a7'], target_col: target_a7, location: s3://qubole-ford/taxonomy_cs/test1/data/tg7"]

In [311]:
a_xml = get_xml_template('audit_lmt_ds_schema_config.xml', dn_version = '11.1', create_tg_schema_obj_list=exposed_tg_all, drop_tg_schema_obj_list = exposed_dropped_tg_all2)
print(a_xml)

<configroot version="11.1">
	<set>
		<name>CS_TAXONOMY_LMT_SCHEMA_SET</name>
        
		<elements>
			<subsource_name>
				<!-- name of the subsource, has to be unique across all datasource -->
				<val>tg4</val>
			</subsource_name>
			<key-columns>
                
				<attr datatype="STRING">key_a4</attr>
                
			</key-columns>
			<target-columns>
				<attr datatype="STRING">target_a4</attr>
			</target-columns>
			<partitionby_columns>
			</partitionby_columns>
			<row_delimiter>
				<!-- Row separator -->
				<val>'\n'</val>
			</row_delimiter>
			<column_delimiter>
				<!-- Column separator -->
				<val>','</val>
			</column_delimiter>
			<serde>
				<val>'org.apache.hadoop.hive.serde2.OpenCSVSerde'</val>
			</serde>
			<serde_properties>
				<val/>
			</serde_properties>
			<table_properties>
				<val/>
			</table_properties>
			<storage_type>
				<!-- Storage or compression type of files, ex: TEXTFILE, ORC, PARQUET -->
				<val>TEXTFILE</val>
			</storage_type>
			<

In [291]:
from lxml import etree
from lxml.etree import _ElementTree, _Element, XMLParser

from io import StringIO, BytesIO

In [312]:
a_root = etree.fromstring(a_xml)

In [319]:
# def write_xml(root_element :_Element, dest : str, file_name : str, debug: str ):
#     if not os.path.exists(dest):
#         os.makedirs(dest)
#     destpath: Path = os.path.join(dest, file_name)
#     valid: bool
#     error: str
#     valid, error = __write_xml(root_element, destpath)
#     if not valid:
#         warnings.warn('{} Could not write xml file\
#             "{}"'.format(debug, error))
#         return None
#     logger.info("%s has been generated.", file_name)
#     logger.debug("staging location: %s", destpath)
	
	
	
	
	
# def __write_xml(tree: _Element, path: str) -> tuple:
#     """
#     Write dictionary structure into xml file 
#     """
    
#     if tree is None:
#         return (False, 'No dictionary provided to write to xml')

#     if path is None:
#         return (False, 'No xml path provided to write dictionary to')

#     try:
#         with open(path, mode='wb') as fp:
#             # Format the output while writing. Note that
#             # any initial indendation has to be removed
#             # while reading from source file/string.
#             eetree = etree.ElementTree(tree)
#             eetree.write(fp, pretty_print=True, xml_declaration=True,
#                          encoding='utf-8', method="xml")
#         return (True, None)
#     except ParseError:
#         error: str = 'Error in xml encoding while writing to file "{}"'.format(
#             path)
#         warnings.warn(error)
#         return (False, error)
#     except OSError:
#         error: str = 'Error in writing xml to file "{}"'.format(path)
#         warnings.warn(error)
#         return (False, error)

def write_xml(root_element :_Element, dest : str, file_name : str, debug: str ):
    log = logging.getLogger(__name__)
    if not os.path.exists(dest):
        os.makedirs(dest)
    destpath: Path = os.path.join(dest, file_name)
    valid: bool
    error: str
    valid, error = __write_xml(root_element, destpath)
    if not valid:
        log.warnings.warn('{} Could not write xml file\
            "{}"'.format(debug, error))
        return None
    log.info("%s has been generated.", file_name)
    log.debug("desting location: %s", destpath)
	

def __write_xml(tree: _Element, path: str) -> tuple:
    """
    Write dictionary structure into xml file 
    """
    log = logging.getLogger(__name__)
    if tree is None:
        return (False, 'No dictionary provided to write to xml')

    if path is None:
        return (False, 'No xml path provided to write dictionary to')

    try:
        with open(path, mode='wb') as fp:
            # Format the output while writing. Note that
            # any initial indendation has to be removed
            # while reading from source file/string.
            eetree = etree.ElementTree(tree)
            eetree.write(fp, pretty_print=True, xml_declaration=True,
                         encoding='utf-8', method="xml")
        return (True, None)
    except ParseError:
        error: str = 'Error in xml encoding while writing to file "{}"'.format(
            path)
        log.warnings.warn(error)
        return (False, error)
    except OSError:
        error: str = 'Error in writing xml to file "{}"'.format(path)
        log.warnings.warn(error)
        return (False, error)

In [321]:
write_xml(a_root, '/home/vbhargava/feature_test0/temp/taxo_config_xmls/', 'test.xml')

TypeError: write_xml() missing 1 required positional argument: 'debug'

In [316]:
log = logging.getLogger(__name__)

In [318]:
log.warning('hi')

