In [1]:
import sys, os, inspect, re
sys.path.append("/home/vbhargava/feature_test0/msaction_backend/common/BU3.0_core/util/Py_utils/taxonomy_utils")
import time, logging
numeric_level = getattr(logging, 'INFO', None)
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(level=numeric_level,
                        format='%(asctime)s %(levelname)s %(name)s: %(message)s',
                        handlers=[stdout_handler])

In [2]:
from libs.s3_ops import S3_OPs
from libs.s3_stream import S3Stream
from libs.configs import Config
from libs.nio_executor import NIO
from libs import utils
from collections import defaultdict

In [3]:
config = '/home/vbhargava/feature_test0/msaction_backend/customers/raj_ford_test/common/config/inputs/platform_config.xml'
lmt_src = 's3://qubole-ford/taxonomy_cs/lmt/input/'
lmt_data = 's3://qubole-ford/taxonomy_cs/lmt/data/'

In [4]:
config_data = Config.get_qubole_config(config)
ACCESS_KEY=config_data['access_key']
SECRET_KEY=config_data['secret_key']

In [5]:
TG_EXTRACT_REGEX = '^.*?/([a-zA-Z]+\-?[0-9]*)/$'
FILE_EXTRACT_REGEX = '^.*/([a-zA-Z0-9.\-_]{0,255}.csv)$'
TARGET_EXTRACT_REGEX ='^.*,?(target_[A-Za-z0-9_-]+).*$'
VALID_FILE_KEY_REGEX = '^(.*/([a-zA-Z]+\-?[0-9]*)?/)?(([a-zA-Z]+\-?[0-9]*?)_([0-9]{4}-[0-9]{2}-[0-9]{2}?)_([a-zA-Z0-9.\-_]+?).csv?)$'

In [6]:
s3_ops = S3_OPs(ACCESS_KEY, SECRET_KEY)

def filename_by_key(key):
    return get_val_by_regex(key, FILE_EXTRACT_REGEX, error_msg="Not vaild key for taxonomy data csv file")

def find_by_data_tg(key, regex):
    return get_val_by_regex(key, regex, error_msg="Not vaild taxonomy data dir")

        
def get_val_by_regex(key, regex, error_msg="can't be extract a val."):
    matched = re.findall(regex, key)
    if len(matched) > 0:
        return matched[0]
    else:
        raise Exception(error_msg)
        
def get_data_n_schema(tg, data_files_loc):
    data_file_lock_detail = s3_ops.get_bucket_name(data_files_loc)
    files = s3_ops.list_complete(data_file_lock_detail['bucket'], data_file_lock_detail['key'])
    res = {}
    if len(files)>0:
        s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
        schema = s3_stream.get_header(s3_ops.get_full_s3_path(data_file_lock_detail['bucket'],files[0]['Key']))
        #res[tg]={'schema':schema, 'files': files}
        res['schema'] = {tg:schema}
        res['files'] = {tg:files}
    return res

def extract_schema(schema):
    return schema.replace(" ","").lower()

def validate_schema(schema):
    if schema=='': 
        return {'IsValid' : False, 'schema': schema, 'message' : "Schema shouldn't be empty"}
    tokens = schema.split(',')
    if len(tokens) < 2:
         return {'IsValid' : False, 'schema': schema, 'message' : "Schema should have at least 2 columns"}
    KEY_REGEX = '^[Kk]ey_[A-Za-z0-9_]{3,30}$'
    TARGET_REGEX = '^[Tt]arget_[A-Za-z0-9_]{3,30}$'
    key_cnt = 0
    target_cnt = 0
    invalid_headers = []
    columns = defaultdict(list)
    res = {}
    target_col = None
    key_cols_set = set()
    for t in tokens:
        t = t.strip()
        if re.match(TARGET_REGEX, t):
            target_cnt = target_cnt + 1
            target_col = t
        elif re.match(KEY_REGEX, t):
            key_cnt = key_cnt + 1
            key_cols_set.add(t)
        else:
            invalid_headers.append(t)
        columns[t.lower()].append(1)

    error_msgs=[]
    if target_cnt != 1 :
        error_msgs.append("Exact one Target column is required!")
    if key_cnt < 1 :
        error_msgs.append("At least one Key column is required!")
    if len(invalid_headers) > 0 :
        error_msgs.append("All given columns should Key or Target!")
    for k, v in columns.items():

        if len(v) > 1:
            print("--")
            error_msgs.append("Same name: {} should not represent more than one column in schema! cols names are case insensitive. ".format(k))

    if len(error_msgs) > 0:
        return {'IsValid' : False, 'schema': schema, 'errors' : " \n".join(error_msgs)}
    #print(str(key_cnt)+":"+str(target_cnt)+":"+str(invalid_headers)+":"+str(columns))
    return {'IsValid' : True, 'Schema': schema.replace(" ","").lower(), 
            'TargetCol' : target_col, 'KeyColsSet' : key_cols_set}

In [7]:
def extract_data_detail(lmt_src, lmt_data, access_key, secret_key):
#     Valid data Taxonomy Grps
    
    #
    lmt_data_loc_detail = s3_ops.get_bucket_name(lmt_data)
    lmt_data_loc_bucket = lmt_data_loc_detail['bucket']
    lmt_data_loc_key = lmt_data_loc_detail['key']
    valid_tg_list_res = s3_ops.list_subdirs(lmt_data_loc_detail['bucket'],lmt_data_loc_detail['key'],)
    
    valid_tgrp_loc_list = [ [find_by_data_tg(item['Prefix'], TG_EXTRACT_REGEX), 
                         '{}{}'.format(lmt_data, find_by_data_tg(item['Prefix'], TG_EXTRACT_REGEX))] 
                       for item in valid_tg_list_res]
    
    collected = NIO.decorated_run_io(task=get_data_n_schema, task_n_args_list=valid_tgrp_loc_list, max_workers=25,)
    
    tg_data_schema_dict = {k:extract_schema(v)  for item in collected for k, v in item['result']['schema'].items()}
    tg_data_files_dict = {k:{filename_by_key(u['Key']):u for u in v } for item in collected for k, v in item['result']['files'].items()}
    target_data_tg_dict = {re.findall(TARGET_EXTRACT_REGEX,V)[0]: K for K, V in tg_data_schema_dict.items()}
    
    return tg_data_schema_dict, tg_data_files_dict,target_data_tg_dict

In [8]:
extract_data_detail(lmt_src, lmt_data, ACCESS_KEY, SECRET_KEY)

2020-10-28 23:42:46,915:65224 MainThread run_blocking_tasks: starting

2020-10-28 23:42:46,916:65224 MainThread run_blocking_tasks: creating executor tasks

2020-10-28 23:42:46,917:65224 ThreadPoolExecutor-0_0 (task-0): passed args :['AdvertiserReporting', 's3://qubole-ford/taxonomy_cs/lmt/data/AdvertiserReporting']

2020-10-28 23:42:46,917:65224 ThreadPoolExecutor-0_1 (task-1): passed args :['ChannelGrouping', 's3://qubole-ford/taxonomy_cs/lmt/data/ChannelGrouping']

2020-10-28 23:42:46,918:65224 ThreadPoolExecutor-0_0 (task-0): running

2020-10-28 23:42:46,918:65224 MainThread run_blocking_tasks: waiting for executor tasks

2020-10-28 23:42:46,919:65224 ThreadPoolExecutor-0_1 (task-1): running

2020-10-28 23:42:46,975:65224 ThreadPoolExecutor-0_0 (task-0): done

2020-10-28 23:42:47,020:65224 ThreadPoolExecutor-0_1 (task-1): done

2020-10-28 23:42:47,021:65224 MainThread run_blocking_tasks: exiting



({'AdvertiserReporting': 'key_evt_advertiser_key,target_evt_advertiser_name',
  'ChannelGrouping': 'key_evt_source,target_channel'},
 {'AdvertiserReporting': {'AdvertiserReporting_2020-06-01_ford.csv': {'Key': 'taxonomy_cs/lmt/data/AdvertiserReporting/AdvertiserReporting_2020-06-01_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 20, 21, 52, 9, tzinfo=tzlocal()),
    'ETag': '"04234605e8b4354998074abae5c74ae9"',
    'Size': 71,
    'StorageClass': 'STANDARD'},
   'AdvertiserReporting_2020-06-02_ford.csv': {'Key': 'taxonomy_cs/lmt/data/AdvertiserReporting/AdvertiserReporting_2020-06-02_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 20, 21, 50, 55, tzinfo=tzlocal()),
    'ETag': '"f33832d48d54ec287f1b486526c197bf"',
    'Size': 57,
    'StorageClass': 'STANDARD'}},
  'ChannelGrouping': {'ChannelGrouping_2020-06-01_ford.csv': {'Key': 'taxonomy_cs/lmt/data/ChannelGrouping/ChannelGrouping_2020-06-01_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 20, 21, 53,

In [9]:
from service.taxonomy_cs_api import Taxonomy_CS_API

In [10]:
tcAPI = Taxonomy_CS_API()

In [11]:
tcAPI.extract_data_detail()

2020-10-28 23:42:47,102:65224 MainThread run_blocking_tasks: starting

2020-10-28 23:42:47,103:65224 MainThread run_blocking_tasks: creating executor tasks

2020-10-28 23:42:47,104:65224 ThreadPoolExecutor-1_0 (task-0): passed args :['AdvertiserReporting', 's3://qubole-ford/taxonomy_cs/lmt/data/AdvertiserReporting']

2020-10-28 23:42:47,104:65224 ThreadPoolExecutor-1_1 (task-1): passed args :['ChannelGrouping', 's3://qubole-ford/taxonomy_cs/lmt/data/ChannelGrouping']

2020-10-28 23:42:47,105:65224 MainThread run_blocking_tasks: waiting for executor tasks

2020-10-28 23:42:47,105:65224 ThreadPoolExecutor-1_0 (task-0): running

2020-10-28 23:42:47,106:65224 ThreadPoolExecutor-1_1 (task-1): running

2020-10-28 23:42:47,187:65224 ThreadPoolExecutor-1_0 (task-0): done

2020-10-28 23:42:47,200:65224 ThreadPoolExecutor-1_1 (task-1): done

2020-10-28 23:42:47,201:65224 MainThread run_blocking_tasks: exiting



{'tg_data_schema_dict': {'ChannelGrouping': 'key_evt_source,target_channel',
  'AdvertiserReporting': 'key_evt_advertiser_key,target_evt_advertiser_name'},
 'tg_data_files_dict': {'ChannelGrouping': {'ChannelGrouping_2020-06-01_ford.csv': {'Key': 'taxonomy_cs/lmt/data/ChannelGrouping/ChannelGrouping_2020-06-01_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 20, 21, 53, 33, tzinfo=tzlocal()),
    'ETag': '"f6987a7636f122fd412879bdcb51678d"',
    'Size': 53,
    'StorageClass': 'STANDARD'}},
  'AdvertiserReporting': {'AdvertiserReporting_2020-06-01_ford.csv': {'Key': 'taxonomy_cs/lmt/data/AdvertiserReporting/AdvertiserReporting_2020-06-01_ford.csv',
    'LastModified': datetime.datetime(2020, 10, 20, 21, 52, 9, tzinfo=tzlocal()),
    'ETag': '"04234605e8b4354998074abae5c74ae9"',
    'Size': 71,
    'StorageClass': 'STANDARD'},
   'AdvertiserReporting_2020-06-02_ford.csv': {'Key': 'taxonomy_cs/lmt/data/AdvertiserReporting/AdvertiserReporting_2020-06-02_ford.csv',
    'LastModif

## SRC File processed

In [18]:
tg_data = extract_data_detail(lmt_src, lmt_data, ACCESS_KEY, SECRET_KEY)
tg_data_schema_dict = tg_data[0]
tg_data_files_dict = tg_data[1]
target_data_tg_dict = tg_data[2]

2020-10-28 23:44:15,720:65224 MainThread run_blocking_tasks: starting

2020-10-28 23:44:15,721:65224 MainThread run_blocking_tasks: creating executor tasks

2020-10-28 23:44:15,722:65224 ThreadPoolExecutor-4_0 (task-0): passed args :['AdvertiserReporting', 's3://qubole-ford/taxonomy_cs/lmt/data/AdvertiserReporting']

2020-10-28 23:44:15,722:65224 ThreadPoolExecutor-4_1 (task-1): passed args :['ChannelGrouping', 's3://qubole-ford/taxonomy_cs/lmt/data/ChannelGrouping']

2020-10-28 23:44:15,722:65224 MainThread run_blocking_tasks: waiting for executor tasks

2020-10-28 23:44:15,723:65224 ThreadPoolExecutor-4_0 (task-0): running

2020-10-28 23:44:15,724:65224 ThreadPoolExecutor-4_1 (task-1): running

2020-10-28 23:44:15,781:65224 ThreadPoolExecutor-4_0 (task-0): done

2020-10-28 23:44:15,821:65224 ThreadPoolExecutor-4_1 (task-1): done

2020-10-28 23:44:15,823:65224 MainThread run_blocking_tasks: exiting



In [20]:
target_data_tg_dict

{'target_evt_advertiser_name': 'AdvertiserReporting',
 'target_channel': 'ChannelGrouping'}

In [14]:
def is_valid_file(key:str='', regex = VALID_FILE_KEY_REGEX):
    if re.match(regex, key) is None:
        return False
    return True

def extract_info(key:str='', regex = VALID_FILE_KEY_REGEX):
    matched = re.findall(regex, key)
    return {
            'KeyDirPath' : matched[0][0],
            'ParentDir' : matched[0][1],
            'FileName' : matched[0][2],
            'FileGrp' :  matched[0][3],
            'Date' :  matched[0][4],
            'ClientName' : matched[0][5]
           }
def extract_info_with_bucket(key:str='', bucket = ''):
    res = extract_info(key)
    res.update({'Bucket' : bucket})
    return res

In [15]:
def grouped_tg(collected, tg_files_dict_type='new_tg_files_dict'):
    collect = defaultdict(dict)
    tg_f_gen = (item['result'][tg_files_dict_type] for item in collected if len(item['result'][tg_files_dict_type]) > 0)
    tg_f_gen2 = (collect[tg].update({filename: file_dict})  for item in tg_f_gen for tg, file_detail_dict in item.items() for filename, file_dict in file_detail_dict.items())
    [ i for i in tg_f_gen2]
    tg = dict(collect)
    return tg

def grouped_flag_dict(collected, flag_dict_type='schema_tg_dict'):
    f_gen = (item['result'][flag_dict_type] for item in collected if len(item['result'][flag_dict_type]) > 0)
    collect = defaultdict(set)
    f_gen2 = (collect[K].add(V)  for item in f_gen for K, V in item.items())
    [ i for i in f_gen2]
    res = dict(collect)
    return res

def grouped_set_of_flags_dict(collected, flag_dict_type='schema_tg_dict'):
    f_gen = (item['result'][flag_dict_type] for item in collected if len(item['result'][flag_dict_type]) > 0)
    collect = defaultdict(set)
    f_gen2 = (collect[K].add(*V)  for item in f_gen for K, V in item.items())
    [ i for i in f_gen2]
    res = dict(collect)
    return res

def grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'):
    res_set=set()
    f_gen = (res_set.update(item['result'][flag_dict_type]) for item in collected if len(item['result'][flag_dict_type]) > 0)
    [ i for i in f_gen]
    return res_set

In [16]:
# def file_process_task(src_file_details):
#     invalid_schema_files = set()

#     ''' {'key_evt_advertiser_key,target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
#     schema_tg_dict = {}
    
#     ''' {'target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
#     target_tg_dict = {}

#     ''' {'tg': {'key_evt_advertiser_key,target_evt_advertiser_name', '',...}}'''
#     new_tg_schema_dict = {}
#     ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
#     new_tg_files_dict = {}
#     ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
#     existing_tg_files_dict = {}


# #     tg_data_schema_dict = {}
# #     tg_data_files_dict = {}

# #     src_file_details = valid_file_arg[0]
#     src_file_loc = s3_ops.get_full_s3_path(src_file_details['Bucket'], src_file_details['Key'])

#     s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
#     schema =  s3_stream.get_header(src_file_loc)
#     #schema = 'key_evt_advertiser_key, targe_evt_advertiser_name'
#     validate_res = validate_schema(schema)
#     if validate_res['IsValid']:
#         src_file_details['Schema'] = validate_res['Schema']
#         tg = src_file_details['FileGrp']
#         file_name = src_file_details['FileName']
#         schema_tg_dict[validate_res['Schema']] = tg
#         target_tg_dict[validate_res['TargetCol']] = tg
#         if tg_data_schema_dict.get(tg) is None:
#             new_tg_schema_dict[tg] = validate_res['Schema']
#             new_tg_files_dict[tg] = {file_name: src_file_details}
#         else:
#             existing_tg_files_dict[tg] = {file_name: src_file_details}

#     else:
#         invalid_schema_files.add((src_file_loc, schema, validate_res['errors']))

#     return {'invalid_schema_files': invalid_schema_files,
#             'schema_tg_dict': schema_tg_dict,
#             'target_tg_dict':target_tg_dict,
#             'new_tg_schema_dict': new_tg_schema_dict,
#             'new_tg_files_dict' : new_tg_files_dict,
#             'existing_tg_files_dict' : existing_tg_files_dict
#            }


def file_process_task(src_file_details):
    
    invalid_schema_files = set()

    target_already_exist_files = set()
    
    ''' {'key_evt_advertiser_key,target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
    schema_tg_dict = {}
    
    ''' {'target_evt_advertiser_name': {'tg1', 'tg2', ...}}'''
    target_tg_dict = {}

    ''' {'tg': {'key_evt_advertiser_key,target_evt_advertiser_name', '',...}}'''
    new_tg_schema_dict = {}
    ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
    new_tg_files_dict = {}
    ''' {'tg': {'AdvertiserReporting_2020-06-01_ford.csv': {file detailed obj dict} }  }'''
    existing_tg_files_dict = {}


    # tg_data_schema_dict = 
    # tg_data_files_dict = 
    # target_data_tg_dict = 

#     src_file_details = valid_file_arg[0]
    src_file_loc = s3_ops.get_full_s3_path(src_file_details['Bucket'], src_file_details['Key'])

    s3_stream = S3Stream(ACCESS_KEY, SECRET_KEY)
    schema =  s3_stream.get_header(src_file_loc)
    #schema = 'key_evt_advertiser_key, targe_evt_advertiser_name'
    validate_res = validate_schema(schema)
    if validate_res['IsValid']:
        
        
        
        tg = src_file_details['FileGrp']
        file_name = src_file_details['FileName']
        
        if tg_data_schema_dict.get(tg) is None or tg_data_schema_dict.get(tg) != validate_res['Schema']:
                
#             data_tg_for_target = target_data_tg_dict.get(validate_res['TargetCol'])
#             if  data_tg_for_target is not None:# and data_tg_for_target != tg:
#                 target_already_exist_files.add((src_file_loc, data_tg_for_target))
#             else:
            new_tg_schema_dict[tg] = validate_res['Schema']
            new_tg_files_dict[tg] = {file_name: src_file_details}
        else:
            existing_tg_files_dict[tg] = {file_name: src_file_details}
        
        src_file_details['Schema'] = validate_res['Schema']
        schema_tg_dict[validate_res['Schema']] = tg
        target_tg_dict[validate_res['TargetCol']] = tg

    else:
        invalid_schema_files.add((src_file_loc, schema, validate_res['errors']))

    return {'invalid_schema_files': invalid_schema_files,
            'target_already_exist_files':target_already_exist_files,
            'schema_tg_dict': schema_tg_dict,
            'target_tg_dict':target_tg_dict,
            'new_tg_schema_dict': new_tg_schema_dict,
            'new_tg_files_dict' : new_tg_files_dict,
            'existing_tg_files_dict' : existing_tg_files_dict
           }



def src_list_page_process_task(list_page):
    
    lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
    lmt_src_loc_bucket = lmt_src_loc_detail['bucket']
    lmt_src_loc_key = lmt_src_loc_detail['key']
    
    invalid_files_set = { s3_ops.get_full_s3_path(lmt_src_loc_detail['bucket'], item['Key']) for item in list_page if  not is_valid_file(key=item['Key'])}
    valid_file_set = [[utils.dict_append(extract_info_with_bucket(item['Key'], lmt_src_loc_detail['bucket']),item)] for item in list_page if  is_valid_file(key=item['Key']) ]
    collected = NIO.decorated_run_io(task=file_process_task, task_n_args_list=valid_file_set, max_workers=25,)
#     return collected
    return {'invalid_files_set' : invalid_files_set,
            'invalid_schema_files': grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'),
            'target_already_exist_files' : grouped_set_of_flags(collected, flag_dict_type='target_already_exist_files'),
            'schema_tg_dict': grouped_flag_dict(collected, flag_dict_type='schema_tg_dict'),
            'target_tg_dict': grouped_flag_dict(collected, flag_dict_type='target_tg_dict'),
            'new_tg_schema_dict': grouped_flag_dict(collected, flag_dict_type='new_tg_schema_dict'),
            'new_tg_files_dict' : grouped_tg(collected, 'new_tg_files_dict'),
            'existing_tg_files_dict' : grouped_tg(collected, 'existing_tg_files_dict')
           }

In [21]:
lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
page_generator = s3_ops.list_gen(lmt_src_loc_detail['bucket'],lmt_src_loc_detail['key'], maxKeysPerReq=12, )
list_page = [i for i in page_generator][0]

src_list_page_process_task(list_page)

2020-10-28 23:45:47,975:65224 MainThread run_blocking_tasks: starting

2020-10-28 23:45:47,975:65224 MainThread run_blocking_tasks: creating executor tasks

2020-10-28 23:45:47,976:65224 ThreadPoolExecutor-5_0 (task-0): passed args :[{'KeyDirPath': 'taxonomy_cs/lmt/input/', 'ParentDir': 'input', 'FileName': 'AdvertiserReporting-123_2020-05-11_ford.csv', 'FileGrp': 'AdvertiserReporting-123', 'Date': '2020-05-11', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting-123_2020-05-11_ford.csv', 'LastModified': datetime.datetime(2020, 10, 27, 16, 7, 11, tzinfo=tzlocal()), 'ETag': '"c752db15b6440fde2ed7c6b6621fd594"', 'Size': 76, 'StorageClass': 'STANDARD'}]

2020-10-28 23:45:47,976:65224 ThreadPoolExecutor-5_1 (task-1): passed args :[{'KeyDirPath': 'taxonomy_cs/lmt/input/', 'ParentDir': 'input', 'FileName': 'AdvertiserReporting_2020-06-01_ford.csv', 'FileGrp': 'AdvertiserReporting', 'Date': '2020-06-01', 'ClientName': 'ford', 'Bucket': 'qubole-ford

{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv'},
 'invalid_schema_files': {('s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-02_ford.csv',
   'ser_key,  target_evt_advertiser_name',
   'At least one Key column is required! \nAll given columns should Key or Target!'),
  ('s3://qubole-ford/taxonomy_cs/lmt/input/ChannelGrouping_2020-06-03_ford.csv',
   'ource, target_channel',
   'At least one Key column is required! \nAll given columns should Key or Target!')},
 'target_already_exist_files': set(),
 'schema_tg_dict': {'key_evt_source,target_channel': {'ChannelGrouping'},
  'key_evt_advertiser_key,target_evt_advertiser_name': {'AdvertiserReporting'},
  'key_evt_advertiser_key_2,target_evt_advertiser_name_2': {'AdvertiserReporting-123'}},
 'target_tg_dict': {'target_channel': {'ChannelGrouping'},
  'target_evt_advertiser_name': {'AdvertiserReporting'},
  'target_evt_advertiser_name_2': {'AdvertiserReporting-123'}},


In [22]:
def extract_src_detail1():
    lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
    lmt_src_loc_bucket = lmt_src_loc_detail['bucket']
    lmt_src_loc_key = lmt_src_loc_detail['key']
    page_generator = s3_ops.list_gen(lmt_src_loc_bucket, lmt_src_loc_key, maxKeysPerReq=3, )
    page_args_generator = ([page] for page in page_generator)
    #list_page = [i for i in page_generator][0]
    collected = NIO.decorated_run_with_args_generator(task=src_list_page_process_task, args_generator=page_args_generator, is_kernal_thread=False,)
    return collected


def extract_src_detail(maxKeysPerReq=3):
    lmt_src_loc_detail = s3_ops.get_bucket_name(lmt_src)
    lmt_src_loc_bucket = lmt_src_loc_detail['bucket']
    lmt_src_loc_key = lmt_src_loc_detail['key']
    page_generator = s3_ops.list_gen(lmt_src_loc_bucket, lmt_src_loc_key, maxKeysPerReq=maxKeysPerReq, )
    page_args_generator = ([page] for page in page_generator)
    #list_page = [i for i in page_generator][0]
    collected = NIO.decorated_run_with_args_generator(task=src_list_page_process_task, args_generator=page_args_generator, is_kernal_thread=True,)
    
    return {'invalid_files_set' : grouped_set_of_flags(collected, flag_dict_type='invalid_files_set'),
            'invalid_schema_files': grouped_set_of_flags(collected, flag_dict_type='invalid_schema_files'),
            'target_already_exist_files' : grouped_set_of_flags(collected, flag_dict_type='target_already_exist_files'),
            'schema_tg_dict': grouped_set_of_flags_dict(collected, flag_dict_type='schema_tg_dict'),
            'target_tg_dict': grouped_set_of_flags_dict(collected, flag_dict_type='target_tg_dict'),
            'new_tg_schema_dict': grouped_set_of_flags_dict(collected, flag_dict_type='new_tg_schema_dict'),
            'new_tg_files_dict' : grouped_tg(collected, 'new_tg_files_dict'),
            'existing_tg_files_dict' : grouped_tg(collected, 'existing_tg_files_dict')
               }

In [23]:
res432 = extract_src_detail1()
res432

2020-10-28 23:47:58,582:65224 MainThread run_blocking_tasks: starting

2020-10-28 23:47:58,583:65224 MainThread run_blocking_tasks: creating executor tasks

2020-10-28 23:47:58,629:65224 ThreadPoolExecutor-6_0 (task-0): passed args :[[{'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting-123_2020-05-11_ford.csv', 'LastModified': datetime.datetime(2020, 10, 27, 16, 7, 11, tzinfo=tzlocal()), 'ETag': '"c752db15b6440fde2ed7c6b6621fd594"', 'Size': 76, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv', 'LastModified': datetime.datetime(2020, 10, 27, 14, 15, 22, tzinfo=tzlocal()), 'ETag': '"e4432fb96afba25162ecbd89624b93bd"', 'Size': 72, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 12, 21, 15, 2, tzinfo=tzlocal()), 'ETag': '"04234605e8b4354998074abae5c74ae9"', 'Size': 71, 'StorageClass': 'STANDARD'}]]

2020-10-28 23:47:58,632:65224 ThreadPoolExecu

2020-10-28 23:47:58,697:65224 ThreadPoolExecutor-6_2 run_blocking_tasks: waiting for executor tasks

2020-10-28 23:47:58,698:65224 ThreadPoolExecutor-9_1 (task-1): running

2020-10-28 23:47:58,705:65224 ThreadPoolExecutor-9_2 (task-2): running

2020-10-28 23:47:58,709:65224 ThreadPoolExecutor-7_1 (task-1): done

2020-10-28 23:47:58,723:65224 ThreadPoolExecutor-8_0 (task-0): done

2020-10-28 23:47:58,732:65224 ThreadPoolExecutor-8_2 (task-2): done

2020-10-28 23:47:58,733:65224 ThreadPoolExecutor-6_0 run_blocking_tasks: exiting

2020-10-28 23:47:58,736:65224 ThreadPoolExecutor-6_0 (task-0): done

2020-10-28 23:47:58,744:65224 ThreadPoolExecutor-9_0 (task-0): done

2020-10-28 23:47:58,763:65224 ThreadPoolExecutor-9_1 (task-1): done

2020-10-28 23:47:58,772:65224 ThreadPoolExecutor-9_2 (task-2): done

2020-10-28 23:47:58,773:65224 ThreadPoolExecutor-6_2 run_blocking_tasks: exiting

2020-10-28 23:47:58,774:65224 ThreadPoolExecutor-6_2 (task-2): done

2020-10-28 23:47:58,948:65224 ThreadPoo

[{'id': 'task-0',
  'task': <function __main__.src_list_page_process_task(list_page)>,
  'args': [[{'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting-123_2020-05-11_ford.csv',
     'LastModified': datetime.datetime(2020, 10, 27, 16, 7, 11, tzinfo=tzlocal()),
     'ETag': '"c752db15b6440fde2ed7c6b6621fd594"',
     'Size': 76,
     'StorageClass': 'STANDARD'},
    {'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv',
     'LastModified': datetime.datetime(2020, 10, 27, 14, 15, 22, tzinfo=tzlocal()),
     'ETag': '"e4432fb96afba25162ecbd89624b93bd"',
     'Size': 72,
     'StorageClass': 'STANDARD'},
    {'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-01_ford.csv',
     'LastModified': datetime.datetime(2020, 10, 12, 21, 15, 2, tzinfo=tzlocal()),
     'ETag': '"04234605e8b4354998074abae5c74ae9"',
     'Size': 71,
     'StorageClass': 'STANDARD'}]],
  'result': {'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv

In [24]:
len(res432)

3

In [25]:
[ i['result'] for i in res432]

[{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv'},
  'invalid_schema_files': set(),
  'target_already_exist_files': set(),
  'schema_tg_dict': {'key_evt_advertiser_key,target_evt_advertiser_name': {'AdvertiserReporting'},
   'key_evt_advertiser_key_2,target_evt_advertiser_name_2': {'AdvertiserReporting-123'}},
  'target_tg_dict': {'target_evt_advertiser_name': {'AdvertiserReporting'},
   'target_evt_advertiser_name_2': {'AdvertiserReporting-123'}},
  'new_tg_schema_dict': {'AdvertiserReporting-123': {'key_evt_advertiser_key_2,target_evt_advertiser_name_2'}},
  'new_tg_files_dict': {'AdvertiserReporting-123': {'AdvertiserReporting-123_2020-05-11_ford.csv': {'KeyDirPath': 'taxonomy_cs/lmt/input/',
     'ParentDir': 'input',
     'FileName': 'AdvertiserReporting-123_2020-05-11_ford.csv',
     'FileGrp': 'AdvertiserReporting-123',
     'Date': '2020-05-11',
     'ClientName': 'ford',
     'Bucket': 'qubole-ford',
     'Key': 'taxonomy_

In [26]:
grouped_set_of_flags_dict(res432, flag_dict_type='schema_tg_dict')


{'key_evt_advertiser_key,target_evt_advertiser_name': {'AdvertiserReporting'},
 'key_evt_advertiser_key_2,target_evt_advertiser_name_2': {'AdvertiserReporting-123'},
 'key_evt_source,target_channel': {'ChannelGrouping'}}

In [27]:
grouped_set_of_flags_dict(res432, flag_dict_type='target_tg_dict')

{'target_evt_advertiser_name': {'AdvertiserReporting'},
 'target_evt_advertiser_name_2': {'AdvertiserReporting-123'},
 'target_channel': {'ChannelGrouping'}}

In [28]:
grouped_set_of_flags_dict(res432, flag_dict_type='new_tg_schema_dict')

{'AdvertiserReporting-123': {'key_evt_advertiser_key_2,target_evt_advertiser_name_2'}}

In [29]:
grouped_tg(res432, 'new_tg_files_dict')

{'AdvertiserReporting-123': {'AdvertiserReporting-123_2020-05-11_ford.csv': {'KeyDirPath': 'taxonomy_cs/lmt/input/',
   'ParentDir': 'input',
   'FileName': 'AdvertiserReporting-123_2020-05-11_ford.csv',
   'FileGrp': 'AdvertiserReporting-123',
   'Date': '2020-05-11',
   'ClientName': 'ford',
   'Bucket': 'qubole-ford',
   'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting-123_2020-05-11_ford.csv',
   'LastModified': datetime.datetime(2020, 10, 27, 16, 7, 11, tzinfo=tzlocal()),
   'ETag': '"c752db15b6440fde2ed7c6b6621fd594"',
   'Size': 76,
   'StorageClass': 'STANDARD',
   'Schema': 'key_evt_advertiser_key_2,target_evt_advertiser_name_2'}}}

In [30]:
grouped_tg(res432, 'existing_tg_files_dict')

{'AdvertiserReporting': {'AdvertiserReporting_2020-06-01_ford.csv': {'KeyDirPath': 'taxonomy_cs/lmt/input/',
   'ParentDir': 'input',
   'FileName': 'AdvertiserReporting_2020-06-01_ford.csv',
   'FileGrp': 'AdvertiserReporting',
   'Date': '2020-06-01',
   'ClientName': 'ford',
   'Bucket': 'qubole-ford',
   'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-01_ford.csv',
   'LastModified': datetime.datetime(2020, 10, 12, 21, 15, 2, tzinfo=tzlocal()),
   'ETag': '"04234605e8b4354998074abae5c74ae9"',
   'Size': 71,
   'StorageClass': 'STANDARD',
   'Schema': 'key_evt_advertiser_key,target_evt_advertiser_name'},
  'AdvertiserReporting_2020-06-03_ford.csv': {'KeyDirPath': 'taxonomy_cs/lmt/input/',
   'ParentDir': 'input',
   'FileName': 'AdvertiserReporting_2020-06-03_ford.csv',
   'FileGrp': 'AdvertiserReporting',
   'Date': '2020-06-03',
   'ClientName': 'ford',
   'Bucket': 'qubole-ford',
   'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-03_ford.csv',
   'LastModified

In [31]:
grouped_set_of_flags(res432, 'invalid_schema_files')

{('s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-02_ford.csv',
  'ser_key,  target_evt_advertiser_name',
  'At least one Key column is required! \nAll given columns should Key or Target!'),
 ('s3://qubole-ford/taxonomy_cs/lmt/input/ChannelGrouping_2020-06-03_ford.csv',
  'ource, target_channel',
  'At least one Key column is required! \nAll given columns should Key or Target!')}

In [32]:
grouped_set_of_flags(res432, 'invalid_files_set')

{'s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv'}

In [33]:
res_final = extract_src_detail()

2020-10-28 23:49:11,936   process-id:65224 run_blocking_tasks: starting

2020-10-28 23:49:11,938   process-id:65224 run_blocking_tasks: creating executor tasks

2020-10-28 23:49:12,038   process-id:97838   (task-0): passed args :[[{'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting-123_2020-05-11_ford.csv', 'LastModified': datetime.datetime(2020, 10, 27, 16, 7, 11, tzinfo=tzlocal()), 'ETag': '"c752db15b6440fde2ed7c6b6621fd594"', 'Size': 76, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv', 'LastModified': datetime.datetime(2020, 10, 27, 14, 15, 22, tzinfo=tzlocal()), 'ETag': '"e4432fb96afba25162ecbd89624b93bd"', 'Size': 72, 'StorageClass': 'STANDARD'}, {'Key': 'taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-01_ford.csv', 'LastModified': datetime.datetime(2020, 10, 12, 21, 15, 2, tzinfo=tzlocal()), 'ETag': '"04234605e8b4354998074abae5c74ae9"', 'Size': 71, 'StorageClass': 'STANDARD'}]]

2020-10-28 23:49:12,041   process-id:97838   (tas

2020-10-28 23:49:12,084:97840 ThreadPoolExecutor-10_1 (task-1): running

2020-10-28 23:49:12,095:97840 ThreadPoolExecutor-10_2 (task-2): running

2020-10-28 23:49:12,124:97839 ThreadPoolExecutor-10_1 (task-1): done

2020-10-28 23:49:12,129:97838 ThreadPoolExecutor-10_1 (task-1): done

2020-10-28 23:49:12,130:97839 ThreadPoolExecutor-10_2 (task-2): done

2020-10-28 23:49:12,133:97839 ThreadPoolExecutor-10_0 (task-0): done

2020-10-28 23:49:12,135:97839 MainThread run_blocking_tasks: exiting

2020-10-28 23:49:12,136   process-id:97839   (task-1): done

2020-10-28 23:49:12,139:97840 ThreadPoolExecutor-10_0 (task-0): done

2020-10-28 23:49:12,150:97838 ThreadPoolExecutor-10_0 (task-0): done

2020-10-28 23:49:12,152:97838 MainThread run_blocking_tasks: exiting

2020-10-28 23:49:12,153   process-id:97838   (task-0): done

2020-10-28 23:49:12,154:97840 ThreadPoolExecutor-10_1 (task-1): done

2020-10-28 23:49:12,156:97840 ThreadPoolExecutor-10_2 (task-2): done

2020-10-28 23:49:12,157:97840 Ma

In [34]:
res_final

{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv'},
 'invalid_schema_files': {('s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-02_ford.csv',
   'ser_key,  target_evt_advertiser_name',
   'At least one Key column is required! \nAll given columns should Key or Target!'),
  ('s3://qubole-ford/taxonomy_cs/lmt/input/ChannelGrouping_2020-06-03_ford.csv',
   'ource, target_channel',
   'At least one Key column is required! \nAll given columns should Key or Target!')},
 'target_already_exist_files': set(),
 'schema_tg_dict': {'key_evt_advertiser_key,target_evt_advertiser_name': {'AdvertiserReporting'},
  'key_evt_source,target_channel': {'ChannelGrouping'},
  'key_evt_advertiser_key_2,target_evt_advertiser_name_2': {'AdvertiserReporting-123'}},
 'target_tg_dict': {'target_evt_advertiser_name': {'AdvertiserReporting'},
  'target_channel': {'ChannelGrouping'},
  'target_evt_advertiser_name_2': {'AdvertiserReporting-123'}},


In [36]:
src_delta = tcAPI.extract_src_detail(is_kernal_thread = False)

2020-10-29 00:22:14,545:65224 MainThread run_blocking_tasks: starting

2020-10-29 00:22:14,546:65224 MainThread run_blocking_tasks: creating executor tasks

2020-10-29 00:22:14,547:65224 ThreadPoolExecutor-10_0 (task-0): passed args :['AdvertiserReporting', 's3://qubole-ford/taxonomy_cs/lmt/data/AdvertiserReporting']

2020-10-29 00:22:14,547:65224 ThreadPoolExecutor-10_1 (task-1): passed args :['ChannelGrouping', 's3://qubole-ford/taxonomy_cs/lmt/data/ChannelGrouping']

2020-10-29 00:22:14,548:65224 ThreadPoolExecutor-10_0 (task-0): running

2020-10-29 00:22:14,548:65224 ThreadPoolExecutor-10_1 (task-1): running

2020-10-29 00:22:14,549:65224 MainThread run_blocking_tasks: waiting for executor tasks

2020-10-29 00:22:14,606:65224 ThreadPoolExecutor-10_0 (task-0): done

2020-10-29 00:22:14,627:65224 ThreadPoolExecutor-10_1 (task-1): done

2020-10-29 00:22:14,628:65224 MainThread run_blocking_tasks: exiting

2020-10-29 00:22:14,629:65224 MainThread run_blocking_tasks: starting

2020-10-2

2020-10-29 00:22:14,760:65224 ThreadPoolExecutor-14_1 (task-1): passed args :[{'KeyDirPath': 'taxonomy_cs/lmt/input/', 'ParentDir': 'input', 'FileName': 'ChannelGrouping_2020-06-02_ford.csv', 'FileGrp': 'ChannelGrouping', 'Date': '2020-06-02', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/lmt/input/ChannelGrouping_2020-06-02_ford.csv', 'LastModified': datetime.datetime(2020, 10, 12, 21, 15, 2, tzinfo=tzlocal()), 'ETag': '"85765aed716b8ac68c5ba714d1fd62b4"', 'Size': 45, 'StorageClass': 'STANDARD'}]

2020-10-29 00:22:14,760:65224 ThreadPoolExecutor-14_0 (task-0): running

2020-10-29 00:22:14,760:65224 ThreadPoolExecutor-14_2 (task-2): passed args :[{'KeyDirPath': 'taxonomy_cs/lmt/input/', 'ParentDir': 'input', 'FileName': 'ChannelGrouping_2020-06-03_ford.csv', 'FileGrp': 'ChannelGrouping', 'Date': '2020-06-03', 'ClientName': 'ford', 'Bucket': 'qubole-ford', 'Key': 'taxonomy_cs/lmt/input/ChannelGrouping_2020-06-03_ford.csv', 'LastModified': datetime.datetime(2020, 10,

In [37]:
src_delta

{'invalid_files_set': {'s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_202-06-04_ford.csv'},
 'invalid_schema_files': {('s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-02_ford.csv',
   'ser_key,  target_evt_advertiser_name',
   'At least one Key column is required! \nAll given columns should Key or Target!'),
  ('s3://qubole-ford/taxonomy_cs/lmt/input/ChannelGrouping_2020-06-03_ford.csv',
   'ource, target_channel',
   'At least one Key column is required! \nAll given columns should Key or Target!')},
 'schema_tg_dict': {'key_evt_source,target_channel': {'ChannelGrouping'},
  'key_evt_advertiser_key_2,target_evt_advertiser_name_2': {'AdvertiserReporting-123'},
  'key_evt_advertiser_key,target_evt_advertiser_name': {'AdvertiserReporting'}},
 'target_tg_dict': {'target_channel': {'ChannelGrouping'},
  'target_evt_advertiser_name_2': {'AdvertiserReporting-123'},
  'target_evt_advertiser_name': {'AdvertiserReporting'}},
 'new_tg_schema_dict': {'AdvertiserRep

In [None]:
#tg_data_schema_dict
#tg_data_files_dict
#target_data_tg_dict = {re.findall(TARGET_EXTRACT_REGEX,V)[0]: K for K, V in tg_data_schema_dict.items()}

In [38]:
{ k for k in src_delta.keys()}

{'existing_tg_files_dict',
 'invalid_files_set',
 'invalid_schema_files',
 'new_tg_files_dict',
 'new_tg_schema_dict',
 'schema_tg_dict',
 'target_tg_dict'}

In [39]:
{ k for k in tg_data_files_dict.keys()}

{'AdvertiserReporting', 'ChannelGrouping'}

In [40]:
{ k for k in src_delta['existing_tg_files_dict'].keys()}

{'AdvertiserReporting', 'ChannelGrouping'}

In [41]:
{ k for k in src_delta['new_tg_files_dict'].keys()}

{'AdvertiserReporting-123'}

In [42]:
[v for k, v in src_delta['schema_tg_dict'].items() if len(v) > 1]

[]

In [43]:
[v for k, v in src_delta['target_tg_dict'].items() if len(v) > 1]

[]

In [44]:
src_delta['invalid_schema_files']

{('s3://qubole-ford/taxonomy_cs/lmt/input/AdvertiserReporting_2020-06-02_ford.csv',
  'ser_key,  target_evt_advertiser_name',
  'At least one Key column is required! \nAll given columns should Key or Target!'),
 ('s3://qubole-ford/taxonomy_cs/lmt/input/ChannelGrouping_2020-06-03_ford.csv',
  'ource, target_channel',
  'At least one Key column is required! \nAll given columns should Key or Target!')}

In [91]:
tg_data = { k for k in tg_data_files_dict.keys()}
tg_existing = { k for k in src_delta['existing_tg_files_dict'].keys()}
tg_new ={ k for k in src_delta['new_tg_files_dict'].keys()}
tg_delta = tg_new.union(tg_existing)
tg_dropped = tg_data.difference(tg_delta)
many_tg4schema_check_gen = (v for k, v in src_delta['schema_tg_dict'].items() if len(v) > 0)
many_tg4target_check_gen = (v for k, v in src_delta['target_tg_dict'].items() if len(v) > 0)
many_schema4tg_check_gen = (v for k, v in src_delta['new_tg_schema_dict'].items() if len(v) > 0)

In [92]:
tg4schema = set()
[tg4schema.add(*i) for i in many_tg4schema_check_gen]
tg4target = set()
[tg4schema.add(*i) for i in many_tg4target_check_gen]
schema4tg = set()
[schema4tg.add(*i) for i in many_schema4tg_check_gen]

[None]

In [93]:
tg4schema

{'AdvertiserReporting', 'AdvertiserReporting-123', 'ChannelGrouping'}

In [97]:
newTg4schema = {tg for schema in schema4tg for tg in src_delta['schema_tg_dict'][schema]}

In [98]:
newTg4schema

{'AdvertiserReporting-123'}

In [96]:
tg4target

set()

In [107]:
#def extract_delta():
invalid_files_set = src_delta['invalid_files_set']
invalid_schema_files = src_delta['invalid_schema_files']

tg_data = { k for k in tg_data_files_dict.keys()}
tg_existing = { k for k in src_delta['existing_tg_files_dict'].keys()}
tg_new ={ k for k in src_delta['new_tg_files_dict'].keys()}
tg_all = tg_new.union(tg_existing)

many_tg4schema_check_gen = (v for k, v in src_delta['schema_tg_dict'].items() if len(v) > 1)
many_tg4target_check_gen = (v for k, v in src_delta['target_tg_dict'].items() if len(v) > 1)
many_schema4tg_check_gen = (v for k, v in src_delta['new_tg_schema_dict'].items() if len(v) > 1)


tg4schema = set()
[tg4schema.add(*i) for i in many_tg4schema_check_gen]
tg4target = set()
[tg4schema.add(*i) for i in many_tg4target_check_gen]
schema4tg = set()
[schema4tg.add(*i) for i in many_schema4tg_check_gen]

newTg4schema = {tg for schema in schema4tg for tg in src_delta['schema_tg_dict'][schema]}

invalid_tg_with_dup_schema = (tg4schema.union(newTg4schema)).difference(tg_existing)

invalid_tg_with_dup_target = tg4target.difference(tg_existing)

invalid_tg_all = invalid_tg_with_dup_schema.union(invalid_tg_with_dup_target)

tg_delta = tg_new.difference(invalid_tg_all)

tg_delta_create = tg_delta.difference(tg_data)

tg_delta_drop_n_create = (tg_delta.intersection(tg_data)).difference(tg_existing)

tg_dropped = tg_data.difference(tg_all)

# tg_dropped_all = tg_dropped.union(tg_delta_drop_n_create)
# tg_create_all = tg_delta_create.union(tg_delta_drop_n_create)
    

In [100]:
invalid_tg_with_dup_schema

set()

In [101]:
invalid_tg_with_dup_target

set()

In [102]:
invalid_tg_all

set()

In [103]:
tg_delta

{'AdvertiserReporting-123'}

In [104]:
tg_delta_create

{'AdvertiserReporting-123'}

In [105]:
tg_delta_drop_n_create

set()

In [106]:
tg_dropped

set()

In [108]:
tg_dropped_all

set()

In [109]:
tg_create_all

{'AdvertiserReporting-123'}