<a href="https://colab.research.google.com/github/maksimowich/cre_xml_parser/blob/main/cre_xml_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade 'sqlalchemy<2.0'

##### Необходимые импорты

In [2]:
import datetime
import os
import pandas as pd
import sqlalchemy
import time
import xml.etree.ElementTree as ET
from typing import Callable
from functools import wraps


##### Декоратор для замера времени работы функции

In [42]:
def timeit(func: Callable, args_to_print=None):
    """Wrap *func* so that every successful call prints its wall-clock duration.

    Args:
        func: The callable to time.
        args_to_print: Names of keyword arguments whose values are echoed in
            the report line. Only arguments passed by keyword are matched;
            positional arguments are never printed. ``None`` (the default)
            echoes nothing.

    Returns:
        A wrapper that forwards all arguments to *func* and returns its result.
        Nothing is printed when *func* raises.
    """
    # ``None`` instead of ``[]`` as the default: a mutable default would be
    # shared by every call of ``timeit``.
    names_to_print = () if args_to_print is None else args_to_print

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - start_time
        args_to_print_with_values = {k: v for k, v in kwargs.items() if k in names_to_print}
        print(f'Function {func.__name__} called with {args_to_print_with_values} took {total_time:.4f} seconds')
        return result

    return timeit_wrapper


##### Функция, возвращающая словарь на основании двух списков

In [43]:
def get_dict_from_lists(keys_list: list, values_list: list):
    """Pair keys and values positionally and return them as a dict.

    Pairing stops at the shorter of the two lists; when a key occurs more
    than once the last paired value wins.
    """
    return dict(zip(keys_list, values_list))


##### Подключение к БД

In [44]:
# Connection URL of the target PostgreSQL database. It can be overridden with
# the CRE_DB_CONNECTION_STRING environment variable so that credentials do not
# have to be edited into (and committed with) the notebook.
# NOTE(review): the fallback below still embeds a password in source control;
# rotate it and drop the fallback once the environment variable is in use.
connection_string = os.environ.get(
    'CRE_DB_CONNECTION_STRING',
    'postgresql+psycopg2://postgres:5555@db.mpkazantsev.ru/demo',
)

engine = sqlalchemy.create_engine(connection_string)


##### Маппинг тэгов с таблицами

In [45]:
# XML tag -> base name of the table that stores the rows parsed from that tag.
# Insertion order matters: TAGS and TABLE_NAMES are derived from it.
TAGS_TO_TABLE_NAMES_MAPPING = {
    'MONTHLY_DETAIL': 'monthlydetailtype',
    'LOANS_OVERVIEW': 'loansoverviewtype',
    'LOAN': 'loanstype',
    'MAIN': 'maintype',
    'NAME': 'nametype',
    'SCORE': 'scoretype',
    'FRAUD': 'fraudtype',
}

# Tags that are parsed from the XML files, in processing order.
TAGS = list(TAGS_TO_TABLE_NAMES_MAPPING)

# Every table base name; 'singleformattype' has no tag of its own.
TABLE_NAMES = ['singleformattype', *TAGS_TO_TABLE_NAMES_MAPPING.values()]


def get_table_name_by_tag(tag: str):
    """Return the table base name registered for an XML tag.

    Raises:
        KeyError: If the tag is not present in TAGS_TO_TABLE_NAMES_MAPPING.
    """
    table_name = TAGS_TO_TABLE_NAMES_MAPPING[tag]
    return table_name


##### Исключенные поля

In [46]:
# Normalized (lower case, underscores removed) XML field names that
# get_row_from_SF_item skips even when a matching DB column exists.
FIELD_NAMES_TO_EXCLUDE = [
    'cbtypecode',
    'nextpmtprincipal',
]


##### Получаем маппинг имён полей в распарсенных строках и в БД

In [None]:
def get_row_names_to_db_names_dict(table_name: str,
                                   engine: sqlalchemy.engine.base.Engine):
    """Map normalized column names to the real column names of a DB table.

    The column list is read from ``information_schema.columns`` for every
    table called *table_name*. Each key is the column name lower-cased with
    underscores removed (the form used for parsed XML field names); each value
    is the column name exactly as the database reports it.

    Args:
        table_name: Table name to look up (no schema filter is applied).
        engine: Connection source handed to ``pandas.read_sql_query``.

    Returns:
        dict of normalized name -> database column name; empty when the
        query returns no rows.
    """
    describe_query = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{}';".format(table_name)
    columns_df = pd.read_sql_query(describe_query, engine)
    return {
        db_name.lower().replace('_', ''): db_name
        for db_name in columns_df['column_name']
    }


# For every tag: normalized field name -> column name of the "sf_<table>"
# table, read once from the database when this cell runs.
TAG_TO_ROW_NAMES_AND_DB_NAMES_DICT = {
    tag: get_row_names_to_db_names_dict("sf_" + get_table_name_by_tag(tag), engine)
    for tag in TAGS
}


##### Получаем ожидаемые типы данных полей

In [None]:
def get_table_type_dict(tag: str):
    """Return the declared XSD type of every field of the element named *tag*.

    Reads ``SingleFormat.xsd`` from the current working directory, takes the
    first ``xs:element`` whose ``name`` equals *tag*, and lists the elements in
    the ``xs:sequence`` of the complex type that element refers to.

    Returns:
        dict of field name (lower case, underscores removed) -> type name
        (lower case, with a leading ``xs:`` prefix stripped).

    Raises:
        IndexError: If the schema declares no element named *tag*.
        KeyError: If a declaration lacks a ``name`` or ``type`` attribute.
    """
    xs = {'xs': 'http://www.w3.org/2001/XMLSchema'}
    schema_root = ET.parse(os.getcwd() + '/SingleFormat.xsd').getroot()

    declaration = schema_root.findall(".//xs:element[@name='" + tag + "']", xs)[0]
    complex_type_name = declaration.attrib['type']

    fields_path = ".//xs:complexType[@name='" + complex_type_name + "']/xs:sequence/xs:element"
    field_types = {}
    for field in schema_root.findall(fields_path, xs):
        raw_name = field.attrib['name']
        type_name = field.attrib['type']
        if type_name.startswith("xs:"):
            type_name = type_name[len("xs:"):]
        field_types[raw_name.lower().replace('_', '')] = type_name.lower()
    return field_types

# For every tag: normalized field name -> expected XSD type name.
TAG_TO_TABLE_TYPES_DICT = {tag: get_table_type_dict(tag) for tag in TAGS}


##### Функция, возвращающая приведённое к ожидаемому типу значение

In [49]:
def get_field_value(expected_type: str,
                    field_name: str,
                    str_value: str):
    """Convert the raw text of an XML field to the Python type expected for it.

    Args:
        expected_type: Lower-cased type name of the field, or None when the
            field has no known type.
        field_name: Normalized field name (lower case, underscores removed).
        str_value: Text content of the XML element; None for an empty element.

    Returns:
        An int, float or str depending on the expected type, or None when the
        field has no known type or the element carries no text.

    Raises:
        ValueError: If the text cannot be parsed as the required number.
    """
    # An empty element has ``text is None``; report it as "no value" instead
    # of crashing inside int()/float().
    if expected_type is None or str_value is None:
        return None
    if expected_type == 'int' or field_name == 'recentlegalupdatedate':
        # NOTE(review): recentlegalupdatedate is forced to int whatever the
        # schema says, presumably to match the DB column type; confirm.
        return int(str_value)
    if field_name == 'interestrate':
        # Workaround: the DB column is still int rather than float, so only
        # the integer part of the rate is kept.
        return int(str_value.split('.')[0])
    if expected_type in ('float', 'moneyvaluetype'):
        return float(str_value)
    return str_value


##### Функция ***get_row_from_SF_item*** формирует и возвращает строку таблицы

In [50]:
def get_row_from_SF_item(SF_item: ET.Element,
                         table_name: str,
                         hjid: int,
                         tables_current_hjid: dict,
                         get_row_from_field_names_and_filed_values: Callable):
    """Build one table row from an XML element and its leaf children.

    Only children without nested elements are used, and only when their
    normalized name is not in FIELD_NAMES_TO_EXCLUDE, has a matching DB column
    and converts to a non-None value. Numbers are kept as numbers; every other
    value is turned into a single-quoted SQL string literal.

    Besides the parsed fields the row receives:
      * ``hdp_datetime`` - the current local time as a quoted string;
      * ``hjid`` - a per-table running counter, for tables whose TABLES_HJID
        entry is not ``'hjid'``;
      * the column named by TABLES_HJID[table_name] set to *hjid*, or, for
        ``monthlydetailtype``, ``loan_id`` set to the current value of the
        ``loanstype_hjid`` counter.

    Args:
        SF_item: The XML element to convert; its tag selects the name and
            type dictionaries.
        table_name: Base name of the table the row belongs to.
        hjid: Identifier of the document being parsed.
        tables_current_hjid: Running counters keyed ``<table>_hjid``; updated
            in place.
        get_row_from_field_names_and_filed_values: Factory called with
            ``(field_names, field_values)`` that produces the final row.

    Returns:
        Whatever the row factory returns.
    """
    db_names = TAG_TO_ROW_NAMES_AND_DB_NAMES_DICT[SF_item.tag]
    expected_types = TAG_TO_TABLE_TYPES_DICT[SF_item.tag]
    field_names = []
    field_values = []

    for SF_subitem in SF_item:
        field_name = SF_subitem.tag.lower().replace('_', '')
        if len(SF_subitem) > 0 or field_name in FIELD_NAMES_TO_EXCLUDE or db_names.get(field_name) is None:
            continue
        field_value = get_field_value(expected_type=expected_types.get(field_name),
                                      field_name=field_name,
                                      str_value=SF_subitem.text)
        if field_value is None:
            continue
        field_names.append(field_name)
        if isinstance(field_value, (int, float)):
            field_values.append(field_value)
        else:
            # The value ends up inside a hand-built INSERT statement, so any
            # embedded single quote is doubled to keep the literal well-formed.
            field_values.append("'" + field_value.replace("'", "''") + "'")

    field_names.append('hdp_datetime')
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    field_values.append("'" + current_time + "'")

    if TABLES_HJID[table_name] != 'hjid':
        # Tables that link to the document through another column get their
        # own running row id.
        field_for_hjid_name = table_name + "_hjid"
        tables_current_hjid[field_for_hjid_name] = tables_current_hjid.get(field_for_hjid_name, 0) + 1
        field_names.append('hjid')
        field_values.append(tables_current_hjid.get(field_for_hjid_name))

    if table_name != 'monthlydetailtype':
        field_names.append(TABLES_HJID[table_name])
        field_values.append(hjid)
    else:
        field_names.append('loan_id')
        field_values.append(tables_current_hjid.get('loanstype_hjid'))
    return get_row_from_field_names_and_filed_values(field_names, field_values)


##### Функция парсит тэг в xml файле. В ходе парсинга полученные строки кладёт в словарь ***table_name_to_rows_dict*** 

In [51]:
def parse_tag(xml_root: ET.Element,
              hjid: int,
              table_name_to_rows_dict: dict,
              tables_current_hjid: dict,
              tag: str,
              add_row: Callable,
              get_row_from_field_names_and_filed_values: Callable):
    """Parse every element called *tag* under *xml_root* into table rows.

    Each matching element becomes one row of the table mapped to *tag*. For
    the ``LOAN`` tag, the ``MONTHLY_DETAIL`` descendants of each loan are
    parsed right after the loan itself and stored in their own table.

    Args:
        xml_root: Root of the parsed XML document.
        hjid: Identifier of the document being parsed.
        table_name_to_rows_dict: Row containers per table; filled via *add_row*.
        tables_current_hjid: Running per-table counters, updated in place.
        tag: XML tag to search for anywhere below *xml_root*.
        add_row: Called as ``add_row(table_name_to_rows_dict, table_name, row)``.
        get_row_from_field_names_and_filed_values: Row factory passed on to
            get_row_from_SF_item.
    """
    def store(element, target_table):
        # Convert one element and hand the row to the configured sink.
        row = get_row_from_SF_item(element, target_table, hjid, tables_current_hjid,
                                   get_row_from_field_names_and_filed_values)
        add_row(table_name_to_rows_dict, target_table, row)

    table_name = get_table_name_by_tag(tag)
    with_monthly_details = tag == 'LOAN'
    for item in xml_root.findall(".//" + tag):
        store(item, table_name)
        if not with_monthly_details:
            continue
        for detail in item.findall(".//MONTHLY_DETAIL"):
            store(detail, get_table_name_by_tag(detail.tag))


##### Функции добавления и формирования строки в виде df

In [52]:
def add_df_row(table_name_to_rows_dict, table_name, row_in_df_to_append):
    """Append a DataFrame of rows to the DataFrame stored under *table_name*.

    The stored frame is replaced by a new concatenated frame whose index is
    renumbered from zero.
    """
    accumulated = table_name_to_rows_dict[table_name]
    combined = pd.concat([accumulated, row_in_df_to_append], ignore_index=True)
    table_name_to_rows_dict[table_name] = combined


def get_df_row_from_field_names_and_filed_values(field_names, field_values):
    """Return a one-row DataFrame with *field_names* as columns and *field_values* as the row."""
    single_row = [field_values]
    return pd.DataFrame(single_row, columns=field_names)


##### Функции добавления и формирования строки в виде словаря

In [53]:
def add_dict_row(table_name_to_rows_dict, table_name, row_in_dict_to_append):
    """Append a row to the list stored under *table_name* (mutated in place)."""
    rows_of_table = table_name_to_rows_dict[table_name]
    rows_of_table.append(row_in_dict_to_append)


def get_dict_row_from_field_names_and_filed_values(field_names, field_values):
    """Return a row as a dict pairing each field name with its value by position."""
    return dict(zip(field_names, field_values))


##### Функция парсит xml файл целиком

In [67]:
# Loading method -> function that appends one parsed row to its container.
LOADING_METHOD_TO_ADD_ROW_FUNCTION = dict(
    PANDAS=add_df_row,
    LIST=add_dict_row,
)

# Loading method -> factory that builds a row from field names and values.
LOADING_METHOD_TO_GET_ROW_FROM_FIELD_NAMES_AND_FIELD_VALUES_FUNCTION = dict(
    PANDAS=get_df_row_from_field_names_and_filed_values,
    LIST=get_dict_row_from_field_names_and_filed_values,
)


def parse_xml_file(path_to_xml_file, hjid, table_name_to_df_dict, tables_current_hjid, loading_method):
    """Parse one XML file and accumulate its rows in *table_name_to_df_dict*.

    Args:
        path_to_xml_file: Path of the XML document to parse.
        hjid: Identifier of the document.
        table_name_to_df_dict: Row containers per table, filled in place.
        tables_current_hjid: Running per-table counters, updated in place.
        loading_method: Key of the loading-method dispatch tables
            ('PANDAS' or 'LIST').

    Raises:
        KeyError: If *loading_method* is not a known method.
    """
    add_row_function = LOADING_METHOD_TO_ADD_ROW_FUNCTION[loading_method]
    build_row_function = LOADING_METHOD_TO_GET_ROW_FROM_FIELD_NAMES_AND_FIELD_VALUES_FUNCTION[loading_method]
    xml_root = ET.parse(path_to_xml_file).getroot()

    # MONTHLY_DETAIL is not searched for directly: parse_tag collects those
    # elements while it processes each LOAN.
    directly_parsed_tags = [tag for tag in TAGS if tag != 'MONTHLY_DETAIL']
    for tag in directly_parsed_tags:
        parse_tag(xml_root, hjid, table_name_to_df_dict, tables_current_hjid,
                  tag, add_row_function, build_row_function)


# parse_xml_file = timeit(parse_xml_file)



*  ***recreate_tables***
*  ***save_df_to_db***
*  ***save_singleformattype_to_db***

 Данные функции необходимо будет переопределить при работе с БД, отличной от Postgres

##### Функция, пересоздающая таблицы в БД

In [68]:
def recreate_tables(prefix, engine):
    """Drop and re-create, empty, the target table of every name in TABLE_NAMES.

    Each ``<prefix><table_name>`` table is dropped if it exists and then
    re-created with the column layout of ``adm.sf_<table_name>`` but no rows
    (``WHERE 1<>1`` selects nothing).

    Args:
        prefix: Text prepended to each table name (may include a schema).
        engine: Object whose ``execute`` method runs a SQL string.
    """
    for table_name in TABLE_NAMES:
        # The statements take no bind parameters, so nothing besides the SQL
        # text is passed to execute().
        drop_query = "DROP TABLE IF EXISTS {}{}".format(prefix, table_name)
        engine.execute(drop_query)
        create_query = "CREATE TABLE IF NOT EXISTS {prefix}{table_name} AS SELECT * FROM adm.sf_{table_name} WHERE 1<>1".format(prefix=prefix, table_name=table_name)
        engine.execute(create_query)


##### Функция, осущ. сохранение датафрейма в БД

In [69]:
def save_df_to_db(df_of_rows, tag, table_name, engine):
    """Insert every row of a DataFrame into *table_name* with one INSERT statement.

    Column labels are translated to DB column names through the name
    dictionary of *tag*; labels without a translation are used unchanged.
    Cell values are written with ``str()``, so string cells must already be
    quoted SQL literals. Missing cells (NaN/None) are written as ``NULL``.

    Args:
        df_of_rows: Rows to store; nothing is executed when it has no rows.
        tag: XML tag whose name dictionary is used for the column names.
        table_name: Full name of the target table.
        engine: Object whose ``execute`` method runs a SQL string.
    """
    if df_of_rows.shape[0] == 0:
        return

    names_dict = TAG_TO_ROW_NAMES_AND_DB_NAMES_DICT[tag]
    str_for_columns = ", ".join(names_dict.get(label, label) for label in df_of_rows.columns.values)

    # NULL is decided per cell: replacing the text "nan" in the joined row
    # would also corrupt any value that merely contains those letters.
    values_list = [
        "(" + ", ".join("NULL" if pd.isna(value) else str(value) for value in row) + ")"
        for row in df_of_rows.itertuples(index=False, name=None)
    ]
    insert_query = "INSERT INTO {} ({}) VALUES {}".format(table_name, str_for_columns, ",".join(values_list))
    print(insert_query)
    engine.execute(insert_query)
  

# save_df_to_db = timeit(save_df_to_db)


##### Функция, осущ. сохранение списка из словаря в БД

In [70]:
def save_list_to_db(lst_of_rows, tag, table_name, engine):
    """Insert a list of row dicts into *table_name* with one INSERT statement.

    The statement names its columns explicitly, using the DB column names
    from the name dictionary of *tag*, so it does not depend on the physical
    column order of the table. A row value is looked up first by the
    normalized field name and then by the DB column name itself, because
    bookkeeping fields such as ``hdp_datetime`` or ``loan_id`` are stored in
    the row under their real column names. Missing values are written as
    ``Null``; other values are written with ``str()``, so string values must
    already be quoted SQL literals.

    Args:
        lst_of_rows: Row dicts to store; nothing is executed when empty.
        tag: XML tag whose name dictionary defines the columns.
        table_name: Full name of the target table.
        engine: Object whose ``execute`` method runs a SQL string.
    """
    if len(lst_of_rows) == 0:
        return

    # (key used in the parsed rows, column name in the database)
    columns = list(TAG_TO_ROW_NAMES_AND_DB_NAMES_DICT[tag].items())

    def sql_value(row_dict, row_key, db_name):
        # Render one cell, falling back to the DB column name as the row key.
        value = row_dict.get(row_key)
        if value is None:
            value = row_dict.get(db_name)
        return 'Null' if value is None else str(value)

    list_of_str_for_values = [
        "(" + ','.join(sql_value(row_dict, row_key, db_name) for row_key, db_name in columns) + ")"
        for row_dict in lst_of_rows
    ]
    str_for_columns = ", ".join(db_name for _, db_name in columns)
    insert_query = "INSERT INTO {} ({}) VALUES {}".format(table_name, str_for_columns, ','.join(list_of_str_for_values))
    print(insert_query)
    engine.execute(insert_query)
  

# save_list_to_db = timeit(save_list_to_db)


##### Функция, осущ. сохранение singleformattype в БД

In [71]:
def save_singleformattype_to_db(hjids, prefix, engine):
    """Insert one ``<prefix>singleformattype`` row per document id.

    Every one of the eight listed columns of a row receives the same hjid.
    Nothing is executed when *hjids* is empty.

    Args:
        hjids: Document identifiers to store.
        prefix: Text prepended to the table name.
        engine: Object whose ``execute`` method runs a SQL string.
    """
    if len(hjids) == 0:
        return

    values_list = ["(" + ",".join([str(hj)] * 8) + ")" for hj in hjids]
    insert_template = (
        "INSERT INTO {}singleformattype (hjid, names_, loansoverview, loans, frauds, documents, scores, main)\n"
        "                      VALUES {};"
    )
    engine.execute(insert_template.format(prefix, ",".join(values_list)))


##### Функция, осущ. сохранение датафреймов по всем тэгам и singleformattype в БД

In [72]:
def save_rows_and_singleformattype_to_db(table_name_to_rows_dict,
                                         hjids,
                                         prefix,
                                         engine,
                                         save_rows_to_db):
    """Store the accumulated rows of every tag, then the singleformattype rows.

    Args:
        table_name_to_rows_dict: Row containers keyed by table base name.
        hjids: Document identifiers of the current batch.
        prefix: Text prepended to every target table name.
        engine: Connection object passed through to the save functions.
        save_rows_to_db: Called as
            ``save_rows_to_db(rows, tag=..., table_name=..., engine=...)``.
    """
    for tag in TAGS:
        base_table_name = get_table_name_by_tag(tag)
        rows_of_table = table_name_to_rows_dict[base_table_name]
        save_rows_to_db(rows_of_table, tag=tag, table_name=prefix + base_table_name, engine=engine)
    save_singleformattype_to_db(hjids, prefix, engine)


##### Основная функция c логикой загрузки



In [73]:
# Loading method -> constructor of an empty per-table row container.
LOADING_METHOD_TO_CONTAINER_FOR_ROWS_CONSTRUCTOR_MAPPING = dict(
    PANDAS=pd.DataFrame,
    LIST=list,
)

# Loading method -> function that writes one table's container to the DB.
LOADING_METHOD_TO_SAVE_ROWS_TO_DB_FUNCTION = dict(
    PANDAS=save_df_to_db,
    LIST=save_list_to_db,
)


def make_load(recreate_tables: Callable,
              save_rows_and_singleformattype_to_db: Callable,
              engine: sqlalchemy.engine.base.Engine,
              prefix: str,
              path_to_folder_with_xml_files: str,
              loading_size: int,
              loading_method: str):
    """Parse every ``*.xml`` file of a folder and store the rows in batches.

    The target tables are re-created first. Files are then parsed one by one;
    after every *loading_size* files the accumulated rows are saved and the
    containers are emptied. Rows left over after the last file are saved at
    the end.

    Args:
        recreate_tables: Called as ``recreate_tables(prefix, engine)``.
        save_rows_and_singleformattype_to_db: Called as
            ``f(table_name_to_rows_dict, hjids, prefix, engine, save_rows_to_db)``
            for every batch.
        engine: Connection object passed through to the callables.
        prefix: Text prepended to every target table name.
        path_to_folder_with_xml_files: Folder that holds the XML files. The
            part of each file name before the first dot is used as the
            document hjid and must be an integer.
        loading_size: Number of files per batch.
        loading_method: 'PANDAS' or 'LIST'.

    Raises:
        KeyError: If *loading_method* is unknown (raised before any table is
            dropped).
        ValueError: If an XML file name does not start with an integer.
    """
    # Resolve the method-specific helpers first so that an invalid
    # loading_method fails before recreate_tables drops anything.
    container_for_rows_constructor = LOADING_METHOD_TO_CONTAINER_FOR_ROWS_CONSTRUCTOR_MAPPING[loading_method]
    save_rows_to_db = LOADING_METHOD_TO_SAVE_ROWS_TO_DB_FUNCTION[loading_method]

    tables_current_hjid = {}
    recreate_tables(prefix, engine)

    table_names = [get_table_name_by_tag(tag) for tag in TAGS]
    table_name_to_rows_dict = {name: container_for_rows_constructor() for name in table_names}

    hjids = []
    for filename in os.listdir(path_to_folder_with_xml_files):
        if not filename.endswith(".xml"):
            continue
        path_to_xml_file = os.path.join(path_to_folder_with_xml_files, filename)
        hjid = int(filename.split('.')[0])
        hjids.append(hjid)
        parse_xml_file(path_to_xml_file, hjid, table_name_to_rows_dict, tables_current_hjid, loading_method)
        if len(hjids) == loading_size:
            save_rows_and_singleformattype_to_db(table_name_to_rows_dict, hjids, prefix, engine, save_rows_to_db)
            # Start the next batch with empty containers.
            for name in table_names:
                table_name_to_rows_dict[name] = container_for_rows_constructor()
            hjids = []

    # Flush the last, incomplete batch, if any.
    if hjids:
        save_rows_and_singleformattype_to_db(table_name_to_rows_dict, hjids, prefix, engine, save_rows_to_db)


## **Скрипт на парсинг**

In [None]:
# Table base name -> column that links a row to its parent record.
# get_row_from_SF_item stores the document hjid in that column (for
# 'monthlydetailtype' it stores the current loan counter in 'loan_id'), and
# gives the row its own running 'hjid' whenever the entry is not 'hjid'.
TABLES_HJID = {
    'monthlydetailtype': 'loan_id',
    'loansoverviewtype': 'hjid',
    'loanstype': 'loanstypes_loan_hjid',
    'maintype': 'hjid',
    'nametype': 'nametypes_name__hjid',
    'scoretype': 'scoretypes_score_hjid',
    'fraudtype': 'fraudtypes_fraud_hjid',
}

# Prefix (including the schema) prepended to every target table name.
prefix = 'adm.ad_sf_'
path_to_folder_with_xml_files = os.getcwd() + '/parsed_xml2' # folder that contains the XML files
loading_size = 50 # rows are saved to the DB after every `loading_size` files

# Timed variant of make_load: prints the duration of each full load.
make_load_timed = timeit(make_load)

# Run the load with rows accumulated as lists of dicts.
make_load_timed(recreate_tables=recreate_tables,
                save_rows_and_singleformattype_to_db=save_rows_and_singleformattype_to_db,
                engine=engine,
                prefix=prefix,
                path_to_folder_with_xml_files=path_to_folder_with_xml_files,
                loading_size=loading_size,
                loading_method='LIST')

In [None]:
# Run the same load with rows accumulated in pandas DataFrames.
# make_load calls recreate_tables first, so the tables filled by a previous
# run are dropped and re-created before this load starts.
make_load_timed(recreate_tables=recreate_tables,
                save_rows_and_singleformattype_to_db=save_rows_and_singleformattype_to_db,
                engine=engine,
                prefix=prefix,
                path_to_folder_with_xml_files=path_to_folder_with_xml_files,
                loading_size=loading_size,
                loading_method='PANDAS')