In [1]:
import os
from tqdm import tqdm
import re
import sys
import json
import base64
import hashlib
import requests
from typing import Union
from requests.auth import HTTPBasicAuth
from requests.exceptions import HTTPError

In [2]:
# Make the parent of the current working directory importable, so that
# project-level packages can be imported from this notebook.
project_path = os.path.abspath('..')
already_listed = any(entry == project_path for entry in sys.path)
if not already_listed:
    sys.path.append(project_path)
sys.path

['C:\\Users\\Filipp\\PycharmProjects\\Invoice_scanner\\src',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3\\python311.zip',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3\\DLLs',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3\\Lib',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3',
 '',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3\\Lib\\site-packages',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3\\Lib\\site-packages\\win32',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3\\Lib\\site-packages\\win32\\lib',
 'C:\\Users\\Filipp\\AppData\\Local\\anaconda3\\Lib\\site-packages\\Pythonwin',
 'C:\\Users\\Filipp\\PycharmProjects\\Invoice_scanner']

In [3]:
from config.config import config

In [4]:
# HTTP Basic credentials built from the project config (keys 'user_1C' and
# 'password_1C'); passed as `auth=` to the requests calls in this notebook.
auth = HTTPBasicAuth(config['user_1C'], config['password_1C'])

### Получение списка номеров счетов; получение деталей по данным счетам

In [5]:
# Fetch the list of invoice numbers for a date range.

# Months 04-06 are already available.
month1 = '07'
month2 = '08'

date_1 = f'01-{month1}-2024'
date_2 = f'01-{month2}-2024'

# A (connect, read) timeout keeps an unreachable or stalled server from hanging
# the kernel indefinitely; raise_for_status() surfaces HTTP errors directly
# instead of letting them show up as a confusing JSON decoding failure.
response = requests.get(
    f'http://10.10.0.10:81/ca/hs/interaction/InvoicesByDate/{date_1}/{date_2}',
    auth=auth,
    timeout=(10, 300),
)
response.raise_for_status()
deal_numbers_list = response.json()
print(f"length: {len(deal_numbers_list)}")

print(f"deal_numbers: {deal_numbers_list[0:3]}")

length: 2002
deal_numbers: ['0С000000000000137236', '0С000000000000138343', '0С000000000000138344']


In [None]:
# Fetch the details of every invoice listed in deal_numbers_list.

numbers = []         # detail payloads, one per successfully fetched invoice
failed_numbers = []  # invoice numbers whose request or JSON decoding failed
for number in tqdm(deal_numbers_list):
    try:
        response = requests.get(
            f'http://10.10.0.10:81/ca/hs/interaction/InvoiceDetailsByNumber/{number}',
            auth=auth,
            timeout=(10, 60),
        )
        response.raise_for_status()
        res = response.json()
    except (requests.RequestException, ValueError) as err:
        # Best effort: report the failing invoice number and carry on.
        # Catching only request/decoding errors (rather than a bare `except:`)
        # keeps KeyboardInterrupt able to stop this long-running loop.
        print(number, err)
        failed_numbers.append(number)
        continue
    numbers.append(res)

In [None]:
# Preview the first fetched invoice-detail record.
numbers[0]

In [None]:
# Save the fetched invoice details to a JSON file named after month1.

file = os.path.join(
    config['BASE_DIR'],
    'config',
    'generated_services',
    f'result_numbers_{month1}.json',
)
with open(file, mode='w', encoding='utf-8') as json_out:
    json.dump(numbers, json_out, ensure_ascii=False, indent=4)

### Фильтрация деталей

In [6]:
# Load the list of frequent services from the project config directory.

file = os.path.join(
    config['BASE_DIR'],
    'config',
    'freq_services.json',
)
with open(file, mode='r', encoding='utf-8') as json_in:
    freq_services = json.loads(json_in.read())


def remove_special_characters(text):
    """
    Normalize a string for comparison.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, and then each run of two or more
    whitespace characters (and each single newline) is replaced by one space.
    Leading and trailing spaces are not stripped.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'(\s{2,}|\n)', ' ', without_punctuation)
    
# Normalize every frequent-service name with remove_special_characters so the
# list can be compared against service names normalized the same way.
freq_services = [remove_special_characters(service) for service in freq_services]

In [7]:
# Определение функций очистки

# hash

def calculate_hash(file_path):
    """
    Return the MD5 digest of the file at ``file_path`` as a hexadecimal string.

    The file is opened in binary mode and consumed in 4096-byte chunks, so the
    whole file is never held in memory at once.
    """
    digest = hashlib.md5()

    with open(file_path, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                # An empty read means end of file.
                break
            digest.update(chunk)

    return digest.hexdigest()


def is_single_hash(lst: list[str]) -> str | None:
    """
    lst: result['ФайлыСчета'] from result = requests.get(fr'http://.../InvoiceDetailsByNumber/{deal}', auth=auth).json()
    """
    if not list:
        return None
        
    res = [calculate_hash(file) for file in lst]
        
    if len(set(res)) == 1:
        return lst[0]


# frequent services

def is_all_services_freq(lst: list, freq_services: list) -> bool:
    """
    Tell whether every service named in the invoice rows is a frequent service.

    lst: result['СтрокиСчета'] from result = requests.get(fr'http://.../InvoiceDetailsByNumber/{deal}', auth=auth).json()
    freq_services: list of frequently used services (cleared)

    Each row's 'Услуга' value is normalized with remove_special_characters
    before the comparison. An empty ``lst`` yields True.
    """
    names = [row['Услуга'] for row in lst]
    normalized = {remove_special_characters(name) for name in names}
    return normalized.issubset(freq_services)

In [8]:
# Load the invoice details saved for the month being processed. The file name
# is derived from month1 so that it always matches the month used for the
# filtered output file written later in this notebook.
file = os.path.join(config['BASE_DIR'], 'config', 'generated_services', f'result_numbers_{month1}.json')
with open(file, 'r', encoding='utf-8') as f:
    result = json.load(f)

print(len(result))

1963


In [None]:
# Keep only the invoices that contain at least one service outside the
# frequent-services list and for which is_single_hash returns a path
# (i.e. all attached files share one hash).
filtered_response_list = []
for res in tqdm(result):

    raws = res['СтрокиСчета']   # invoice rows
    files = res['ФайлыСчета']   # paths of the files attached to the invoice

    if is_all_services_freq(raws, freq_services):
        continue
    if not is_single_hash(files):
        continue

    filtered_response_list.append(res)

# Report against `result`, the list that was actually filtered; the list of
# requested numbers from the download step may have a different length.
print(f"{len(result)} --> {len(filtered_response_list)}")

In [10]:
# Preview the first record that passed the filter.
filtered_response_list[0]

{'СтрокиСчета': [{'Услуга': 'Терминальная обработка',
   'УслугаКод': '000000421',
   'ТранспортнаяПозиция': 'CICU9764876',
   'Количество': 1,
   'Цена': 10500}],
 'ФайлыСчета': ['\\\\10.10.0.3\\docs\\Baltimpex\\Invoice\\Import\\АА-0093285\\T1_ОФ-00011348_71c5acb7-b8f5-4cda-b618-bd631ff0c088_СчетНаОплату.pdf']}

In [11]:
# Save the filtered invoice details to a JSON file named after month1.

file = os.path.join(
    config['BASE_DIR'],
    'config',
    'generated_services',
    f'filtered_result_numbers_{month1}.json',
)
with open(file, mode='w', encoding='utf-8') as json_out:
    json.dump(filtered_response_list, json_out, ensure_ascii=False, indent=4)