In [7]:
import io
import os
import re
import glob
import shutil
import PyPDF2
import zipfile
from enum import Enum
import win32com.client

import pdfminer 
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

In [8]:
class DocumentClass(Enum):
    """Document categories processed by this notebook.

    The integer value of each member doubles as the name of the per-category
    subfolder that the driver loop reads ``.msg`` files from and moves the
    extracted results to.
    """
    # NOTE(review): the member names look like Spanish court-document
    # categories (admissions, stamped claims, mortgage proceedings, ...) --
    # confirm the exact meaning with the data owner.
    ADMISIONES = 1
    DEMANDAS_SELLADAS_EJECUCION = 2
    DEMANDAS_SELLADAS_MONITORIO = 3
    FIN_DE_MONITORIO = 4
    HIPOTECARIO_PLAZO = 5
    HIPOTECARIO_TRAMITE = 6
    INADMISIONES = 7
    MANDAMIENTO = 8

In [9]:
def striprtf(text):
    """Strip RTF markup from ``text`` and return the remaining plain text.

    The input is tokenised with a single regular expression into control
    words (``\\foo``, optionally with a numeric argument), hex escapes
    (``\\'xx``), control symbols (``\\x``), braces and literal characters.
    Groups opened by a "destination" control word, or marked with ``\\*``,
    are skipped entirely, together with everything nested inside them.

    Args:
        text: RTF document contents as a ``str``.

    Returns:
        The extracted text. A handful of control words (``\\par``, ``\\tab``,
        quotes, dashes, ...) are translated to their Unicode equivalents.
    """
    pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
    # Control words which specify a "destination"; their group is not body text.
    destinations = frozenset((
        'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
        'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
        'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
        'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
        'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
        'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
        'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
        'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
        'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
        'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
        'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
        'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
        'listoverridetable','listpicture','liststylename','listtable','listtext',
        'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
        'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
        'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
        'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
        'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
        'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
        'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
        'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
        'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
        'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
        'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
        'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
        'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
        'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
        'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
        'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
        'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
        'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
        'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
        'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
        'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
        'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
        'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
        'svb','tc','template','themedata','title','txe','ud','upr','userprops',
        'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
        'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
        'xmlopen',
    ))
    # Translation of some special characters.
    specialchars = {
        'par': '\n',
        'sect': '\n\n',
        'page': '\n\n',
        'line': '\n',
        'tab': '\t',
        'emdash': '\u2014',
        'endash': '\u2013',
        'emspace': '\u2003',
        'enspace': '\u2002',
        'qmspace': '\u2005',
        'bullet': '\u2022',
        'lquote': '\u2018',
        'rquote': '\u2019',
        # Was '\201C': an octal escape (U+0081) followed by a literal "C".
        'ldblquote': '\u201C',
        'rdblquote': '\u201D',
    }
    stack = []              # Saved (ucskip, ignorable) state of the enclosing groups.
    ignorable = False       # Whether this group (and all inside it) are "ignorable".
    ucskip = 1              # Number of ASCII characters to skip after a unicode character.
    curskip = 0             # Number of ASCII characters left to skip.
    out = []                # Output buffer.
    for match in pattern.finditer(text):
        word, arg, hex_byte, char, brace, tchar = match.groups()
        if brace:
            curskip = 0
            if brace == '{':
                # Push state.
                stack.append((ucskip, ignorable))
            elif stack:
                # Pop state. A stray '}' with nothing to pop is ignored so
                # that malformed input does not abort the whole conversion.
                ucskip, ignorable = stack.pop()
        elif char:  # \x (not a letter)
            curskip = 0
            if char == '~':
                if not ignorable:
                    out.append('\xA0')
            elif char in '{}\\':
                if not ignorable:
                    out.append(char)
            elif char == '*':
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in destinations:
                ignorable = True
            elif ignorable:
                pass
            elif word in specialchars:
                out.append(specialchars[word])
            elif word == 'uc':
                # \ucN needs its numeric argument; without one keep the current value.
                if arg is not None:
                    ucskip = int(arg)
            elif word == 'u':
                # \uN carries a signed 16-bit code unit; negative values wrap around.
                if arg is not None:
                    c = int(arg)
                    if c < 0:
                        c += 0x10000
                    out.append(chr(c))
                    # The next `ucskip` characters are the ASCII fallback; drop them.
                    curskip = ucskip
        elif hex_byte:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                # The byte value is used directly as the code point.
                out.append(chr(int(hex_byte, 16)))
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)
    return ''.join(out)

In [10]:
def save_to_file(path, source):
    """Write the string ``source`` to ``path`` as UTF-8 text.

    The file is opened in write mode, so an existing file at ``path`` is
    replaced.
    """
    with open(path, mode='w', encoding='utf-8') as out_file:
        out_file.write(source)

def folder_dot_check(folder_name):
    """Return ``folder_name`` stripped of surrounding whitespace, with a
    trailing ``.`` replaced by the literal text ``dot``.

    An empty or whitespace-only name is returned as ``''`` instead of raising.

    NOTE(review): presumably needed because Windows does not keep a trailing
    period in directory names -- confirm.
    """
    folder_name = folder_name.strip()
    # endswith() is safe on an empty string, unlike indexing the last character.
    if folder_name.endswith('.'):
        return folder_name[:-1] + 'dot'
    return folder_name

def pdf_to_txt(path):
    """Extract the text of every page of the PDF at ``path``.

    Args:
        path: Path to a PDF file.

    Returns:
        The text of all pages, concatenated in page order, as one ``str``.

    Pages are requested with ``check_extractable=True``, so pdfminer may
    refuse documents that do not permit text extraction.
    """
    # One resource manager / converter / interpreter serves the whole
    # document, so shared resources are cached across pages and all page
    # text accumulates in a single buffer.
    resource_manager = PDFResourceManager()
    text_buffer = io.StringIO()
    converter = TextConverter(resource_manager, text_buffer)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)
        return text_buffer.getvalue()
    finally:
        # Release the converter and buffer even when a page fails to parse.
        converter.close()
        text_buffer.close()

def rtf_to_txt(rtf_path):
    """Read the RTF file at ``rtf_path`` and return its plain-text content.

    The file is decoded as UTF-8 with undecodable bytes dropped, then passed
    through ``striprtf``.
    """
    with open(rtf_path, 'r', encoding='utf-8', errors='ignore') as rtf_file:
        raw_rtf = rtf_file.read()
    return striprtf(raw_rtf)

def zip_extract(zip_path):
    """Unpack every member of the archive at ``zip_path`` into the directory
    that contains the archive. The archive itself is left in place."""
    target_dir = os.path.dirname(zip_path)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(target_dir)
    
def convert_file_to_txt(file_path):
    """Normalise one attachment to plain text, in place, based on its extension.

    The extension check is case-insensitive.

    * ``.txt``  - left untouched.
    * ``.pdf`` / ``.rtf`` - converted to a sibling ``.txt`` file, then the
      original is deleted.
    * ``.zip``  - extracted next to the archive, then the archive is deleted
      (the extracted members are not converted here).
    * ``.bin`` / ``.png`` / ``.jpg`` / ``.jpeg`` / ``.gif`` - deleted.
    * ``.msg``  - not supported (nested mail); reported and rejected.
    * anything else - reported and rejected.

    Args:
        file_path: Path to the attachment on disk.

    Returns:
        ``True`` if the file was handled, ``False`` for ``.msg`` and unknown
        extensions.
    """
    lowered = file_path.lower()

    if lowered.endswith('.msg'):
        print('msg inside')
        return False

    if lowered.endswith('.txt'):
        return True

    if lowered.endswith('.zip'):
        zip_extract(file_path)
        os.remove(file_path)
        return True

    # Attachments that carry no extractable text are simply dropped.
    if lowered.endswith(('.bin', '.png', '.jpg', '.jpeg', '.gif')):
        os.remove(file_path)
        return True

    if lowered.endswith('.pdf'):
        text = pdf_to_txt(file_path)
    elif lowered.endswith('.rtf'):
        text = rtf_to_txt(file_path)
    else:
        print('error, file_path: ', file_path)
        return False

    # Write the text first and delete the source only afterwards, so a failed
    # write cannot lose the original document.
    save_to_file(file_path[:-3] + 'txt', text)
    os.remove(file_path)
    return True

In [11]:
def uzip_mail(path_to_msg_file):
    """Unpack an Outlook ``.msg`` file into a folder of plain-text files.

    A folder named after the message (without its ``.msg`` suffix, passed
    through ``folder_dot_check``) is created next to the message. The mail
    body is saved there as ``<filename>.txt`` and every attachment is saved
    into an ``a`` subfolder and converted with ``convert_file_to_txt``.

    Args:
        path_to_msg_file: Path to the ``.msg`` file.

    Returns:
        ``True`` when every attachment was converted, ``False`` as soon as
        one attachment is rejected by ``convert_file_to_txt``. In the latter
        case the partially filled folder is left on disk.

    Raises:
        OSError: if the target folder cannot be created (for example because
            it already exists).
    """
    directory, filename = os.path.split(path_to_msg_file)
    unzip_dir = folder_dot_check(os.path.join(directory, filename[:-4]))
    os.mkdir(unzip_dir)

    outlook = win32com.client.Dispatch('Outlook.Application').GetNamespace('MAPI')
    msg = outlook.OpenSharedItem(path_to_msg_file)
    try:
        save_to_file(os.path.join(unzip_dir, filename) + '.txt', msg.Body)

        attachments_dir = os.path.join(unzip_dir, 'a')
        os.mkdir(attachments_dir)
        for att in msg.Attachments:
            att.SaveAsFile(os.path.join(attachments_dir, att.FileName))

        # Two passes: the first may unpack zip archives, the second converts
        # whatever those archives contained. Deeper nesting is not handled.
        for _ in range(2):
            for name in os.listdir(attachments_dir):
                if not convert_file_to_txt(os.path.join(attachments_dir, name)):
                    return False
        return True
    finally:
        # Always close the item (1 = olDiscard, i.e. close without saving),
        # including on the early-return and error paths, which previously
        # left it open.
        msg.Close(1)

In [151]:
# Known limitations:
#   - does not work for mails attached inside other mails
#   - does not work for zip archives nested inside zip archives in mails
#   - TODO: wrapping the per-mail work in a try block would be useful

for doc_class in DocumentClass:
    # Every document class has its own numbered subfolder under both roots.
    class_subfolder = str(doc_class.value) + '\\'
    path_to_output_folder = 'C:\\o\\' + class_subfolder
    path_to_folder = 'C:\\m\\' + class_subfolder
    extension_to_find = '*.msg'
    msg_files = glob.glob(path_to_folder + extension_to_find)

    for file_path in msg_files:
        msg_folder = file_path[:-4]
        print('msg_folder: ', msg_folder)

        # Only mails that were unpacked completely are moved to the output
        # tree and removed from the input; failed ones stay where they are.
        if uzip_mail(file_path):
            msg_folder = folder_dot_check(msg_folder)
            output_folder = os.path.join(path_to_output_folder, os.path.basename(msg_folder))
            shutil.move(msg_folder, output_folder)

            os.remove(file_path)

msg_folder:  C:\m\8\URGENTE - SAKHIR - 01135331-LF
