In [54]:
import mailbox
import os
import email.header
import pandas as pd

In [55]:
def decode_mime_words(encoded_str):
    try:
        s = ' '.join(part.decode() if charset is None else part.decode(charset)
                        for part, charset in email.header.decode_header(encoded_str))
        return s
    except:
        return encoded_str

In [109]:
def is_bank(bankname, sender):
    try:
        if bankname in sender.lower():
            return True
        else:
            return False
    except:
        return False

In [57]:
def save_attachment(attachment, output_dir):
    file_path = os.path.join(output_dir, attachment.get_filename())
    with open(file_path, 'wb') as f:
        f.write(attachment.get_payload(decode=True))

In [58]:
mbox = mailbox.mbox(r"C:\Users\paulm\Downloads\taxdata\Takeout\Mail\All mail Including Spam and Trash.mbox")

In [105]:
mbox[0]['message-id']

'<238686512.337793.1690608996951@ip-10-11-58-199.eu-west-1.compute.internal>'

In [116]:
records = []
for message in mbox:
    # Accessing message attributes
    subject = message['subject']
    from_address = message['from']
    date_sent = message['date']
    msg_id = message['message-id']
    records.append((msg_id, subject, from_address, date_sent))

df = pd.DataFrame.from_records(records, columns=["id", "subject", "sender", "date"])

In [120]:
df['subject'] = df['subject'].apply(lambda x: decode_mime_words(x))
df['sender'] = df['sender'].apply(lambda x: decode_mime_words(x))

In [121]:
hdfc = df[df['sender'].apply(lambda x: is_bank('hdfc', x))]
icici = df[df['sender'].apply(lambda x: is_bank('icici', x))]

In [122]:
hdfc.reset_index(drop=True, inplace=True)
icici.reset_index(drop=True, inplace=True)

In [123]:
hdfc_statement = hdfc[hdfc['sender'].str.contains("hdfcbanksmartstatement@hdfcbank.net")]

In [138]:
hdfc_statement['clean_date'] = hdfc_statement.date.apply(lambda x: pd.to_datetime(x).date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hdfc_statement['clean_date'] = hdfc_statement.date.apply(lambda x: pd.to_datetime(x).date())


In [133]:
icici_statement = icici[icici.subject.str.contains('Bank Statement')]

In [139]:
icici_statement['clean_date'] = icici_statement.date.apply(lambda x: pd.to_datetime(x).date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  icici_statement['clean_date'] = icici_statement.date.apply(lambda x: pd.to_datetime(x).date())


In [143]:
is_id = icici_statement[icici_statement['clean_date'] > pd.to_datetime('2021-04-01').date()].id.to_list()

In [145]:
hs_id = hdfc_statement[hdfc_statement['clean_date'] > pd.to_datetime('2021-04-01').date()].id.to_list()

In [146]:
for message in mbox:
    # Accessing message attributes
    id = message['message-id']
    if id in is_id:
        for part in message.walk():
            filename = part.get_filename()
            if filename:
                print("ICICI", filename)
                save_attachment(part, r"C:\Users\paulm\Downloads\taxdata\bank_statements\icici")
    elif id in hs_id:
        for part in message.walk():
            filename = part.get_filename()
            if filename:
                print("HDFC", filename)
                save_attachment(part, r"C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc")

ICICI Statement_2023MTH06_906567883.pdf
HDFC Mitaash_Paul_31052023_161850948.pdf
HDFC Mitaash_Paul_30062023_192918217.pdf
ICICI Statement_2023MTH05_906567883.pdf
ICICI Statement_APR2022_883906567.pdf
ICICI Statement_2023MTH04_906567883.pdf
HDFC Mitaash_Paul_30042023_222208643.pdf
HDFC Mitaash_Paul_31032023_021731303.pdf
ICICI Statement_2023MTH03_906567883.pdf
HDFC Mitaash_Paul_28022023_005415795.pdf
ICICI Statement_2023MTH02_906567883.pdf
ICICI Statement_2023MTH01_906567883.pdf
HDFC Mitaash_Paul_31012023_151856967.pdf
HDFC Mitaash_Paul_31122022_004656744.pdf
HDFC Mitaash_Paul_30112022_175113348.pdf
HDFC Mitaash_Paul_31102022_191351866.pdf
ICICI Statement_2022MTH12_906567883.pdf
ICICI Statement_2022MTH11_906567883.pdf
HDFC Mitaash_Paul_30092022_205252923.pdf
ICICI Statement_2022MTH07_906567883.pdf
HDFC Mitaash_Paul_31082022_142530514.pdf
ICICI Statement_2022MTH04_906567883.pdf
ICICI Statement_2021MTH12_906567883.pdf
ICICI Statement_2021MTH11_906567883.pdf
ICICI Statement_2022MTH08_90656

In [1]:
import os
import glob
import PyPDF2

def unlock_pdf(input_folder, output_folder, password):
    os.makedirs(output_folder, exist_ok=True)

    pdf_files = glob.glob(os.path.join(input_folder, '*.pdf'))

    for input_path in pdf_files:
        file_name = os.path.basename(input_path)
        output_path = os.path.join(output_folder, file_name)

        with open(input_path, 'rb') as file:
            pdf = PyPDF2.PdfReader(file)
            if pdf.decrypt(password):
                writer = PyPDF2.PdfWriter()
                for page_num in range(len(pdf.pages)):
                    page = pdf.pages[page_num]
                    writer.add_page(page)

                with open(output_path, 'wb') as output_file:
                    writer.write(output_file)
                print(f"PDF '{file_name}' successfully unlocked and saved to '{output_path}'")
            else:
                print(f"Incorrect password for PDF '{file_name}'. Unable to unlock the PDF.")


unlock_pdf(
    input_folder=r"C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc",
    output_folder=r"C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc_unlocked",
    password='47004974')

unlock_pdf(
    input_folder=r"C:\Users\paulm\Downloads\taxdata\bank_statements\icici",
    output_folder=r"C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked",
    password='180401552967')

PDF 'Mitaash_Paul_28022023_005415795.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc_unlocked\Mitaash_Paul_28022023_005415795.pdf'
PDF 'Mitaash_Paul_30042023_222208643.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc_unlocked\Mitaash_Paul_30042023_222208643.pdf'
PDF 'Mitaash_Paul_30062023_192918217.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc_unlocked\Mitaash_Paul_30062023_192918217.pdf'
PDF 'Mitaash_Paul_30092022_205252923.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc_unlocked\Mitaash_Paul_30092022_205252923.pdf'
PDF 'Mitaash_Paul_30112022_175113348.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\hdfc_unlocked\Mitaash_Paul_30112022_175113348.pdf'
PDF 'Mitaash_Paul_31012023_151856967.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata

ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed


PDF 'Statement_2022MTH01_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH01_906567883.pdf'
PDF 'Statement_2022MTH02_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH02_906567883.pdf'
PDF 'Statement_2022MTH03_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH03_906567883.pdf'
PDF 'Statement_2022MTH04_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH04_906567883.pdf'
PDF 'Statement_2022MTH05_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH05_906567883.pdf'
PDF 'Statement_2022MTH06_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\

ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed


PDF 'Statement_2022MTH08_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH08_906567883.pdf'
PDF 'Statement_2022MTH09_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH09_906567883.pdf'
PDF 'Statement_2022MTH10_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH10_906567883.pdf'
PDF 'Statement_2022MTH11_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH11_906567883.pdf'
PDF 'Statement_2022MTH12_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2022MTH12_906567883.pdf'
PDF 'Statement_2023MTH01_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\

ignore '/Perms' verify failed
ignore '/Perms' verify failed
ignore '/Perms' verify failed


PDF 'Statement_2023MTH03_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2023MTH03_906567883.pdf'
PDF 'Statement_2023MTH04_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2023MTH04_906567883.pdf'
PDF 'Statement_2023MTH05_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2023MTH05_906567883.pdf'
PDF 'Statement_2023MTH06_906567883.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_2023MTH06_906567883.pdf'
PDF 'Statement_APR2022_883906567.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_unlocked\Statement_APR2022_883906567.pdf'
PDF 'Statement_AUG2020_883906567.pdf' successfully unlocked and saved to 'C:\Users\paulm\Downloads\taxdata\bank_statements\icici_