# Import libraries

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import swifter
from unidecode import unidecode

In [None]:
from functions.utils import *
from functions.mapping_virements import *
from functions.clean_data import *
from functions.check_clientops import *

In [None]:
import datetime as dt
import re
import io

In [None]:
import tempfile

# Context

__I. Données reçues__
1. Données de relevé bancaire
2. Suivi de ClientOps
3. Données de BO sur les mouvements
4. Connection Démembrement
5. Données sur les souscriptions
6. Le champ Commentaire dans BO
7. Régul compte à compte

__II. Nom des variables__

La structure des données (relevé bancaire et ClientOps) diffère légèrement avant et après 06/2021, ce qui demande des traitements spécifiques à chaque période.
- Dans ce notebook, toutes les variables qui se terminent par 1 == données depuis 06/2021
- Toutes les variables qui se terminent par 2 == données avant 2020 (ou aussi entre 2021 et 06/2021 si les données sont cumulées)
- Toutes celles qui se terminent par 3 == données de relevé bancaire entre 01/2020 et 06/2021

(désolée pour l'ordre qui n'est pas cohérent, mais c'est l'ordre dans lequel nous avons reçu les données ; je les ai donc renommées au fur et à mesure)

__III. Cheminement de traitement des données__ :
1. Importer les données de différentes périodes
2. Retraiter les données en faisant des contrôles pour s'assurer qu'on ne perd pas de lignes au cours du traitement
3. Rapprocher les données entre relevé bancaire vs BO
4. Ajouter les commentaires sur le résultat obtenu

In [None]:
# Reference list of BIC codes, read from the "BIC" sheet of the config workbook.
df_list_bic = pd.read_excel(
    "../_config/liste_des_BIC.xlsx",
    sheet_name="BIC",
)
list_bic = [bic for bic in df_list_bic["BIC"]]

In [None]:
# Noise-word lists: one generic sheet plus one sheet per country (PT, NL, AT).
# Each list is joined with "|" into an alternation pattern; the country patterns
# also include the generic words.
_feuilles_parasites = pd.read_excel(
    "../_config/mots_parasites.xlsx",
    sheet_name=["Sheet1", "PT", "NL", "AT"],
    header=None,
)
df_motsprasites = _feuilles_parasites["Sheet1"]
list_motsparasites_nl = _feuilles_parasites["NL"][0].to_list()
list_motsparasites_at = _feuilles_parasites["AT"][0].to_list()

pattern_clean_motif = "|".join(list(df_motsprasites[0]))
motsparasites_pt = "|".join(list(_feuilles_parasites["PT"][0])) + "|" + pattern_clean_motif
motsparasites_nl = "|".join(list_motsparasites_nl) + "|" + pattern_clean_motif
motsparasites_at = "|".join(list_motsparasites_at) + "|" + pattern_clean_motif

# Import data

In [None]:
# Root folder of the project data; every raw_data/ and transformed_data/ path below is built from it.
# NOTE(review): "xxxx" looks like a placeholder to replace with the real location - confirm.
folder_path = "xxxx"

## Commentaire BO

In [None]:
# BO annotations export (semicolon-separated CSV).
df_commentaire = pd.read_csv(folder_path + "/raw_data/0. Data BO/Annotation/Annotation.csv",sep=";")

## Relevé bancaire - depuis 0621

In [None]:
# Bank statements since June 2021: one workbook per fund, all on sheet "1".
releve_path = folder_path + "/raw_data/0. Data Releve bancaire/0. BS since June 2021/"
fichier_CO = "CO_BNP_FR.xlsx"
fichier_EU = "Eurion_BNP_FR.xlsx"
fichier_XL = "XL_BNP_FR.xlsx"


def _lire_releve_et_entete(fichier, fonds):
    """Read one statement workbook and return (statement lines, header area).

    The lines are read with the column names on row index 8 and tagged with the
    fund code in a "Fonds" column; the header area is the first 7 rows of
    columns A:E, read without column names.
    """
    chemin = releve_path + fichier
    lignes = pd.read_excel(chemin, sheet_name="1", header=8)
    entete = pd.read_excel(chemin, sheet_name="1", nrows=7, usecols='A:E', header=None)
    lignes["Fonds"] = fonds
    return lignes, entete


df_releve_bancaire_CO, df_headers_CO = _lire_releve_et_entete(fichier_CO, "CC")
df_releve_bancaire_EU, df_headers_EU = _lire_releve_et_entete(fichier_EU, "EU")
df_releve_bancaire_XL, df_headers_XL = _lire_releve_et_entete(fichier_XL, "XL")

## Relevé bancaire - entre 2020-062021

In [None]:
### 2020 and H1 2021 statements: one collection of workbooks per fund sub-folder
# (read_list_xlsx is a project helper; it presumably returns a {filename: DataFrame} mapping - confirm)
dir_releve_pdf = folder_path + "/raw_data/0. Data Releve bancaire/2. BS before AMB (New files 2020 & H1 2021)/"
dict_files_CC_3 = read_list_xlsx(dir_releve_pdf,"1. Origin")
dict_files_XL_3 = read_list_xlsx(dir_releve_pdf,"2. XL/V2 Word")
dict_files_EU_3 = read_list_xlsx(dir_releve_pdf,"3. Eurion")

## Relevé bancaire - avant 2020

In [None]:
# CC direct-debit account statements; the empty second argument means no sub-folder is appended
# (presumably - confirm against read_list_xlsx).
dir_releve_pdf = folder_path + "/raw_data/0. Data Releve bancaire/4. CC direct debit account/"
dict_files_CO_prlv = read_list_xlsx(dir_releve_pdf,"")

In [None]:
### There are many historical files, so each one is first read individually; they are cleaned and
### concatenated per fund later in the notebook.
dir_releve_pdf = folder_path + "/raw_data/0. Data Releve bancaire/1. Bank statements before AMB (June 2021)/"
dict_files_CO = read_list_xlsx(dir_releve_pdf,"1. Origin")
dict_files_XL = read_list_xlsx(dir_releve_pdf,"2. XL")
dict_files_EU = read_list_xlsx(dir_releve_pdf,"3. Eurion")

## Relevé avant 2015

In [None]:
# BRED statements (section "Relevé avant 2015"); iterated as {filename: DataFrame} in the BRED cleaning cell.
dir_releve_pdf = folder_path + "/raw_data/0. Data Releve bancaire/3. BRED/"
dict_files_CC_bred = read_list_xlsx(dir_releve_pdf,"")

## Compte étranger

### AMB

In [None]:
def read_foreign_bank(folder):
    """Load every AMB extract found in one foreign-account sub-folder.

    Parameters
    ----------
    folder : str
        Sub-folder of ".../5. Foreign/2. Extract AMB/", with a trailing slash
        (e.g. "1. AT/"); it is concatenated to the base path as-is.

    Returns
    -------
    pandas.DataFrame
        The rows of sheet "1" of every file (column names taken from row
        index 4), with two extra columns: "IBAN" (cell A4 of the sheet
        header) and "filename". An empty DataFrame when the folder holds no
        file.
    """
    path = folder_path + "/raw_data/0. Data Releve bancaire/5. Foreign/2. Extract AMB/"
    path += folder
    frames = []
    for filename in os.listdir(path):
        df_file = pd.read_excel(path + filename, sheet_name="1", header=4)
        df_headers = pd.read_excel(path + filename, sheet_name="1", nrows=4, usecols='A:A', header=None)
        df_file["IBAN"] = df_headers.loc[3, 0]
        df_file["filename"] = filename
        frames.append(df_file)
    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previous rows at every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
# One AMB extract per country sub-folder (AT, NL, PT).
df_AT_AMB = read_foreign_bank("1. AT/")
df_NL_AMB = read_foreign_bank("2. NL/")
df_PT_AMB = read_foreign_bank("3. PT/")

### Banque

In [None]:
def read_foreign_bank_pdf(folder):
    """Load every bank-provided workbook found in one foreign-account sub-folder.

    Parameters
    ----------
    folder : str
        Sub-folder of ".../5. Foreign/1. Banque/", with a trailing slash; it is
        concatenated to the base path as-is.

    Returns
    -------
    pandas.DataFrame
        The raw content of sheet "Table 1" of every file, read without column
        names (columns are integer positions), plus a "filename" column. An
        empty DataFrame when the folder holds no file.
    """
    path = folder_path + "/raw_data/0. Data Releve bancaire/5. Foreign/1. Banque/"
    path += folder
    frames = []
    for filename in os.listdir(path):
        df_file = pd.read_excel(path + filename, sheet_name="Table 1", header=None)
        df_file["filename"] = filename
        frames.append(df_file)
    # Concatenate once instead of re-copying the accumulated rows per file.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

#### AT

In [None]:
# Austrian accounts: load both funds, then tag each frame with its account label.
df_AT_banque_cc = read_foreign_bank_pdf("1. AT/CO - BNP - AT/")
df_AT_banque_xl = read_foreign_bank_pdf("1. AT/XL - BNP - AT/")
df_AT_banque_cc["Compte"] = "CO - BNP - AT"
df_AT_banque_xl["Compte"] = "XL - BNP - AT"

#### NL

In [None]:
# Dutch accounts: load both funds, then tag each frame with its account label.
df_NL_banque_cc = read_foreign_bank_pdf("2. NL/CO - ING - NL/")
df_NL_banque_xl = read_foreign_bank_pdf("2. NL/XL - ING - NL/")
df_NL_banque_cc["Compte"] = "CO - ING - NL"
df_NL_banque_xl["Compte"] = "XL - ING - NL"

#### PT

In [None]:
# Portuguese accounts: load both funds, then tag each frame with its account label.
df_PT_banque_xl = read_foreign_bank_pdf("3. PT/2. XL/")
df_PT_banque_cc = read_foreign_bank_pdf("3. PT/1. Origin/")

df_PT_banque_xl["Compte"] = "XL - SANTANDER - PT"
df_PT_banque_cc["Compte"] = "CC - SANTANDER - PT"

## ClientOps

### Depuis 062021

Tous les fichiers de suivi des virements de ClientOps sont protégés par un mot de passe, mais seuls certains exigent de saisir ce mot de passe pour être lus par Python.

In [None]:
## Files that can be read without a password
# Each workbook is read as a {sheet name: DataFrame} dict (processed + to-process sheets),
# then merged by the project helper concat_tables_clientOps.
dict_ClientOps_EU = pd.read_excel(folder_path + "/raw_data/1. Virements/ClientOps/Virements_Eurion_2.0.xlsm",\
                                  sheet_name = ["Traités - EURION",'A traiter - EURION'])
df_ClientOps_EU = concat_tables_clientOps(dict_ClientOps_EU)
dict_ClientOps_XL = pd.read_excel(folder_path + "/raw_data/1. Virements/ClientOps/Virements_XL_3.0.xlsm",\
                                  sheet_name = ['A traiter - XL',"Traités - XL"])
df_ClientOps_XL = concat_tables_clientOps(dict_ClientOps_XL)

### File protected by a password (even for read-only access)

# NOTE(review): the password is hard-coded in the notebook; consider reading it from an
# environment variable or an interactive prompt so it is not committed with the code.
passwd = 'yyyy'
path_origin = folder_path+ "/raw_data/1. Virements/ClientOps/Virements_ORIGIN_3.1.xlsm"
dict_ClientOps_CO = read_xlsm(path_origin,"ORIGIN",passwd )
df_ClientOps_CO = concat_tables_clientOps(dict_ClientOps_CO)

# Tag each frame with its fund code, then stack the three funds.
df_ClientOps_EU["Fonds"] = "EU"
df_ClientOps_CO["Fonds"] = "CC"
df_ClientOps_XL["Fonds"] = "XL"
df_ClientOps1 = pd.concat([df_ClientOps_EU,df_ClientOps_CO,df_ClientOps_XL])

### Avant 062021

In [None]:
def read_files(dossier,df_ClientOps,fonds):
    """Append every historical ClientOps workbook of one folder to a DataFrame.

    Parameters
    ----------
    dossier : str
        Sub-folder of ".../ClientOps/Historique/" to scan.
    df_ClientOps : pandas.DataFrame
        Rows to keep in front of the loaded files (an empty DataFrame to start
        from scratch). It is not modified; a new frame is returned.
    fonds : str
        Fund code written in the "Fonds" column of every returned row.

    Returns
    -------
    pandas.DataFrame
        `df_ClientOps` followed by sheet "Feuil1" of every file, with "Fonds"
        set to `fonds`.
    """
    path_clientops_historique = folder_path +'/raw_data/1. Virements/ClientOps/Historique/' + dossier
    frames = [df_ClientOps]
    for file in os.listdir(path_clientops_historique):
        frames.append(pd.read_excel(path_clientops_historique + "/" + file, sheet_name="Feuil1"))
    # Single concat: avoids re-copying the accumulated rows for every file, and
    # always yields a new object so the caller's frame is never mutated.
    df_ClientOps = pd.concat(frames)
    df_ClientOps["Fonds"] = fonds
    return df_ClientOps

# Historical ClientOps tracking files (before 06/2021), one folder per fund.
dossier_cc = "CC/"
dossir_xl = "XL/"
df_ClientOps_CC_2 = read_files(dossier_cc, pd.DataFrame(), "CC")
df_ClientOps_XL_2 = read_files(dossir_xl, pd.DataFrame(), "XL")

# Columns are renamed by position; the two funds do not store
# "Référence de l'ordre" at the same position, hence two different lists.
df_ClientOps_CC_2.columns = [
    'Id sys_ClientOps', 'Libellé_ClientOps', 'Valeur_ClientOps', 'Crédit_ClientOps',
    'Code Associé', 'Souscription soldée', 'Colonne1', "Référence de l'ordre",
    'Colonne2', "Fonds_ClientOps",
]
df_ClientOps_XL_2.columns = [
    'Id sys_ClientOps', 'Libellé_ClientOps', 'Valeur_ClientOps', 'Crédit_ClientOps',
    'Code Associé', "Référence de l'ordre", 'Souscription soldée', 'Colonne1',
    'Colonne2', "Fonds_ClientOps",
]
df_ClientOps2 = pd.concat([df_ClientOps_CC_2, df_ClientOps_XL_2])

### Régul compte à compte

Ce sont les fichiers que nous avons trouvés dans les dossiers archivés. Donc il y a plein de lignes en doublon entre les fichiers.

In [None]:
# Account-to-account regularisation workbook: every sheet except the two listed
# below is loaded, and the sheet name is kept as the direction of the transfer.
# The workbook is opened in a `with` block so its file handle is released once
# the sheets are parsed, and the sheets are concatenated in one call.
feuilles_regul = []
with pd.ExcelFile(folder_path+ "/raw_data/1. Virements/ClientOps/Régul compte à compte/Regularization (starting April 19, 2019).xlsx") as regul_cac_file:
    for sheet_name in regul_cac_file.sheet_names:
        if sheet_name in ('Virements mode emploi', 'XL to CAM'):
            continue
        df_regul = regul_cac_file.parse(sheet_name)
        df_regul["Sens de virement à faire"] = sheet_name
        feuilles_regul.append(df_regul)
df_regul_cac = pd.concat(feuilles_regul)
df_regul_cac = df_regul_cac[["Sens de virement à faire","Reception date","Amount","Associate","Order","Comments"]]
df_regul_cac = df_regul_cac.rename(columns={"Amount":"Amount_RegulClientOps"})

# Nettoyer

## Relevé bancaire

### Depuis 0621

In [None]:
# The IBAN is extracted by get_iban from cell (row 2, column 0) of each header area,
# then the three funds are stacked.
for _releve, _entete in (
    (df_releve_bancaire_CO, df_headers_CO),
    (df_releve_bancaire_EU, df_headers_EU),
    (df_releve_bancaire_XL, df_headers_XL),
):
    _releve["IBAN"] = get_iban(_entete.loc[2, 0], bol_start_collect=True)
df_releve_bancaire1 = pd.concat([df_releve_bancaire_CO, df_releve_bancaire_EU, df_releve_bancaire_XL])

In [None]:
# clean_donnees_releve (project helper) returns the credit lines, the debit lines and a control table.
df_credit1, df_debit1, df_control_donnes_bancaires1 = clean_donnees_releve(df_releve_bancaire1)

In [None]:
# Split the credit lines by transaction type.
_type_tx = df_credit1["Type_Transaction"]
df_type_transaction_NA = df_credit1[_type_tx.isnull()]

# Transfers: type starts with "VIR"; account-to-account transfers are set apart.
df_virements1 = df_credit1[_type_tx.str[0:3] == "VIR"]
_est_cac = df_virements1["Type_Transaction"].str.contains("CPTE A CPTE RECU")
df_virements_cac1 = df_virements1[_est_cac]
df_virements1 = df_virements1[~_est_cac]

# Direct debits, cheque deposits and rejections (missing types never match).
df_prelevements1 = df_credit1[_type_tx.str.contains("PRLV", na=False)]
df_cheques1 = df_credit1[_type_tx.str.contains("REMISE", na=False)]
df_rejets1 = df_credit1[_type_tx.str.contains("REJET", na=False)]

In [None]:
# Detect duplicated transfers (same fund / holder / amount; the exact rule lives in
# the project helper verification_doublons) and set them apart.
liste_colonnes_trier = ["Fonds","Titulaire_clean","Crédit"]
colonne_date = "Valeur"
id_column = "Id sys"
df_vir_doublons1 = verification_doublons(df_virements1,liste_colonnes_trier,colonne_date,id_column)
# Remove companies: holders containing "BANQUE" are not treated as duplicates.
# na=False keeps rows with a missing holder; without it the mask would contain NaN
# and `~` would raise.
est_banque = df_vir_doublons1["Titulaire_clean"].str.contains("BANQUE", na=False)
df_vir_doublons1 = df_vir_doublons1[~est_banque]

# Anti-join: keep only the transfers whose "Id sys" is not among the duplicates.
df_virements1 = df_virements1.merge(df_vir_doublons1[["Id sys"]], on="Id sys", how="outer",indicator=True)
df_virements1 = df_virements1[df_virements1["_merge"]=='left_only'].drop(columns="_merge")

In [None]:
# Add one control row per category: label, number of lines ("Id sys" count)
# and total credited amount.
_controles = [
    ("type transaction non renseigne", df_type_transaction_NA),
    ("total_vir_compte_a_compte", df_virements_cac1),
    ("total_virs_avec_doublons_pendant30jours", df_vir_doublons1),
    ("total_virs_sansCAC_sansDoublons", df_virements1),
    ("total_prelevements", df_prelevements1),
    ("total_cheques", df_cheques1),
    ("rejet", df_rejets1),
]
for _libelle, _df_categorie in _controles:
    df_control_donnes_bancaires1 = control_script(
        df_control_donnes_bancaires1,
        _libelle,
        _df_categorie["Id sys"].count(),
        _df_categorie["Crédit"].sum(),
    )

In [None]:
# Sanity check: kept transfers + duplicates set apart (displayed as the cell output).
len(df_virements1) + len(df_vir_doublons1)

In [None]:
# Export the credit and debit lines since 06/2021.
# NOTE(review): index=False is not passed here, unlike the later exports, so the DataFrame
# index is written as the first column - confirm this is intended.
df_credit1.to_excel(folder_path + "/transformed_data/2. Banque/1.0 Depuis 0621_Releve_lignes de crédit.xlsx",sheet_name="Crédit")
df_debit1.to_excel(folder_path +  "/transformed_data/2. Banque/1.0 Depuis 0621_Releve_lignes de débit.xlsx",sheet_name="Débit")

### Avant 0621

#### BRED

In [None]:
### Split every file into several tables; each table is processed separately because the
### tables are all shifted, and some of them are not relevant to us.
dict_files = {}
for filename, df_releve_pdf in dict_files_CC_bred.items():
    dict_files = splitter_fichier(dict_files, df_releve_pdf, filename)

### Format each table and concatenate them; the former index becomes an "index" column.
df_releve_bred = concat_fichiers_clean_with_list(
    dict_files,
    dict_files.keys(),
    bol_spec_file=False,
    bol_bred=True,
)
df_releve_bred = df_releve_bred.reset_index()

In [None]:
####### Drop the rows that are not transactions
# na=False keeps rows whose DEBIT is missing instead of putting NaN in the mask
# (a NaN in the mask makes `~` raise).
df_releve_bred = df_releve_bred[~df_releve_bred["DEBIT"].str.contains("EUROS", na=False)]
# .copy(): the columns are overwritten below, so work on an independent frame.
df_releve_bred = df_releve_bred[df_releve_bred["DATE COMPTABLE"]!= "TOTAL DES MOUVEMEN"].copy()

### Format amounts and value dates
# Columns are replaced with df[col] = ... rather than df.loc[:, col] = ...: since
# pandas 2.0 the .loc form writes into the existing object column, so the column
# would keep the object dtype and the later `.dt` access on "DATE DE VALEUR" fails.
# Amounts: "." is dropped and "," becomes the decimal point.
df_releve_bred["CREDIT"] = (
    df_releve_bred["CREDIT"]
    .str.replace(".", "", regex=False)
    .str.replace(",", ".", regex=False)
    .astype(float)
)
# NOTE(review): DEBIT also strips whitespace while CREDIT does not - confirm this is intended.
df_releve_bred["DEBIT"] = (
    df_releve_bred["DEBIT"]
    .str.replace(r"\.|\s", "", regex=True)
    .str.replace(",", ".", regex=False)
    .astype(float)
)
# Value dates come in two layouts; the second format fills what the first could not parse.
valeur_brute = df_releve_bred["DATE DE VALEUR"]
df_releve_bred["DATE DE VALEUR"] = pd.to_datetime(
    valeur_brute, format="%Y-%m-%d %H:%M:%S", errors="coerce"
).fillna(pd.to_datetime(valeur_brute, format='%d.%m.%y', errors="coerce"))

def clean_date_comptable(str_date):
    """Normalise a "day.month" string to two digits on each side.

    Only strings made of exactly two "."-separated parts are touched; anything
    else is returned unchanged. A one-character day gets a leading zero
    ("5" -> "05"); a one-character month gets a TRAILING zero ("1" -> "10").

    NOTE(review): the trailing zero presumably restores a "dd.10" value that
    lost its last digit when read as a number - confirm.
    """
    morceaux = str_date.split(".")
    if len(morceaux) != 2:
        return str_date
    jour, mois = morceaux
    if len(jour) == 1:
        jour = "0" + jour
    if len(mois) == 1:
        mois = mois + "0"
    return f"{jour}.{mois}"
# Columns are replaced with df[col] = ... rather than df.loc[:, col] = ...: since
# pandas 2.0 the .loc form writes into the existing column and keeps its object dtype.
df_releve_bred["DATE COMPTABLE"] = df_releve_bred["DATE COMPTABLE"].apply(clean_date_comptable)
# The accounting date has no year: borrow the year of the value date.
# pd.to_datetime is a no-op on a datetime column and makes `.dt` safe if the
# column still holds Timestamp objects with an object dtype.
# NOTE(review): around New Year the value date may fall in a different year than the
# accounting date - confirm this approximation is acceptable.
annee_valeur = pd.to_datetime(df_releve_bred["DATE DE VALEUR"]).dt.to_period("Y").astype(str)
df_releve_bred["DATE COMPTABLE"] = df_releve_bred["DATE COMPTABLE"] + "." + annee_valeur
df_releve_bred["DATE COMPTABLE"] = pd.to_datetime(df_releve_bred["DATE COMPTABLE"],format= "%d.%m.%Y")
df_releve_bred["Fonds"] = "CC"
# Align the BRED column names with those used for the other statements.
dict_columns = {"index":"Id sys",
                "DATE COMPTABLE" : "Date",
                "NATURE DES OPERATIONS": "Libellé",
                "DATE DE VALEUR":"Valeur",
                "DEBIT":"Débit",
                "CREDIT":"Crédit"}
df_releve_bred = df_releve_bred.rename(columns=dict_columns)
# Collapse runs of whitespace and strip accents from the label.
df_releve_bred["Libellé"] = df_releve_bred["Libellé"].str.replace(r'\s+', ' ', regex=True).apply(unidecode)

In [None]:
def type_transaction_bred(libelle):
    """Classify a BRED statement label from its prefix.

    Returns "VIREMENT" when the label starts with "VIREMENT" or "EUROVIRE",
    "REMISE CHEQUE" when it starts with "REMISE", and "Autre" otherwise.
    """
    if libelle.startswith(("VIREMENT", "EUROVIRE")):
        return "VIREMENT"
    if libelle.startswith("REMISE"):
        return "REMISE CHEQUE"
    return "Autre"
# Classify every BRED line from its label.
df_releve_bred["Type_Transaction"] = df_releve_bred["Libellé"].apply(type_transaction_bred)

In [None]:
# A line is a credit (resp. debit) when its credit (resp. debit) amount is filled in.
df_credit_bred = df_releve_bred[df_releve_bred["Crédit"].notnull()]
df_debit_bred = df_releve_bred[df_releve_bred["Débit"].notnull()]

In [None]:
# Credited transfers only. .copy() because columns are added and modified below:
# writing into a filtered slice raises SettingWithCopyWarning.
df_virements_bred = df_credit_bred[df_credit_bred["Type_Transaction"]=="VIREMENT"].copy()
df_virements_bred["Libellé_clean"] = df_virements_bred["Libellé"].apply(get_words_only).str.strip()

# Refine the transaction type from the cleaned label. The rules are applied in
# order, so a later rule overrides an earlier one on the same row.
# na=False: a missing label matches no rule (a NaN in the mask would make .loc raise).
libelle_clean = df_virements_bred["Libellé_clean"]
regles_type = [
    (libelle_clean.isin(["VIREMENT COMPTE A TERME","VIREMENT","VIREMENT EUROPEEN RECU"]), "VIREMENT COMPTE A COMPTE"),
    (libelle_clean.str.contains("VIREMENT COMPENSE", na=False), "VIREMENT COMPTE A COMPTE"),
    (libelle_clean.str.contains("LOYER", na=False), "VIREMENT LOYER"),
    (libelle_clean.str.contains("FACTURE", na=False), "VIREMENT AUTRE"),
]
for mask, type_transaction in regles_type:
    df_virements_bred.loc[mask, "Type_Transaction"] = type_transaction

In [None]:
# Project helper; presumably derives "Titulaire_clean" from "Libellé_clean" using the BIC list - confirm.
df_virements_bred = clean_libelle_to_titulaire(df_virements_bred,"Titulaire_clean","Libellé_clean",list_bic)

In [None]:
# Export the BRED transfers (index not written).
df_virements_bred.to_excel(folder_path+ "/transformed_data/2. Banque/BRED_Virements.xlsx",sheet_name="Virements", index=False)

In [None]:
# df_credit_bred.to_excel(folder_path + "/transformed_data/2. Banque/BRED_Releve_lignes de crédit.xlsx",sheet_name="Crédit", index=False)
# df_debit_bred.to_excel(folder_path+"/transformed_data/2. Banque/BRED_Releve_lignes de débit.xlsx",sheet_name="Débit", index=False)

#### Avant 2020

In [None]:
### Data before 2020
# Same three steps per fund: clean the files (clean_releve_pdf), tag the fund code,
# build the quality-control table (controle_data_quality). Both are project helpers.
df_releve_CO = clean_releve_pdf(dict_files_CO)
df_releve_CO["Fonds"] = "CC"
df_control_releve_CO = controle_data_quality(df_releve_CO,"CC")

df_releve_XL = clean_releve_pdf(dict_files_XL)
df_releve_XL["Fonds"] = "XL"
df_control_releve_XL = controle_data_quality(df_releve_XL,"XL")

df_releve_EU = clean_releve_pdf(dict_files_EU)
df_releve_EU["Fonds"] = "EU"
df_control_releve_EU = controle_data_quality(df_releve_EU,"EU")

#### Compte prélèvement de CC entre 2017 et 2021

In [None]:
# CC direct-debit account: these files use two-digit-year dates ("%d.%m.%y") and the
# bol_spec_file variant of the cleaning helper.
df_releve_CC_prlv = clean_releve_pdf(dict_files_CO_prlv,date_format = "%d.%m.%y" ,bol_spec_file=True)
df_releve_CC_prlv["Fonds"] = "CC"

In [None]:
# Number of lines read from the CC direct-debit account.
nb_prlv_cc = len(df_releve_CC_prlv)

#### Entre 2020 et 062021

In [None]:
### Data between 2020 and 06/2021
# Same three steps per fund as for the older data; XL and CC use the two-digit-year
# date format and the bol_spec_file variant of the cleaning helper.
df_releve_EU_3 = clean_releve_pdf(dict_files_EU_3)
df_releve_EU_3["Fonds"] = "EU"
df_control_releve_EU_3 = controle_data_quality(df_releve_EU_3,"EU")

df_releve_XL_3 = clean_releve_pdf(dict_files_XL_3,date_format = "%d.%m.%y" ,bol_spec_file=True)
df_releve_XL_3["Fonds"] = "XL"
df_control_releve_XL_3 = controle_data_quality(df_releve_XL_3,"XL")

df_releve_CC_3 = clean_releve_pdf(dict_files_CC_3,date_format = "%d.%m.%y" ,bol_spec_file=True)
df_releve_CC_3["Fonds"] = "CC"
df_control_releve_CC_3 = controle_data_quality(df_releve_CC_3,"CC")

#### The recent data starts on 18/06/2021 and is cleaner to use, so these files are cut just before.
# NOTE: the control tables above were computed before this cut-off filter.
df_releve_EU_3 = df_releve_EU_3[df_releve_EU_3["Date"]< "2021-06-18"]
df_releve_XL_3 = df_releve_XL_3[df_releve_XL_3["Date"]< "2021-06-18"]
df_releve_CC_3 = df_releve_CC_3[df_releve_CC_3["Date"]< "2021-06-18"]

#### Concat avant 062021

In [None]:
# Stack every statement before 06/2021. The former index of each frame becomes the
# "Id sys" column.
# NOTE(review): if the individual frames share index values, "Id sys" is not unique across
# frames, which matters for the later merge on "Id sys" - confirm against clean_releve_pdf.
_releves_avant_0621 = [
    df_releve_CC_prlv,
    df_releve_CO, df_releve_XL, df_releve_EU,
    df_releve_CC_3, df_releve_XL_3, df_releve_EU_3,
]
df_releve_pdf = pd.concat(_releves_avant_0621).reset_index()
df_control_releve = pd.concat([
    df_control_releve_CO, df_control_releve_XL, df_control_releve_EU,
    df_control_releve_CC_3, df_control_releve_XL_3, df_control_releve_EU_3,
])
df_releve_pdf["Type_Transaction"] = df_releve_pdf["Libellé"].apply(get_type_transaction)
df_releve_pdf = df_releve_pdf.rename(columns={"index": "Id sys"})

# A line is a credit (resp. debit) when its credit (resp. debit) amount is filled in.
df_credit2 = df_releve_pdf[df_releve_pdf["Crédit"].notnull()]
df_debit2 = df_releve_pdf[df_releve_pdf["Débit"].notnull()]

In [None]:
# Split the credit lines by transaction type.
_type_tx2 = df_credit2["Type_Transaction"]
df_virement2 = df_credit2[_type_tx2 == "VIREMENT"]
df_prlv2 = df_credit2[_type_tx2 == "PRLV"]
df_cheque2 = df_credit2[_type_tx2 == "REMISE CHEQUE"]
df_vir_CAC2 = df_credit2[_type_tx2 == "VIRT CPTE A CPTE"]
df_rejet2 = df_credit2[_type_tx2 == "REJET"]
df_autre2 = df_credit2[_type_tx2 == "Autre"]

In [None]:
# Regexes that pull the payer ("titulaire") and the two free-text motives out of the label.
patern_titulaire = r'FRM (.+?)\/|DE (.+?)\/'
patern_motif1 = r'EID (.+?)\/|MOTIF (.+?)\/'
patern_motif2 = r'RNF (.*)'
patern_clean_titulaire = r'([a-zA-Z\s\-\.\d\&\-\+]+)'

# df_virement2 is a filtered slice of df_credit2: take an explicit copy so the many
# column assignments below do not raise SettingWithCopyWarning.
df_virement2 = df_virement2.copy()

df_virement2["Libellé_origin"] = df_virement2["Libellé"]
libelle = df_virement2["Libellé"].str.upper()
# Literal (non-regex) clean-ups, applied in this exact order. regex=False is now explicit
# everywhere, so the result does not depend on the pandas default.
# NOTE(review): the 'M/1\/...' fragments contain a literal backslash, and 'M/1\/lME' has a
# lowercase "l" although the label was upper-cased just above, so these three probably
# never match - confirm the intended fragments (they are kept unchanged here).
remplacements_litteraux = [
    ("IRNF", "/RNF"),
    (r'M/1\/1', ""),
    (r'M/1\/lME', ""),
    (r'M/1\/LME', ""),
    ('1/', ""),
    ('□', "D"),
    ('S.C.I', "SCI"),
    ('0MR', ""),
]
for ancien, nouveau in remplacements_litteraux:
    libelle = libelle.str.replace(ancien, nouveau, regex=False)

# Collapse runs of whitespace.
df_virement2["Libellé"] = libelle.replace(r'\s+', ' ', regex=True).str.strip()

# Payer: first alternative that matched, then only the leading run of allowed characters.
df_result = df_virement2['Libellé'].str.extract(patern_titulaire)
df_virement2["Titulaire"] = df_result[0].combine_first(df_result[1])
df_virement2["Titulaire_clean"] = df_virement2['Titulaire'].str.extract(patern_clean_titulaire)
df_virement2["Titulaire_clean"] = df_virement2["Titulaire_clean"].str.strip()

# Motives: "EID"/"MOTIF" section and "RNF" section of the label.
df_result = df_virement2['Libellé'].str.extract(patern_motif1)
df_virement2["Motif1"] = df_result[0].combine_first(df_result[1])
df_virement2["Motif1"] = df_virement2['Motif1'].apply(get_words_only)
df_virement2["Motif2"] = df_virement2['Libellé'].str.extract(patern_motif2)
df_virement2["Motif2"] = df_virement2['Motif2'].apply(get_words_only)

df_virement2["Titulaire_clean"] = df_virement2["Titulaire_clean"].apply(clean_name)

# Same cleaning chain for both motives; the final replace removes the "NAN" text
# left where the value was missing (the original ran it twice on Motif2, which
# is idempotent).
for colonne_motif in ("Motif1", "Motif2"):
    motif = df_virement2[colonne_motif].apply(clean_motif).apply(clean_name)
    df_virement2[colonne_motif] = motif.str.replace("NAN", "", regex=False)

In [None]:
# Separate the transfers whose holder could not be extracted from the others.
_titulaire_manquant = df_virement2["Titulaire_clean"].isnull()
df_virement_titulaire_nul = df_virement2[_titulaire_manquant]
df_virement_titulaire_NotNull = df_virement2[~_titulaire_manquant]

In [None]:
# Fallback when no holder was extracted: rebuild one from the whole label.
df_virement_titulaire_nul["Titulaire"] = df_virement_titulaire_nul["Libellé"].apply(get_words_only)
df_virement_titulaire_nul.loc[:, "Titulaire_clean"] = (
    df_virement_titulaire_nul.loc[:, "Titulaire"].apply(clean_motif)
)
df_virement_titulaire_nul.loc[:, "Titulaire_clean"] = (
    df_virement_titulaire_nul.loc[:, "Titulaire_clean"].apply(clean_name)
)
# remove_de is applied five times in a row (presumably it strips one occurrence per pass - confirm).
for _ in range(5):
    df_virement_titulaire_nul.loc[:, "Titulaire_clean"] = (
        df_virement_titulaire_nul.loc[:, "Titulaire_clean"].apply(remove_de)
    )
df_virement_titulaire_nul["Titulaire_clean"] = (
    df_virement_titulaire_nul["Titulaire_clean"].apply(remove_duplicated)
)

In [None]:
# Put the two subsets back together (rebuilt holders first).
df_virements2 = pd.concat([df_virement_titulaire_nul,df_virement_titulaire_NotNull])

In [None]:
# Transfers coming from the listed holders are moved to the account-to-account
# set and removed from the regular transfers.
liste_a_exclure = ['aaaaa','bbbbb']
_est_exclu = df_virements2["Titulaire_clean"].isin(liste_a_exclure)
_colonnes_cac = ['Id sys', 'Date', 'Libellé', 'Valeur', 'Débit', 'Crédit',
                 'Fonds', 'Periode', 'Annee', 'Type_Transaction']
df_vir_cac2 = df_virements2[_est_exclu][_colonnes_cac]
df_vir_CAC2 = pd.concat([df_vir_CAC2, df_vir_cac2])
df_virements2 = df_virements2[~_est_exclu]

In [None]:
# Export every data set before 06/2021: (frame, file name, sheet name), index not written.
_dossier_banque = folder_path + "/transformed_data/2. Banque/"
_exports = [
    (df_credit2, "2. Données avant 0621_lignes de crédit.xlsx", "crédit"),
    (df_debit2, "2. Données avant 0621_lignes de débit.xlsx", "débit"),
    (df_virements2, "2. Données avant 0621_Virements sans CAC.xlsx", "Sheet1"),
    (df_vir_CAC2, "2. Données avant 0621_Virements Compte a Compte.xlsx", "Sheet1"),
    (df_prlv2, "2. Données avant 0621_Prélèvements.xlsx", "Sheet1"),
    (df_cheque2, "2. Données avant 0621_Chèques.xlsx", "Sheet1"),
    (df_rejet2, "2. Données avant 0621_Rejets.xlsx", "Sheet1"),
    (df_autre2, "2. Données avant 0621_Autres.xlsx", "Sheet1"),
]
for _df_export, _nom_fichier, _nom_feuille in _exports:
    _df_export.to_excel(_dossier_banque + _nom_fichier, sheet_name=_nom_feuille, index=False)

In [None]:
# Detect duplicated transfers (same fund / holder / amount; the exact rule lives in
# the project helper verification_doublons).
liste_colonnes_trier = ["Fonds","Titulaire_clean","Crédit"]
colonne_date = "Valeur"
id_column = "Id sys"
df_vir_doublons2 = verification_doublons(df_virements2,liste_colonnes_trier,colonne_date,id_column)
# Remove companies: holders containing "BANQUE" are not treated as duplicates.
# na=False keeps rows with a missing holder; without it the mask would contain NaN
# and `~` would raise.
est_banque = df_vir_doublons2["Titulaire_clean"].str.contains("BANQUE", na=False)
df_vir_doublons2 = df_vir_doublons2[~est_banque]

In [None]:
# Anti-join: keep only the transfers whose "Id sys" is not among the duplicates.
_avec_indicateur = df_virements2.merge(
    df_vir_doublons2[["Id sys"]],
    on="Id sys",
    how="outer",
    indicator=True,
)
_gauche_seulement = _avec_indicateur["_merge"] == 'left_only'
df_virements2 = _avec_indicateur[_gauche_seulement].drop(columns="_merge")

In [None]:
# Only the label is needed, so apply on the column instead of row by row:
# same result, no per-row Series construction, and well defined on an empty frame.
df_virements2["Product_motif"] = df_virements2["Libellé"].apply(get_product_motif)

In [None]:
# Sanity check: kept transfers + duplicates set apart, before 06/2021 (cell output).
len(df_virements2) + len(df_vir_doublons2)

In [None]:
# Same check for the data since 06/2021 (cell output).
len(df_virements1) + len(df_vir_doublons1)

### Comptes étrangers

#### AMB

In [None]:
# Stack the AMB extracts of the three countries.
df_AMB = pd.concat([df_AT_AMB, df_NL_AMB, df_PT_AMB])

In [None]:
# The header cell stored in "IBAN" holds "<account label>Collecte<iban>": split it.
_morceaux_iban = df_AMB["IBAN"].apply(lambda x: x.split("Collecte"))
df_AMB["Compte"] = _morceaux_iban.apply(lambda m: m[0])
df_AMB["IBAN"] = _morceaux_iban.apply(lambda m: m[1])

df_AMB["IBAN"] = df_AMB["IBAN"].str.replace("EUR","").str.replace("-"," ").str.strip()

# The account label reads "<fund> - <bank> - <country>".
df_AMB["Compte"] = df_AMB["Compte"].str.replace("Compte :","")
_morceaux_compte = df_AMB["Compte"].apply(lambda x: x.split("-"))
df_AMB["Banque"] = _morceaux_compte.apply(lambda m: m[1]).str.strip()
df_AMB["Fonds"] = _morceaux_compte.apply(lambda m: m[0]).str.strip().str.replace("CO","CC")
df_AMB["Pays"] = _morceaux_compte.apply(lambda m: m[2]).str.strip()

In [None]:
# Align the AMB column names with the other statements, then drop the lines
# without a value date; the former index is kept in an "index" column.
_renommage_amb = {
    "Date d'opération": "Date",
    'Date de valeur': "Valeur",
    'Référence': 'Ref.',
    'Référence 2': 'Ref.2',
    'Libellé code opération': 'Type_Transaction',
}
df_AMB = df_AMB.rename(columns=_renommage_amb)
df_AMB = df_AMB[df_AMB["Valeur"].notnull()].reset_index()

In [None]:
# Line identifier: file name without ".xlsx" + "_" + position of the line in that file.
df_AMB["Id sys"] = df_AMB["filename"].str.replace(".xlsx","",regex=False) + "_" + df_AMB["index"].astype(str)

In [None]:
# Enforce numeric debit amounts and datetime dates (ISO "%Y-%m-%d" layout).
df_AMB["Débit"] = df_AMB["Débit"].astype(float)
df_AMB["Date"] = pd.to_datetime(df_AMB["Date"],format = "%Y-%m-%d")
df_AMB["Valeur"] = pd.to_datetime(df_AMB["Valeur"],format = "%Y-%m-%d")

In [None]:
# "index" and "filename" are only needed to build "Id sys".
df_AMB = df_AMB.drop(columns=["index","filename"])

#### Extraction de la banque : AT_CC

In [None]:
# Keep the position inside each file as "index_file", and add a global running "index".
df_AT_banque_cc = df_AT_banque_cc.reset_index().rename(columns={"index":"index_file"}).reset_index()

In [None]:
### Retrieve the "Previous page" amounts: the last space-separated token of the row text
_est_previous = df_AT_banque_cc[0].str.contains("Previous", na=False)
df_previous = df_AT_banque_cc[_est_previous]
df_previous["Balance"] = df_previous[0].apply(lambda x: x.rsplit(" ", 1)[-1])
df_previous = df_previous[["index", "Balance"]]

In [None]:
### Drop the empty rows
# Main rows: column 2 is filled and is not the "E-mail" footer.
# na=False (as in the neighbouring masks): a non-text cell is kept instead of
# putting NaN in the mask, which would make `~` raise.
df_AT_1 = df_AT_banque_cc[~df_AT_banque_cc[2].isnull()]
df_AT_1 = df_AT_1[~df_AT_1[2].str.contains("E-mail", na=False)]
# Continuation rows: among the remaining rows, those whose column 0 holds an
# "Int. Ref" or "BEN" detail; that text is moved to column 2.
df_AT_2 = df_AT_banque_cc[~df_AT_banque_cc["index"].isin(df_AT_1["index"])]
df_AT_2 = df_AT_2[df_AT_2[0].str.contains("Int. Ref|BEN",na=False)]
df_AT_2 = df_AT_2.drop(columns=2).rename(columns={0:2})

df_AT_banque_cc = pd.concat([df_AT_1,df_AT_2]).sort_values(by="index")

df_AT_banque_cc = df_AT_banque_cc.rename(columns={1:"Date",
                               2:"Libellé",
                               4:"Valeur"
                               })

In [None]:
### Find the Balance column
# The balance is the first filled cell when scanning columns 9 down to 5; rows
# with none of them filled get NaN. The column is built in one pass instead of
# writing cell by cell with .loc into a float column, which triggers the
# incompatible-dtype deprecation as soon as a balance is text.
colonnes_balance = [9, 8, 7, 6, 5]
df_AT_banque_cc["Balance"] = [
    next((row[col] for col in colonnes_balance if str(row[col]) != "nan"), np.nan)
    for _, row in df_AT_banque_cc.iterrows()
]

In [None]:
### Extract the information carried by the file name ("_"-separated fields)
df_AT_banque_cc["IBAN"] = df_AT_banque_cc["filename"].apply(lambda nom: nom.split("_")[2])
df_AT_banque_cc["filename_org"] = df_AT_banque_cc["filename"]
# Keep the fields from the 5th one on, without the extension ...
df_AT_banque_cc["filename"] = df_AT_banque_cc["filename"].apply(
    lambda nom: "_".join(nom.split("_")[4:]).split(".")[0]
)
# ... whose second field starts with the 4-digit year.
df_AT_banque_cc["year"] = df_AT_banque_cc["filename"].apply(lambda nom: nom.split("_")[1][0:4])

### Forward-fill dates and balance on the continuation rows
for _colonne in ("Date", "Valeur", "Balance"):
    df_AT_banque_cc[_colonne] = df_AT_banque_cc[_colonne].ffill(axis=0)

In [None]:
### Concatenate the labels: one row per transaction, its label being the
### space-joined text of the main row and its continuation rows
_cles_transaction = ["Date", "Valeur", "filename", "filename_org", "Compte", "IBAN", "year", "Balance"]
_agregats = {
    "index": "first",
    "index_file": "first",
    "Libellé": " ".join,
}
df_AT_banque_cc = (
    df_AT_banque_cc
    .groupby(by=_cles_transaction)
    .agg(_agregats)
    .reset_index()
    .sort_values(by="index")
)

### Build the Id sys: original file name + position of the first row in that file
df_AT_banque_cc["Id sys"] = df_AT_banque_cc["filename_org"] + "_" + df_AT_banque_cc["index_file"].astype(str)

In [None]:
### Append the year (taken from the file name) to both dates
for _colonne_date in ("Date", "Valeur"):
    df_AT_banque_cc[_colonne_date] = df_AT_banque_cc[_colonne_date] + "/" + df_AT_banque_cc["year"]

df_AT_banque_cc = df_AT_banque_cc.drop(columns=["year", "filename_org"])

In [None]:
def find_previous_value(list_index_banque,list_index_previous):
    """Pair each row index with the "previous" indices lying strictly between it and the next row index.

    Parameters
    ----------
    list_index_banque : list
        Row indices, in the order in which consecutive pairs must be formed.
    list_index_previous : list
        Indices of the "previous" marker rows.

    Returns
    -------
    pandas.DataFrame
        Columns "index" (the row index) and "previous_values" (a marker index
        strictly between that row index and the following one). The last
        element of `list_index_banque` has no successor, so it never appears.
    """
    paires = [
        (idx_banque, idx_previous)
        for idx_banque, idx_suivant in zip(list_index_banque, list_index_banque[1:])
        for idx_previous in list_index_previous
        if idx_banque < idx_previous < idx_suivant
    ]
    return pd.DataFrame(paires, columns=['index', 'previous_values'])

In [None]:
####### Match the previous balance (previous_balance) to check that no line was lost
# Both frames are sorted by the global "index" so that find_previous_value can pair each
# transaction with the "Previous" marker rows located between it and the next transaction.
df_AT_banque_cc = df_AT_banque_cc.sort_values(by="index")
df_previous = df_previous.sort_values(by="index")
list_index_banque = df_AT_banque_cc["index"].to_list()
list_index_previous = df_previous["index"].to_list()
df_previous_values = find_previous_value(list_index_banque,list_index_previous)

# Attach the marker index, then the marker balance ("Balance_previous").
df_previous = df_previous.rename(columns = {"Balance":"Balance_previous","index":"previous_values"})
df_AT_banque_cc = df_AT_banque_cc.merge(df_previous_values,on="index",how="outer").merge(df_previous,on="previous_values",how="outer")
# The outer merges can add marker-only rows; they have no Date and are dropped here.
df_AT_banque_cc = df_AT_banque_cc[~df_AT_banque_cc["Date"].isnull()]
df_AT_banque_cc = df_AT_banque_cc.sort_values(by="index")

In [None]:
def balance_valid(balance, balance_previous):
    """Tell whether a row's balance agrees with its matched previous balance.

    A row without a previous balance (its string form is ``"nan"``) is
    considered valid; otherwise both values must compare equal.
    """
    if str(balance_previous) == "nan":
        return True
    return bool(balance_previous == balance)

In [None]:
df_AT_banque_cc["Valid"] = df_AT_banque_cc.apply(lambda f : balance_valid(f["Balance"],f["Balance_previous"]), axis=1)

In [None]:
df_AT_banque_cc[df_AT_banque_cc["Valid"]==False]

#### Extraction de la banque : AT_XL

In [None]:
df_AT_banque_xl0 = df_AT_banque_xl

In [None]:
df_AT_banque_xl = df_AT_banque_xl0

In [None]:
len(df_AT_banque_xl)

In [None]:
df_AT_banque_xl = df_AT_banque_xl.reset_index().rename(columns={"index":"index_file"}).reset_index()

In [None]:
# Keep the lines mentioning "Previous" and take the last space-separated token as the balance.
# .copy() makes df_previous an independent frame, so adding a column does not
# trigger SettingWithCopyWarning on a slice of df_AT_banque_xl.
df_previous = df_AT_banque_xl[df_AT_banque_xl[0].str.contains("Previous",na=False)].copy()
df_previous["Balance"] = df_previous[0].apply(lambda x : x.split(" ")[-1])
df_previous = df_previous[["index","Balance"]]

In [None]:
### Filter out the empty rows
# df_AT_1: rows with a value in column 2, except those containing "E-mail"
df_AT_1 = df_AT_banque_xl[~df_AT_banque_xl[2].isnull()]
df_AT_1 = df_AT_1[~df_AT_1[2].str.contains("E-mail")]
# df_AT_2: remaining rows whose column 0 contains "Int. Ref" or "BEN";
# their column 0 is moved into column 2 so both subsets share the same layout
df_AT_2 = df_AT_banque_xl[~df_AT_banque_xl["index"].isin(df_AT_1["index"])]
df_AT_2 = df_AT_2[df_AT_2[0].str.contains("Int. Ref|BEN",na=False)]
df_AT_2 = df_AT_2.drop(columns=2).rename(columns={0:2})
df_AT_banque_xl = pd.concat([df_AT_1,df_AT_2]).sort_values(by="index")

# Column 4 falls back on column 5 when empty
df_AT_banque_xl[4] = df_AT_banque_xl[4].fillna(df_AT_banque_xl[5])

df_AT_banque_xl = df_AT_banque_xl.rename(columns={1:"Date",
                               2:"Libellé",
                               4:"Valeur"
                               })

In [None]:
### Find the Balance value of each row
# Scan columns 11 down to 6 and keep the first cell whose string form is not "nan"
# (NaN when every cell is empty). Building the column in one pass replaces the
# iterrows loop with per-cell .loc writes, which was slow and wrote strings
# into a column first created as float.
balance_columns = [11,10,9,8,7,6]
df_AT_banque_xl["Balance"] = [
    next((value for value in row if str(value) != "nan"), np.nan)
    for row in df_AT_banque_xl[balance_columns].itertuples(index=False, name=None)
]

In [None]:
### Extract the fields encoded in the file name
# IBAN = third "_"-separated token of the file name
df_AT_banque_xl["IBAN"] = df_AT_banque_xl["filename"].apply(lambda x : x.split("_")[2])
# Keep the untouched name: it is used later to build "Id sys"
df_AT_banque_xl["filename_org"] = df_AT_banque_xl["filename"]
# Shortened name = tokens from the fifth one onward, cut at the first "."
df_AT_banque_xl["filename"] = df_AT_banque_xl["filename"].apply(lambda x : "_".join(x.split("_")[4:]).split(".")[0])
# year = first four characters of the second token of the shortened name
df_AT_banque_xl["year"] = df_AT_banque_xl["filename"].apply(lambda x: x.split("_")[1][0:4])

### Forward-fill the dates and the balance onto the following rows
df_AT_banque_xl["Date"] = df_AT_banque_xl["Date"].ffill(axis=0)
df_AT_banque_xl["Valeur"] = df_AT_banque_xl["Valeur"].ffill(axis=0)
df_AT_banque_xl["Balance"] = df_AT_banque_xl["Balance"].ffill(axis=0)

In [None]:
### Concatenate the labels: rows sharing the same keys are merged into one,
### their "Libellé" joined with a space and the first "index"/"index_file" kept
df_AT_banque_xl = df_AT_banque_xl.groupby(by=["Date","Valeur","filename","filename_org","Compte","IBAN","year","Balance"]).agg({
                                                                                 "index":"first",
                                                                                 "index_file":"first",
                                                                                 "Libellé":" ".join
                                                                                 }).reset_index()
# groupby reorders rows by key; restore the original order
df_AT_banque_xl = df_AT_banque_xl.sort_values(by="index")

### Build "Id sys" = original file name + "_" + row position in the file
df_AT_banque_xl["Id sys"] = df_AT_banque_xl["filename_org"] + "_" + df_AT_banque_xl["index_file"].astype(str)

### Append the year to the dates
df_AT_banque_xl["Date"] = df_AT_banque_xl["Date"] + "/" + df_AT_banque_xl["year"]
df_AT_banque_xl["Valeur"] = df_AT_banque_xl["Valeur"] + "/" + df_AT_banque_xl["year"]

df_AT_banque_xl = df_AT_banque_xl.drop(columns=["year","filename_org"])

In [None]:
df_AT_banque_xl = df_AT_banque_xl.sort_values(by="index")

In [None]:
df_previous = df_previous.sort_values(by="index")

In [None]:
# Match the "previous balance" lines with the bank rows (same check as for the CC account)
df_AT_banque_xl = df_AT_banque_xl.sort_values(by="index")
df_previous = df_previous.sort_values(by="index")
list_index_banque = df_AT_banque_xl["index"].to_list()
list_index_previous = df_previous["index"].to_list()
df_previous_values = find_previous_value(list_index_banque,list_index_previous)

# Attach the matched "previous balance" index, then the balance value it carries
df_previous = df_previous.rename(columns = {"Balance":"Balance_previous","index":"previous_values"})
df_AT_banque_xl = df_AT_banque_xl.merge(df_previous_values,on="index",how="outer").merge(df_previous,on="previous_values",how="outer")
# Rows added by the outer merges have no "Date" and are dropped
df_AT_banque_xl = df_AT_banque_xl[~df_AT_banque_xl["Date"].isnull()]

In [None]:
df_AT_banque_xl["Valid"] = df_AT_banque_xl.apply(lambda f : balance_valid(f["Balance"],f["Balance_previous"]), axis=1)

In [None]:
df_AT_banque_xl[df_AT_banque_xl["Valid"]==False]

#### Extraction de la banque : concat_AT

In [None]:
df_AT_banque = pd.concat([df_AT_banque_cc,df_AT_banque_xl])

In [None]:
df_AT_banque["Date"] = pd.to_datetime(df_AT_banque["Date"],format="%d/%m/%Y")
df_AT_banque["Valeur"] = pd.to_datetime(df_AT_banque["Valeur"],format="%d/%m/%Y")
df_AT_banque = df_AT_banque.drop(columns=["index_file"]).reset_index(drop=True)

In [None]:
### Compute the movement between consecutive balances to tell debits from credits
# Remove "." then turn "," into "." before casting to float
# (assumes balances are written like "1.234,56" — TODO confirm).
# regex=False keeps "." a literal dot whatever the pandas version.
df_AT_banque["Balance"] = df_AT_banque["Balance"].str.replace(".","",regex=False).str.replace(",",".",regex=False).astype(float)
df_AT_banque = df_AT_banque.sort_values(by=["Compte","index"]) #### order the rows per account before computing the movement
# Shift inside each account: a plain shift(1) would compare the first row of an
# account with the last balance of the previous account. The first row of each
# account has no predecessor and is compared with 0.
df_AT_banque["Balance_N_1"] = df_AT_banque.groupby("Compte")["Balance"].shift(1).fillna(0)
df_AT_banque["Mouvement"] = df_AT_banque["Balance"] - df_AT_banque["Balance_N_1"]

# Negative movement -> debit, positive movement -> credit, zero -> neither
df_AT_banque["Débit"] = df_AT_banque["Mouvement"].apply(lambda x : x if x<0 else np.nan)
df_AT_banque["Crédit"] = df_AT_banque["Mouvement"].apply(lambda x : x if x>0 else np.nan)

In [None]:
df_AT_banque["Banque"] = df_AT_banque["Compte"].apply(lambda x : x.split("-")[1]).str.strip()
df_AT_banque["Fonds"] = df_AT_banque["Compte"].apply(lambda x : x.split("-")[0]).str.strip().str.replace("CO","CC")
df_AT_banque["Pays"] = df_AT_banque["Compte"].apply(lambda x : x.split("-")[2]).str.strip()

In [None]:
df_AT_banque = df_AT_banque.drop(columns=["filename","Balance","Balance_N_1","Mouvement",'previous_values', 'Balance_previous', 'Valid','index'])

#### Extraction de la banque : CC_NL + XL_NL

In [None]:
df_NL_banque_cc = df_NL_banque_cc.reset_index().rename(columns={"index":"index_file"})
df_NL_banque_xl = df_NL_banque_xl.reset_index().rename(columns={"index":"index_file"})

In [None]:
df_NL_banque = pd.concat([df_NL_banque_cc,df_NL_banque_xl])

In [None]:
df_NL_banque = df_NL_banque[~df_NL_banque[1].isnull()]
df_NL_banque = df_NL_banque[df_NL_banque[3]!="Type"]

In [None]:
df_NL_banque = df_NL_banque.rename(columns={0 : "Valeur",
                                                 1 : "Libellé",
                                                 3 : "Type_Transaction",
                                                 4 : "Mouvement"})

In [None]:
df_NL_banque["Mouvement"] = df_NL_banque["Mouvement"].ffill(axis=0)

In [None]:
df_NL_banque = df_NL_banque.groupby(by=["Valeur","filename","Compte","Mouvement"]).agg({
                                                                             "index_file":"first",
                                                                             "Libellé":" ".join,
                                                                             "Type_Transaction":"first"}).reset_index()

In [None]:
df_NL_banque["Mouvement"] = df_NL_banque["Mouvement"].astype(str)

In [None]:
# Split the signed amount into credit ("+") and debit ("-"), then strip the sign
# and "." and turn "," into ".".
# Series.where keeps an object column even when no row carries the sign, so the
# .str calls cannot fail on an all-NaN float column; raw strings avoid invalid
# escape sequences in the patterns.
mouvement_nl = df_NL_banque["Mouvement"]
df_NL_banque["Crédit"] = mouvement_nl.where(mouvement_nl.str.contains("+",regex=False)).str.replace(r"\+|\.","",regex=True).str.replace(",",".",regex=False)
df_NL_banque["Débit"] = mouvement_nl.where(mouvement_nl.str.contains("-",regex=False)).str.replace(r"\-|\.","",regex=True).str.replace(",",".",regex=False)

In [None]:
df_NL_banque = df_NL_banque.sort_values(by="index_file")

In [None]:
df_NL_banque["IBAN"] = df_NL_banque["filename"].apply(lambda x : x.split("_")[0])

In [None]:
df_NL_banque["Type_Transaction"].unique()

In [None]:
# Translate the Dutch transaction types; Series.map turns any value missing
# from the dictionary into NaN
df_NL_banque["Type_Transaction"] = df_NL_banque["Type_Transaction"].map({"Overschrijving":"Virement",
                                                                          "Online bankieren" : "Online banking",
                                                                          "Diversen": "Divers",
                                                                          "Verzamelbetaling":"Frais et autres dépenses"})

In [None]:
df_NL_banque["Banque"] = df_NL_banque["Compte"].apply(lambda x : x.split("-")[1]).str.strip()
df_NL_banque["Fonds"] = df_NL_banque["Compte"].apply(lambda x : x.split("-")[0]).str.strip().str.replace("CO","CC")
df_NL_banque["Pays"] = df_NL_banque["Compte"].apply(lambda x : x.split("-")[2]).str.strip()

In [None]:
df_NL_banque["Id sys"] = df_NL_banque["filename"].apply(lambda x : "_".join(x.split("_")[1:]).replace(".xlsx","")) + "_" + df_NL_banque["index_file"].astype(str)

In [None]:
df_NL_banque = df_NL_banque.sort_values(by=["filename","index_file"])

In [None]:
df_NL_banque = df_NL_banque.drop(columns=['filename','index_file','Mouvement'])
df_NL_banque = df_NL_banque.reset_index(drop=True)

In [None]:
df_NL_banque["Date"] = df_NL_banque["Valeur"]

In [None]:
df_NL_banque["Crédit"] = df_NL_banque["Crédit"].astype(float)
df_NL_banque["Débit"] = df_NL_banque["Débit"].astype(float)

#### Extraction de la banque : CC_PT

In [None]:
df_PT_banque_cc0 = df_PT_banque_cc

In [None]:
df_PT_banque_cc = df_PT_banque_cc0

In [None]:
### Get the IBAN
# Take the first cell of column 0 mentioning "IBAN" (IndexError if there is none) ...
PT_banque_cc_iban = df_PT_banque_cc[df_PT_banque_cc[0].str.contains("IBAN",na=False)][0].to_list()[0]
# ... and capture the alphanumeric run after "IBAN:" followed by at least two whitespace characters
PT_banque_cc_iban = re.findall(r'(?<=IBAN)\:\s{2,}([a-zA-Z\d]*)',PT_banque_cc_iban)[0]

In [None]:
df_PT_banque_cc = df_PT_banque_cc[~df_PT_banque_cc[8].isnull()]
df_PT_banque_cc = df_PT_banque_cc[~df_PT_banque_cc[8].str.contains("Saldo")]
df_PT_banque_cc = df_PT_banque_cc[~df_PT_banque_cc[12].str.contains("Saldo")]

In [None]:
df_PT_banque_cc = df_PT_banque_cc.rename(columns={1:"Date",
                                                  2: "Valeur",
                                                  3: "Libellé",
                                                  8: "Mouvement"})
df_PT_banque_cc = df_PT_banque_cc[['Date',    'Valeur',   'Libellé','Mouvement','filename',    'Compte']]
df_PT_banque_cc["IBAN"] = PT_banque_cc_iban

#### Extraction de la banque : XL_PT

In [None]:
df_PT_banque_xl0 = df_PT_banque_xl

In [None]:
### Récupérer l'IBAN
PT_banque_xl_iban = df_PT_banque_xl[df_PT_banque_xl[0].str.contains("IBAN",na=False)][0].to_list()[0]
PT_banque_xl_iban = re.findall(r'(?<=IBAN)\:\s{2,}([a-zA-Z\d]*)',PT_banque_xl_iban)[0]

In [None]:
df_PT_banque_xl = df_PT_banque_xl[~df_PT_banque_xl[11].isnull()]
df_PT_banque_xl = df_PT_banque_xl[df_PT_banque_xl[11]!="Valor"]

In [None]:
df_PT_banque_xl = df_PT_banque_xl.rename(columns={1:"Date",
                                                  2: "Valeur",
                                                  3: "Libellé",
                                                  11: "Mouvement"})
df_PT_banque_xl = df_PT_banque_xl[['Date',    'Valeur',   'Libellé','Mouvement','filename',    'Compte']]
df_PT_banque_xl["IBAN"] = PT_banque_xl_iban


#### Concat PT

In [None]:
df_PT_banque = pd.concat([df_PT_banque_xl,df_PT_banque_cc]).reset_index()

In [None]:
# Remove "." and turn "," into "." before casting the amount to float
df_PT_banque["Mouvement"] = df_PT_banque["Mouvement"].str.replace(".","",regex=False)
df_PT_banque["Mouvement"] = df_PT_banque["Mouvement"].str.replace(",",".",regex=False)
df_PT_banque["Mouvement"] = df_PT_banque["Mouvement"].astype(float)

# Positive amounts are credits; negative amounts are debits, stored as absolute values
df_PT_banque["Crédit"] = df_PT_banque["Mouvement"].apply(lambda x : x if x>0 else np.nan)
df_PT_banque["Débit"] = abs(df_PT_banque["Mouvement"].apply(lambda x : x if x<0 else np.nan))

In [None]:
df_PT_banque["Banque"] = df_PT_banque["Compte"].apply(lambda x : x.split("-")[1]).str.strip()
df_PT_banque["Fonds"] = df_PT_banque["Compte"].apply(lambda x : x.split("-")[0]).str.strip().str.replace("CO","CC")
df_PT_banque["Pays"] = df_PT_banque["Compte"].apply(lambda x : x.split("-")[2]).str.strip()

In [None]:
# NOTE(review): the year is hard-coded to 2021 — confirm every PT statement belongs to that year
df_PT_banque["Date"] = df_PT_banque["Date"] +"-2021"
df_PT_banque["Valeur"] = df_PT_banque["Valeur"] +"-2021"
# "Id sys" = file name without ".xlsx" + "_" + the row index kept by reset_index
df_PT_banque["Id sys"] = df_PT_banque["filename"].str.replace(".xlsx","",regex=False) + "_" + df_PT_banque["index"].astype(str)

In [None]:
df_PT_banque = df_PT_banque.drop(columns=["index","Mouvement"])

In [None]:
df_PT_banque["Date"] = pd.to_datetime(df_PT_banque["Date"],format= "%d-%m-%Y")
df_PT_banque["Valeur"] = pd.to_datetime(df_PT_banque["Valeur"],format= "%d-%m-%Y")

### Concat comptes étrangers

In [None]:
df_banque = pd.concat([df_AT_banque,df_NL_banque,df_PT_banque])
df_banque["Débit"] = abs(df_banque["Débit"])

In [None]:
#### Year rollover: when "Date" is in December and "Valeur" in January, add one year to "Valeur"
mask = (df_banque['Date'].dt.month == 12) & (df_banque['Valeur'].dt.month == 1)
df_banque.loc[mask,"Valeur"] = df_banque.loc[mask,"Valeur"] + pd.offsets.DateOffset(years=1)

In [None]:
### Remove the rows already present in the AMB extract
# Earliest AMB date per (Pays, Fonds); only bank rows strictly before it are kept
df_date_min = df_AMB.groupby(by=["Pays","Fonds"])["Date"].min().reset_index().rename(columns={"Date":"Date_min"})
# NOTE(review): this is an inner merge, so bank rows whose (Pays, Fonds) is absent
# from df_AMB are dropped as well — confirm this is intended
df_banque = df_banque.merge(df_date_min,on=["Pays","Fonds"])
df_banque = df_banque[df_banque["Date"]< df_banque["Date_min"]]

In [None]:
df_foreign = pd.concat([df_AMB,df_banque]).reset_index(drop=True)

In [None]:
# Monthly period of the value date, used for the statistics exports
df_foreign["Mois_Year"] = df_foreign["Valeur"].dt.to_period('M')
# When the stripped account name ends with "-", its last two characters are removed
df_foreign["Compte"] = df_foreign["Compte"].str.strip().apply(lambda x : x[:-2] if x[-1]=="-" else x)
# Remove the spaces inside the IBAN
df_foreign["IBAN"] = df_foreign["IBAN"].str.replace(" ","") 

In [None]:
df_foreign.groupby(by=["Compte","Mois_Year"]).agg({"Id sys":"count","Crédit":"sum","Débit":"sum"}).reset_index().to_excel(folder_path + "/transformed_data/2. Banque/Comptes étrangers_statisiques.xlsx",sheet_name="foreign")

In [None]:
df_foreign.groupby(by=["Compte"]).agg({
    "Date":"min",
    "Valeur":"max",
    "Mois_Year":"nunique",
    "Id sys":"count",
    "Crédit":"sum",
    "Débit":"sum"}).reset_index().to_excel(folder_path + "/transformed_data/2. Banque/Comptes étrangers_statistiques_globales.xlsx",sheet_name="foreign")

In [None]:
df_foreign["Libellé"] = df_foreign["Libellé"].str.upper().str.replace("/RE MI","REMI",regex=False).str.replace(r'\s+', ' ', regex=True)
df_foreign["Type_Transaction"] = df_foreign["Type_Transaction"].str.upper()

In [None]:
def get_type_transact_foreign(x):
    """Return "VIREMENT" when the label mentions an incoming transfer, else NaN.

    A label counts as a transfer when it contains "INCOMING PAYMENT" or
    "VIR R SEPA".
    """
    transfer_markers = ("INCOMING PAYMENT", "VIR R SEPA")
    if any(marker in x for marker in transfer_markers):
        return "VIREMENT"
    return np.nan

In [None]:
def clean_foreign_name(text, pattern_clean_motif):
    """Remove every match of ``pattern_clean_motif`` from ``text`` and tidy the rest.

    After the regex removal the text is stripped, split on single spaces, and
    only the pieces longer than one character are kept, joined by one space.
    """
    stripped = re.sub(pattern_clean_motif, "", text).strip()
    kept_words = [word for word in stripped.split(" ") if len(word) > 1]
    return " ".join(kept_words)

In [None]:
mask = df_foreign["Type_Transaction"].isnull()
df_foreign.loc[mask,"Type_Transaction"] = df_foreign.loc[mask,"Libellé"].apply(get_type_transact_foreign)

In [None]:
df_foreign_credit = df_foreign[df_foreign["Débit"].isnull()]
df_foreign_debit = df_foreign[df_foreign["Crédit"].isnull()]

In [None]:
df_foreign_AT = df_foreign_credit[df_foreign_credit["Pays"]=="AT"]
df_foreign_NL = df_foreign_credit[df_foreign_credit["Pays"]=="NL"]
df_foreign_PT = df_foreign_credit[df_foreign_credit["Pays"]=="PT"]

#### Get Titulaire compte PT

In [None]:
# Keep only the transfers; .copy() because columns are added to this frame below
# (avoids SettingWithCopyWarning on a slice of df_foreign_PT)
df_virement_PT = df_foreign_PT[df_foreign_PT["Type_Transaction"]=="VIREMENT"].copy()

In [None]:
### Clean the account holder
# The holder is the text captured after "9001-" in the label
pt_pattern = r'(?<=9001)\-([a-zA-Z\s\-\.\+\d]*)'
df_virement_PT["Titulaire_clean"] = df_virement_PT["Libellé"].str.extract(pt_pattern)
df_virement_PT["Titulaire_clean"] = df_virement_PT["Titulaire_clean"].apply(get_words_only)
df_virement_PT["Titulaire_clean"] = df_virement_PT["Titulaire_clean"].apply(remove_duplicated)

# Remove the PT noise words, then keep only the words containing at least one of A, E, I, O, U
df_virement_PT["Titulaire_clean"] = df_virement_PT.apply(lambda f: clean_foreign_name(f["Titulaire_clean"],
                                                                                     motsparasites_pt),axis=1)
df_virement_PT["Titulaire_clean"] = df_virement_PT["Titulaire_clean"].apply(lambda text : " ".join([word for word in text.split() if any(letter in 'AEIOU' for letter in word) ]))

In [None]:
### When the holder came out empty, fall back on the motif
mask = df_virement_PT["Titulaire_clean"]==""
# The motif is the text after the last "/" of the label, cleaned with the same steps as the holder
df_virement_PT.loc[mask,"Motif1"] = df_virement_PT.loc[mask,"Libellé"].apply(lambda x : x.split("/")[-1])
df_virement_PT.loc[mask,"Motif1"] = df_virement_PT.loc[mask,"Motif1"].apply(get_words_only)
df_virement_PT.loc[mask,"Motif1"] = df_virement_PT.loc[mask,"Motif1"].apply(remove_duplicated)
df_virement_PT.loc[mask,"Motif1"] = df_virement_PT.loc[mask,:].apply(lambda f: clean_foreign_name(f["Motif1"],
                                                                                     motsparasites_pt),axis=1)
df_virement_PT.loc[mask,"Motif1"] = df_virement_PT.loc[mask,"Motif1"].apply(lambda text : " ".join([word for word in text.split() if any(letter in 'AEIOU' for letter in word) ]))
df_virement_PT.loc[mask,"Motif1"] = df_virement_PT.loc[mask,"Motif1"].apply(remove_de)

# The cleaned motif becomes the holder for those rows
df_virement_PT.loc[mask,"Titulaire_clean"] = df_virement_PT.loc[mask,"Motif1"]

#### Get Titulaire compte NL

In [None]:
df_foreign_NL["Type_Transaction"].unique()

In [None]:
# Drop the fee lines; .copy() because columns are added to this frame below
# (avoids SettingWithCopyWarning on a slice of df_foreign_NL)
df_virement_NL = df_foreign_NL[df_foreign_NL["Type_Transaction"] != 'FRAIS ET AUTRES DÉPENSES'].copy()

In [None]:
### Type_Transaction = Online Banking, titulaire = libellé
mask = df_virement_NL["Type_Transaction"] == "ONLINE BANKING"
df_virement_NL.loc[mask,"Titulaire_clean"] = df_virement_NL.loc[mask,"Libellé"]
df_virement_NL.loc[mask,"Titulaire_clean"] = df_virement_NL.loc[mask,"Titulaire_clean"].str.replace("EN/OF","",regex=False)

In [None]:
### Type_Transaction = Virement
mask = df_virement_NL["Type_Transaction"] == "VIREMENT"

### Normalize the label before extraction: repair tags split by a space
### ("/UST D/", "R EMI/", "REM I/"), drop the space after "/", remove dots,
### and replace "///" with a space
df_virement_NL.loc[mask,"Libellé_clean"] = df_virement_NL.loc[mask,"Libellé"].str.replace('/UST D/','/USTD/',regex=False)
df_virement_NL.loc[mask,"Libellé_clean"] = df_virement_NL.loc[mask,"Libellé_clean"].str.replace('/ ','/',regex=False)
df_virement_NL.loc[mask,"Libellé_clean"] = df_virement_NL.loc[mask,"Libellé_clean"].str.replace('.','',regex=False)
df_virement_NL.loc[mask,"Libellé_clean"] = df_virement_NL.loc[mask,"Libellé_clean"].str.replace('R EMI/','REMI/',regex=False)
df_virement_NL.loc[mask,"Libellé_clean"] = df_virement_NL.loc[mask,"Libellé_clean"].str.replace('REM I/','REMI/',regex=False)
df_virement_NL.loc[mask,"Libellé_clean"] = df_virement_NL.loc[mask,"Libellé_clean"].str.replace(r'\/{3}',' ',regex=True)


#### First rules: text after "NAME/", otherwise text right before " REMI"
pattern_name1 = r'(?<=NAME)\/([a-zA-Z\s\-\.\+\,\d]*)|([a-zA-Z\s\-\.\+\,\d]*)(?=\sREMI)'
df_result1 = df_virement_NL.loc[mask,"Libellé_clean"].str.extract(pattern_name1)
df_virement_NL.loc[mask,"Titulaire_clean"] = df_result1[0].combine_first(df_result1[1])

### Rows the first pattern did not match: try the other rules
### (text before "//REMI", before "//REOWN", or between "//" and "/REMI")
mask2 = (df_virement_NL["Type_Transaction"] == "VIREMENT") & (df_virement_NL["Titulaire_clean"].isnull())

pattern_name2 = r'([a-zA-Z\s\-\.\+\,\d]*)(?=\/{2}REMI)|([a-zA-Z\s\-\.\+\,\d]*)(?=\/{2}REOWN)|(?<=\/)\/([a-zA-Z\s\-\.\+\,\d]*)(?=\/REMI)'
df_result2 = df_virement_NL.loc[mask2,"Libellé_clean"].str.extract(pattern_name2)
df_virement_NL.loc[mask2,"Titulaire_clean"] = df_result2[0].combine_first(df_result2[1]).combine_first(df_result2[2])

### Rows still unmatched with a value date up to 2021-07-23: the whole label is the holder
mask3 = (df_virement_NL["Type_Transaction"] == "VIREMENT") & (df_virement_NL["Titulaire_clean"].isnull()) & (df_virement_NL["Valeur"]<= "2021-07-23")
df_virement_NL.loc[mask3,"Titulaire_clean"] = df_virement_NL.loc[mask3,"Libellé_clean"]

In [None]:
### Nettoyer le Titulaire
df_virement_NL["Titulaire_clean"] = df_virement_NL["Titulaire_clean"].apply(get_words_only)
df_virement_NL["Titulaire_clean"] = df_virement_NL["Titulaire_clean"].str.replace(r'\bHR\b|\bMW\b',"",regex=True)
df_virement_NL["Titulaire_clean"] = df_virement_NL["Titulaire_clean"].str.strip().str.replace(r'\s+', ' ', regex=True)

In [None]:
### Extract the motif:
## Motif 1: text following "REMI/"
pattern_motif1_remi = r'(?<=REMI)\/([a-zA-Z\s\-\.\,\d]*)'
df_virement_NL.loc[:,"Motif1"] = df_virement_NL.loc[:,"Libellé_clean"].str.extract(pattern_motif1_remi)

## USTD: for rows where Motif1 is missing or is just "USTD", take the text after "USTD//" or "USTD/"
pattern_motif1_ustd = r'(?<=USTD)\/{2}([a-zA-Z\s\-\.\,\d]*)|(?<=USTD)\/([a-zA-Z\s\-\.\,\d]*)'
# Update the rows in place through a boolean mask. Splitting the frame on "Id sys"
# and concatenating it back would silently lose rows if an "Id sys" were
# duplicated, and would assign on a slice.
mask_ustd = (df_virement_NL["Motif1"].isnull()) | (df_virement_NL["Motif1"]=="USTD")
df_result = df_virement_NL.loc[mask_ustd,"Libellé_clean"].str.extract(pattern_motif1_ustd)
df_virement_NL.loc[mask_ustd,"Motif1"] = df_result[0].combine_first(df_result[1])

## Motif2: text following "EREF/"
pattern_motif2 = r'(?<=EREF)\/([a-zA-Z\s\-\.\,\d]*)'
df_virement_NL["Motif2"] = df_virement_NL["Libellé_clean"].str.extract(pattern_motif2)

df_virement_NL = df_virement_NL.drop(columns="Libellé_clean")

In [None]:
### Clean données
df_virement_NL["Motif2"] = df_virement_NL["Motif2"].astype(str).apply(lambda x : np.nan if x.replace(" ","")=="NOTPROVIDED" or x=="nan" else x)
df_virement_NL["Motif2"] = df_virement_NL["Motif2"].apply(get_words_only)
df_virement_NL["Motif2"] = df_virement_NL["Motif2"].str.replace(r'\bHR\b|\bMW\b',"",regex=True)
df_virement_NL["Motif2"] = df_virement_NL["Motif2"].str.strip().str.replace(r'\s+', ' ', regex=True)
df_virement_NL["Motif2"] = df_virement_NL["Motif2"].apply(lambda x : np.nan if x=="nan" or x=="None" else x)

In [None]:
def clean_motif1_foreign(text,list_motsparasites = list_motsparasites_nl):
    """Drop noise words that appear split over two consecutive tokens.

    ``text`` is split on whitespace. For each noise word found in the
    space-less text, every pair of adjacent tokens whose concatenation equals
    that word is looked up; all tokens equal to either half of the last such
    pair are removed. Returns the remaining tokens joined by one space, or
    ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None
    tokens = text.split()
    for mot in list_motsparasites:
        # Work on the token list as it stands before this noise word is handled
        snapshot = tokens
        if mot not in "".join(snapshot):
            continue
        for left, right in zip(snapshot, snapshot[1:]):
            if left + right == mot:
                tokens = [token for token in snapshot if token not in (left, right)]
    return " ".join(tokens)

In [None]:
# Apply the same cleaning steps to both motif columns
for col in ["Motif1","Motif2"]:
    # "NOT PROVIDED" placeholders and stringified NaN become missing values
    df_virement_NL[col] = df_virement_NL[col].astype(str).apply(lambda x : np.nan if x.replace(" ","")=="NOTPROVIDED" or x=="nan" else x)
    df_virement_NL[col] = df_virement_NL[col].apply(get_words_only)

    df_virement_NL[col] = df_virement_NL[col].apply(clean_motif1_foreign)
    # Only the current column is read, so cast that column alone instead of
    # casting the whole frame to str and iterating over its rows
    df_virement_NL[col] = df_virement_NL[col].astype(str).apply(lambda value: clean_foreign_name(value, motsparasites_nl))

    # Remove the standalone words HR and MW, then collapse whitespace
    df_virement_NL[col] = df_virement_NL[col].str.replace(r'\bHR\b|\bMW\b',"",regex=True)
    df_virement_NL[col] = df_virement_NL[col].str.strip().str.replace(r'\s+', ' ', regex=True)
    # The str cast above turned missing values into "nan"/"None": restore NaN
    df_virement_NL[col] = df_virement_NL[col].apply(lambda x : np.nan if x=="nan" or x=="None" else x)



#### Get Titulaire compte AT

In [None]:
df_foreign_AT["Type_Transaction"].unique()

In [None]:
df_virement_AT = df_foreign_AT.copy() ### il n'y a que des virements sur les comptes de AT

In [None]:
# The holder is the part of the label before the first "/"
df_virement_AT.loc[:,"Titulaire_clean"] = df_virement_AT.loc[:,"Libellé"].apply(lambda x : x.split("/")[0])
# Banking keywords to strip, joined into one regex alternation.
# NOTE(review): the alternation has no word boundaries, so re.sub also removes these
# letters inside longer words (e.g. "BEN" in "BENOIT") — confirm this is intended.
list_mots = ["BEN", "CLT", "ACC", "SCT", "CT", "INT", "REF", "NOTPROVIDED","INCOMING", "PAYMENT", "CT"]
list_mots = "|".join(list_mots)
df_virement_AT.loc[:,"Titulaire_clean"] = df_virement_AT.loc[:,"Titulaire_clean"].apply(get_words_only)
df_virement_AT.loc[:,"Titulaire_clean"] = df_virement_AT.apply(lambda f: clean_foreign_name(f["Titulaire_clean"],
                                                                                         list_mots),axis=1)
# Remove the standalone word DR, then collapse whitespace
df_virement_AT["Titulaire_clean"] = df_virement_AT["Titulaire_clean"].str.replace(r'\bDR\b',"",regex=True)
df_virement_AT.loc[:,"Titulaire_clean"] = df_virement_AT.loc[:,"Titulaire_clean"].str.strip().str.replace(r'\s+', ' ', regex=True)

In [None]:
### Extract the motif:
## Motif 1: text following "REMI/"
pattern_motif1_remi = r"(?<=REMI)\/([a-zA-Z\s\-\.\,\'\d]*)"
df_virement_AT.loc[:,"Motif1"] = df_virement_AT.loc[:,"Libellé"].str.extract(pattern_motif1_remi)

## SCOR: for rows where Motif1 is missing or is just "SCOR", take the text after "SCOR/"
pattern_motif1_scor = r'(?<=SCOR)\/([a-zA-Z\s\-\.\,\d]*)'
df_virement_AT["Motif1"] = df_virement_AT["Motif1"].apply(lambda x : x if x!="SCOR" else np.nan)
# Update the rows in place through a boolean mask. Splitting the frame on "Id sys"
# and concatenating it back would silently lose rows if an "Id sys" were
# duplicated, and would assign on a slice.
mask_scor = df_virement_AT["Motif1"].isnull()
df_virement_AT.loc[mask_scor,"Motif1"] = df_virement_AT.loc[mask_scor,"Libellé"].str.extract(pattern_motif1_scor, expand=False)

## Motif2: text following "EREF/"
pattern_motif2 = r'(?<=EREF)\/([a-zA-Z\s\-\.\,\d]*)'
df_virement_AT["Motif2"] = df_virement_AT["Libellé"].str.extract(pattern_motif2)

In [None]:
#### Clean the motifs
for col in ["Motif1","Motif2"]:
    # "NOT PROVIDED" placeholders and stringified NaN become missing values
    df_virement_AT[col] = df_virement_AT[col].astype(str).apply(lambda x : np.nan if x.replace(" ","")=="NOTPROVIDED" or x=="nan" else x)
    df_virement_AT[col] = df_virement_AT[col].apply(get_words_only)

    # Both helpers only read the current column, so apply them on that column
    # instead of iterating over every row of the frame (and casting it all to str)
    df_virement_AT[col] = df_virement_AT[col].apply(lambda value: clean_motif1_foreign(value, list_motsparasites_at))
    df_virement_AT[col] = df_virement_AT[col].astype(str).apply(lambda value: clean_foreign_name(value, motsparasites_at))

    # Remove the standalone word DR, then collapse whitespace
    df_virement_AT[col] = df_virement_AT[col].str.replace(r'\bDR\b',"",regex=True)
    df_virement_AT[col] = df_virement_AT[col].str.strip().str.replace(r'\s+', ' ', regex=True)
    # The str cast above turned missing values into "nan"/"None": restore NaN
    df_virement_AT[col] = df_virement_AT[col].apply(lambda x : np.nan if x=="nan" or x=="None" else x)


In [None]:
df_foreign_credit.to_excel(folder_path + "/transformed_data/2. Banque/Comptes étrangers_Lignes de crédit.xlsx",sheet_name="foreign",index=False)
df_foreign_debit.to_excel(folder_path + "/transformed_data/2. Banque/Comptes étrangers_Lignes de débit.xlsx",sheet_name="foreign",index=False)

In [None]:
df_virements_foreign = pd.concat([df_virement_AT,df_virement_PT,df_virement_NL]).reset_index()

In [None]:
df_virements_foreign = df_virements_foreign.sort_values(by=["Fonds","Pays","Date"])

In [None]:
df_virements_foreign = df_virements_foreign[['Date', 'Valeur', 'Libellé', 'Code opération',
       'Type_Transaction', 'Ref.', 'Ref.2', 'Numéro de pièce', 'Débit',
       'Crédit', 'IBAN', 'Compte', 'Banque', 'Fonds', 'Pays', 'Id sys',
       'Titulaire_clean', 'Motif1', 'Motif2']]

In [None]:
df_virements_foreign.to_excel(folder_path + "/transformed_data/2. Banque/Comptes étrangers_Virements.xlsx",sheet_name="virements",index=False)

### Concat

In [None]:
df_credit = pd.concat([df_credit1,df_credit2,df_credit_bred])
df_debit = pd.concat([df_debit1,df_debit2,df_debit_bred])

In [None]:
df_debit = df_debit[['Id sys', 'Date', 'Valeur', 'Libellé', 'Ref.', 'Débit', 'Crédit',
       'Fonds', 'IBAN', 'Type_Transaction']]
df_credit = df_credit[['Id sys', 'Date', 'Valeur', 'Libellé', 'Ref.', 'Débit', 'Crédit',
       'Fonds', 'IBAN', 'Type_Transaction']]

In [None]:
# df_virements2 = df_virements2.drop(columns=["Libellé_origin"])
# df_virements1 = df_virements1.drop(columns=["ID_remise","Nb_transaction"])

In [None]:
df_vir_doublons2 = df_vir_doublons2.drop(columns=["Libellé_origin"])
df_vir_doublons1 = df_vir_doublons1.drop(columns=["ID_remise","Nb_transaction"])

In [None]:
df_virements = pd.concat([df_virements1,df_virements2])
df_vir_doublons = pd.concat([df_vir_doublons1,df_vir_doublons2])

In [None]:
df_virements_total = pd.concat([df_virements,df_vir_doublons])

In [None]:
df_virements_total.to_excel(folder_path + "/transformed_data/2. Banque/All_Virements de la colllecte.xlsx",sheet_name="Sheet1",index=False)

In [None]:
len(df_virements) + len(df_vir_doublons)

In [None]:
def reclasser_type_transaction(type_transaction):
    """Map a raw transaction type onto its normalized category.

    The raw value has "VIRT" replaced by "VIR", "RECU" removed, and is
    stripped. It is then classified as:

    - "REMISE CHEQUE" when it contains "REMISE";
    - "PRELEVEMENT" when it contains "PRLV";
    - "VIR TRESORERIE" for the treasury transfer labels;
    - "VIREMENT" for the ordinary transfer labels;
    - otherwise the cleaned value itself.

    A missing value (string form "nan") returns NaN. Previously that case
    went on to evaluate ``"REMISE" in nan`` and raised a TypeError.
    """
    if str(type_transaction) == "nan":
        return np.nan

    type_transaction_clean = type_transaction.replace("VIRT","VIR").replace("RECU","").strip()
    if "REMISE" in type_transaction_clean:
        return "REMISE CHEQUE"
    if "PRLV" in type_transaction_clean:
        return "PRELEVEMENT"
    if type_transaction_clean in ("VIR TRESORERIE","VIR TRESO ETRANGER"):
        return "VIR TRESORERIE"
    if type_transaction_clean in ("VIR INST","VIR SEPA","VIREMENT  TIERS","VIR ETRANGER","VIREMENT"):
        return "VIREMENT"
    return type_transaction_clean

In [None]:
mask = ~df_credit["Type_Transaction"].isnull()
df_credit.loc[mask,"Type_Transaction_clean"] = df_credit.loc[mask,"Type_Transaction"].apply(reclasser_type_transaction)

In [None]:
# Renumber the rows 0..n-1 and expose that position as an "index" column
df_credit = df_credit.reset_index().drop(columns="index").reset_index()
# "Id sys" longer than 8 characters is replaced by the row position
# zero-padded to 8 digits; the other ids are kept as they are
mask = df_credit["Id sys"].str.len() > 8
df_credit.loc[mask,"Id sys_clean"] = df_credit.loc[mask,"index"].astype(str).apply(lambda x: x.zfill(8)) 
df_credit.loc[~mask,"Id sys_clean"] = df_credit.loc[~mask,"Id sys"]
df_credit = df_credit.drop(columns="index")

In [None]:
# Renumber the rows 0..n-1 and expose that position as an "index" column
df_debit = df_debit.reset_index().drop(columns="index").reset_index()
# "Id sys" longer than 8 characters is replaced by "1" followed by the row
# position zero-padded to 7 digits; the other ids are kept as they are
mask = df_debit["Id sys"].str.len() > 8
df_debit.loc[mask,"Id sys_clean"] = df_debit.loc[mask,"index"].astype(str).apply(lambda x: "1" + x.zfill(7)) 
df_debit.loc[~mask,"Id sys_clean"] = df_debit.loc[~mask,"Id sys"]
df_debit = df_debit.drop(columns="index")

In [None]:
len(df_credit)

### Forcer l'Id sys

Avec l'intégration progressive des données, le code a été amélioré, ce qui a modifié le calcul de l'Id sys par rapport à la version déjà en production. Cette étape sert uniquement à récupérer les Id sys déjà en production.

A la sortie, nous allons avoir 3 colonnes d'Id sys :
+ __Id sys_old__ : l'ancienne version déjà en prod
+ __Id sys__ : nouveaux Id sys
+ __Id sys_clean__ : version nettoyée de l'Id sys, composée uniquement de chiffres pour pouvoir être importée ensuite dans le BO (les autres Id sys sont construits sous la forme nom du fichier + index de la ligne)

In [None]:
df_credit_old = pd.read_excel(folder_path + "/raw_data/6. Données modifiées manuellement/1. Relevé_ancienne version/All_lignes de crédit.xlsx",sheet_name="crédit")
df_debit_old = pd.read_excel(folder_path + "/raw_data/6. Données modifiées manuellement/1. Relevé_ancienne version/All_lignes de débit.xlsx",sheet_name="débit")

In [None]:
def clean_duplicates(df_match_containing_duplicates,columns_matching):
    """Turn a many-to-many merge result into one-to-one matches.

    Rows whose "Id sys_old" appears only once are kept untouched. For the
    others, the distinct old rows and the distinct new rows are paired rank
    by rank within each ``columns_matching`` key: first remaining old row
    with first remaining new row, then the second ones, and so on for at
    most 10 rounds.

    Returns the untouched rows followed by the re-paired rows.
    """
    merged = df_match_containing_duplicates
    ambiguous = merged[merged.duplicated(subset="Id sys_old", keep=False)]
    # Rows whose old id was matched exactly once
    resolved = merged[~merged["Id sys_old"].isin(ambiguous["Id sys_old"])]

    # Distinct candidates on the old side and on the new side
    old_pool = ambiguous[columns_matching + ["Id sys_old"]].drop_duplicates()
    new_columns = [name for name in ambiguous.columns if "_y" not in name]
    new_columns.remove("Id sys_old")
    new_pool = ambiguous[new_columns].drop_duplicates()

    paired = pd.DataFrame()
    for _ in range(10):  # assumption: at most 10 transfers with the same amount and client per day
        # Take the first remaining candidate of each key on both sides ...
        old_round = old_pool.drop_duplicates(subset=columns_matching, keep="first")
        old_pool = old_pool[~old_pool["Id sys_old"].isin(old_round["Id sys_old"])]

        new_round = new_pool.drop_duplicates(subset=columns_matching, keep="first")
        new_pool = new_pool[~new_pool["Id sys"].isin(new_round["Id sys"])]

        # ... and pair them on the matching key
        paired = pd.concat([paired, new_round.merge(old_round, on=columns_matching)])
    return pd.concat([resolved, paired])

In [None]:
def recuperer_Id_sys_old(df_credit,df_credit_old):
    """Carry the ``Id sys`` of the previous export over to the new lines.

    Matching is done in two passes, each followed by ``clean_duplicates`` to
    resolve ambiguous pairings:

    1. on date, label, value date, reference, debit, credit and fund;
    2. for what is still unmatched, on the same columns without the label
       (some old labels are slightly different because they carry an extra
       part that had not been cleaned).

    Args:
        df_credit: New statement lines. Must contain ``"Id sys"``,
            ``"Libellé"`` and the matching columns. NOTE: its ``"Id sys"``
            and ``"Libellé"`` columns are converted to ``str`` in place, as
            the function has always done.
        df_credit_old: Previous export, with its identifier in ``"Id sys"``.
            It is not modified.

    Returns:
        tuple: ``(df_credit, df_old_restant)``.
        ``df_credit`` is the new lines with an ``"Id sys_old"`` column;
        lines without any match get ``"Nouvelles_données_intégrées"``.
        ``df_old_restant`` is the old lines that were not matched, with
        ``"Id sys"`` set to their ``"Id sys_old"``.
    """
    columns_with_label = ["Date","Libellé","Valeur","Ref.","Débit","Crédit","Fonds"]
    columns_without_label = ["Date","Valeur","Ref.","Débit","Crédit","Fonds"]

    ### Formatting
    df_credit_old = df_credit_old.rename(columns={"Id sys":"Id sys_old"})
    # .copy() so the column assignments below act on an independent frame
    df_credit_old = df_credit_old[["Id sys_old"] + columns_with_label].copy()
    df_credit_old["Id sys_old"] = df_credit_old["Id sys_old"].astype(str)
    df_credit_old["Libellé"] = df_credit_old["Libellé"].astype(str).str.upper()
    # Deliberately kept in place on the caller's frame (historical behaviour)
    df_credit["Id sys"] = df_credit["Id sys"].astype(str)
    df_credit["Libellé"] = df_credit["Libellé"].astype(str)

    ### Pass 1: match on every column, label included
    df_match_avec_libelle = df_credit.merge(df_credit_old, on=columns_with_label)
    df_match_avec_libelle = clean_duplicates(df_match_avec_libelle, columns_with_label)

    df_old_restant = df_credit_old[~df_credit_old["Id sys_old"].isin(df_match_avec_libelle["Id sys_old"])]
    df_new_restant = df_credit[~df_credit["Id sys"].isin(df_match_avec_libelle["Id sys"])]

    ### Pass 2: same match without the label, for lines whose label differs slightly
    df_old_restant = df_old_restant[columns_without_label + ["Id sys_old", "Libellé"]]
    df_match_sans_libelle = df_new_restant.merge(df_old_restant, on=columns_without_label)
    df_match_sans_libelle = clean_duplicates(df_match_sans_libelle, columns_without_label)
    # Both sides had a "Libellé": keep the new one (suffix _x), drop the old one (_y)
    df_match_sans_libelle = df_match_sans_libelle.drop(columns="Libellé_y").rename(columns={"Libellé_x":"Libellé"})

    ### Whatever is still unmatched on either side
    df_old_restant = df_old_restant[~df_old_restant["Id sys_old"].isin(df_match_sans_libelle["Id sys_old"])].copy()
    df_new_restant = df_new_restant[~df_new_restant["Id sys"].isin(df_match_sans_libelle["Id sys"])].copy()
    df_old_restant["Id sys"] = df_old_restant["Id sys_old"]
    # Original author's observation: the unmatched new lines are BRED lines and CC direct debits
    df_new_restant["Id sys_old"] = "Nouvelles_données_intégrées"

    df_credit = pd.concat([df_match_avec_libelle, df_match_sans_libelle, df_new_restant])

    return df_credit, df_old_restant

In [None]:
# Recover the old Id sys for the credit lines.
# NOTE: the second value returned by recuperer_Id_sys_old is the OLD lines that
# could not be matched, despite the name given to it here.
df_credit_new,df_new_restant = recuperer_Id_sys_old(df_credit,df_credit_old)

In [None]:
# Keep only the output columns, including the three identifiers
# (Id sys_old, Id sys_clean, Id sys).
df_credit_new = df_credit_new[[ 'Id sys_old', 'Date', 'Valeur', 'Libellé', 'Ref.', 'Débit', 'Crédit',
       'Fonds', 'IBAN', 'Type_Transaction', 'Type_Transaction_clean',
       'Id sys_clean','Id sys']]

In [None]:
# Same recovery for the debit lines; df_debit_new_restant holds the old debit
# lines left unmatched.
df_debit_new,df_debit_new_restant = recuperer_Id_sys_old(df_debit,df_debit_old)

In [None]:
### Create the Id sys_clean of the old debit lines that could not be matched
# Only the existing ids starting with "1" are considered when looking for the
# highest id already in use.
mask = df_debit_new["Id sys_clean"].str[0:1]=="1"
max_id = df_debit_new[mask]["Id sys_clean"].astype(int).max()
# Number from max_id + 1: starting at max_id itself would give the first added
# line the same id as the existing line that holds the maximum.
list_id = [max_id + offset for offset in range(1, len(df_debit_new_restant) + 1)]
df_debit_new_restant["Id sys_clean"] = list_id

In [None]:
# Append the unmatched old debit lines (which now carry an Id sys_clean) to the
# debit lines returned by the matching.
df_debit_new = pd.concat([df_debit_new,df_debit_new_restant])

In [None]:
# Keep only the output columns (no Type_Transaction_clean on the debit side).
df_debit_new = df_debit_new[['Id sys_old', 'Date', 'Valeur', 'Libellé', 'Ref.', 'Débit', 'Crédit',
       'Fonds', 'IBAN', 'Type_Transaction', 'Id sys_clean','Id sys']]

In [None]:
# Export both statement files to the transformed-data folder.
df_debit_new.to_excel(folder_path + "/transformed_data/2. Banque/All_Lignes de débit.xlsx",sheet_name="Sheet1",index=False)
df_credit_new.to_excel(folder_path + "/transformed_data/2. Banque/All_Lignes de crédit.xlsx",sheet_name="Sheet1",index=False)

In [None]:
# Control: number of credit lines that received an old Id sys, i.e. all lines
# minus those flagged "Nouvelles_données_intégrées".
len(df_credit_new) - len(df_credit_new[df_credit_new["Id sys_old"]=="Nouvelles_données_intégrées"])

## Commentaire BO

In [None]:
####### Merge all the comment information into a single column
# Missing subject / note text become the literal "NA" so the concatenation
# below does not propagate missing values for them.
for _col in ("subject", "NoteText"):
    df_commentaire[_col] = df_commentaire[_col].fillna("NA")

# Order ids are lower-cased before being used as the grouping key.
df_commentaire["cor_ordreid"] = df_commentaire["cor_ordreid"].str.lower()

# One text per comment: creation date, then subject, then note text.
df_commentaire["NoteBO"] = (
    df_commentaire["createdon"]
    + ", [Objet] "
    + df_commentaire["subject"]
    + ", [Note] "
    + df_commentaire["NoteText"]
)

# One row per order: all its comments joined with line breaks.
df_commentaire_agg = (
    df_commentaire
    .groupby(by="cor_ordreid")
    .agg({"NoteBO": "\n".join})
    .reset_index()
    .rename(columns={"cor_ordreid": "Id_order"})
)

In [None]:
# Export the per-order aggregated BO comments.
df_commentaire_agg.to_excel(folder_path + "/transformed_data/commentaire_agg.xlsx",sheet_name="commentaire",index=False)

## ClientOps

### Depuis 062021

In [None]:
# Clean the ClientOps follow-up for the period since 06/2021
# (clean_CLientOps is a project helper defined outside this section).
df_ClientOps1 = clean_CLientOps(df_ClientOps1)

In [None]:
# Split on the "<" column: rows where it is empty are kept untouched; for the
# other rows the content of "<" replaces "Date Valeur".
# NOTE(review): presumably "<" holds value dates that landed in the wrong
# column of the source file — confirm.
df_ClientOps1_clean = df_ClientOps1[df_ClientOps1["<"].isnull()]
df_ClientOps1_notclean = df_ClientOps1[~df_ClientOps1["<"].isnull()]
df_ClientOps1_notclean = df_ClientOps1_notclean.drop(columns="Date Valeur").rename(columns={"<":"Date Valeur"})

In [None]:
# Put the two parts back together.
df_ClientOps1 = pd.concat([df_ClientOps1_clean,df_ClientOps1_notclean])

In [None]:
# Value dates re-entered by hand from the Excel file, keyed by "Id sys", for
# lines whose stored date is wrong (e.g. 1900-04-01).
_MANUAL_VALUE_DATES = {
    "472950624": "2020-12-02 00:00:00",
    "29556139": "2022-05-10 00:00:00",
    "15319921": "2020-06-26 00:00:00",
}


def change_date(date_valeur,id_sys,date_soldee):
    """Return the value date to use for one ClientOps line.

    Some lines have an incorrect date format (1900-04-01); they are corrected
    manually with the real value taken from the Excel file.

    Args:
        date_valeur: Value date as read from the file.
        id_sys: Identifier of the line; compared as a string.
        date_soldee: Settlement date of the subscription, used as a fallback.

    Returns:
        The manually corrected date string when ``id_sys`` is one of the
        known lines; otherwise ``str(date_soldee)`` when ``date_valeur`` is
        missing (NaN, ``None``, ``NaT`` or the string ``"nan"``); otherwise
        ``date_valeur`` unchanged.
    """
    if id_sys in _MANUAL_VALUE_DATES:
        return _MANUAL_VALUE_DATES[id_sys]
    # pd.isna also catches None and NaT, which the former string comparison
    # with "nan" let through; the literal "nan" is still treated as missing.
    if pd.isna(date_valeur) or str(date_valeur) == "nan":
        return str(date_soldee)
    return date_valeur

In [None]:
# Row by row: apply the manual date corrections and fall back to the
# settlement date when the value date is missing (see change_date).
df_ClientOps1["Date Valeur"] = df_ClientOps1.apply(lambda f: change_date(f["Date Valeur"],f["Id sys"],f["Souscription soldée (Date/vide)"]), axis=1)
# Parse as "YYYY-MM-DD HH:MM:SS" first; values that fail are retried as
# "DD/MM/YYYY". Whatever matches neither format becomes NaT (errors="coerce").
df_ClientOps1["Date Valeur"] = pd.to_datetime(df_ClientOps1["Date Valeur"], format = '%Y-%m-%d %H:%M:%S',errors="coerce").fillna(pd.to_datetime(df_ClientOps1["Date Valeur"], format='%d/%m/%Y',errors="coerce"))

In [None]:
# Split at 2021-06-18: earlier rows are set aside as historical data.
# NOTE(review): rows whose date is NaT satisfy neither comparison, so they end
# up in neither frame — confirm this is intended.
df_ClientOps1_2 = df_ClientOps1[df_ClientOps1["Date Valeur"]< "2021-06-18"] ## Historical data
df_ClientOps1 = df_ClientOps1[df_ClientOps1["Date Valeur"]>= "2021-06-18"]

In [None]:
# Historical rows: keep the columns needed for matching, then give them the
# "_ClientOps" names ("Montant" becomes "Crédit_ClientOps").
df_ClientOps1_2 = df_ClientOps1_2.loc[
    :,
    [
        "Id sys",
        "Libellé",
        "Date Valeur",
        "Montant",
        "Référence de l'ordre",
        "Fonds",
        "Référence de l'ordre_origin",
        "Note",
    ],
].rename(
    columns={
        "Id sys": "Id sys_ClientOps",
        "Libellé": "Libellé_ClientOps",
        "Date Valeur": "Valeur_ClientOps",
        "Montant": "Crédit_ClientOps",
        "Fonds": "Fonds_ClientOps",
    }
)

In [None]:
# Rows since 2021-06-18: keep only the Id sys, the order references and the fund.
df_ClientOps1 = df_ClientOps1[["Id sys","Référence de l'ordre","Référence de l'ordre_origin",'Fonds']]

### Avant 062021

In [None]:
# Remove the empty rows (no order reference) and drop four columns
df_ClientOps2 = df_ClientOps2[~df_ClientOps2["Référence de l'ordre"].isnull()].drop(columns=['Code Associé','Souscription soldée', 'Colonne1','Colonne2'])

# Clean and remove the duplicates
df_ClientOps2 = clean_CLientOps(df_ClientOps2)

In [None]:
# Add the historical rows (dated before 2021-06-18) that were found in the
# file covering the period since 06/2021.
df_ClientOps2 = pd.concat([df_ClientOps2,df_ClientOps1_2])

In [None]:
######## The value date in the ClientOps file sometimes has day and month swapped, sometimes not. So we create 2 columns, one per format, in order to match as many rows as possible with the statement
df_ClientOps_initial = df_ClientOps2.drop_duplicates()
# Rows whose amount is the literal "i" are removed before the float conversion
# (presumably a data-entry artefact — confirm).
list_remove = ["i"]
df_ClientOps_initial = df_ClientOps_initial[~df_ClientOps_initial["Crédit_ClientOps"].isin(list_remove)]
df_ClientOps_initial["Crédit_ClientOps"] = df_ClientOps_initial["Crédit_ClientOps"].astype(float)
# Keep the raw value so both readings below start from the same input.
df_ClientOps_initial["Valeur_initiale_ClientOps"] = df_ClientOps_initial["Valeur_ClientOps"]
# Valeur_ClientOps: read as year-DAY-month (swapped), falling back to DD/MM/YYYY.
df_ClientOps_initial["Valeur_ClientOps"] = pd.to_datetime(df_ClientOps_initial["Valeur_initiale_ClientOps"], format='%Y-%d-%m %H:%M:%S',errors="coerce").fillna(pd.to_datetime(df_ClientOps_initial["Valeur_initiale_ClientOps"], format='%d/%m/%Y',errors="coerce"))
# Valeur_ClientOps_2: read as year-month-day, with the same fallback.
df_ClientOps_initial["Valeur_ClientOps_2"] = pd.to_datetime(df_ClientOps_initial["Valeur_initiale_ClientOps"], format='%Y-%m-%d %H:%M:%S',errors="coerce").fillna(pd.to_datetime(df_ClientOps_initial["Valeur_initiale_ClientOps"], format='%d/%m/%Y',errors="coerce"))

### For rows with the same amount and date (coming from different people / several transfers from the same person), the match must also use the account holder
df_ClientOps_doublons = df_ClientOps_initial[df_ClientOps_initial.duplicated(subset=["Valeur_ClientOps","Crédit_ClientOps"],keep=False)]
# Build a cleaned holder name from the label; get_words_only, clean_motif,
# clean_name, remove_de and remove_duplicated are project helpers defined
# outside this section.
df_ClientOps_doublons["Titulaire"] = df_ClientOps_doublons["Libellé_ClientOps"].apply(get_words_only)
df_ClientOps_doublons["Titulaire_clean"] = df_ClientOps_doublons["Titulaire"].apply(clean_motif)

df_ClientOps_doublons["Titulaire_clean"] = df_ClientOps_doublons["Titulaire_clean"].apply(clean_name)

# remove_de is applied five times in a row
# (presumably it strips one occurrence per pass — confirm).
for i in range(0,5): 
    df_ClientOps_doublons["Titulaire_clean"] = df_ClientOps_doublons["Titulaire_clean"].apply(remove_de)
df_ClientOps_doublons["Titulaire_clean"]=df_ClientOps_doublons["Titulaire_clean"].apply(remove_duplicated)

In [None]:
### Match the duplicates
# The same-date/same-amount ClientOps rows are matched against all transfers
# (regular ones plus duplicated ones) with the project helper merge_with_duplicates.
df_virs_concat2 = pd.concat([df_virements2,df_vir_doublons2])
df_match = merge_with_duplicates(df_virs_concat2,df_ClientOps_doublons,"Valeur_ClientOps")
df_match = df_match[['Id sys', 'Id sys_ClientOps', 'Libellé_ClientOps', 'Valeur_ClientOps',"Fonds",
                   'Crédit_ClientOps', "Référence de l'ordre","Référence de l'ordre_origin", 'Note']]

In [None]:
### Match on the Valeur_ClientOps column (date)
# Only the ClientOps rows outside the duplicate groups; exact match on value
# date, amount and fund.
df_ClientOps_sansdoublon = df_ClientOps_initial[~df_ClientOps_initial["Id sys_ClientOps"].isin(list(df_ClientOps_doublons["Id sys_ClientOps"]))]
df_ClientOps2_1 = df_virements2[["Id sys","Valeur","Crédit","Fonds"]].merge(df_ClientOps_sansdoublon, 
                                left_on = ["Valeur","Crédit","Fonds"],\
                              right_on = ["Valeur_ClientOps","Crédit_ClientOps","Fonds_ClientOps"]
                              ).drop(columns=["Valeur","Crédit","Valeur_ClientOps_2"])

In [None]:
### Match on the Valeur_ClientOps_2 column (dates swapped)
# Second attempt, restricted to the transfers and ClientOps rows that the first
# match left over, using the other reading of the date.
df_virements_restant2 = df_virements2[~df_virements2["Id sys"].isin(df_ClientOps2_1["Id sys"])]
df_ClientOps_restant2 = df_ClientOps_sansdoublon[~df_ClientOps_sansdoublon["Id sys_ClientOps"].isin(df_ClientOps2_1["Id sys_ClientOps"])]
df_ClientOps2_2 = df_virements_restant2[["Id sys","Valeur","Crédit","Fonds"]].merge(df_ClientOps_restant2, 
                                left_on = ["Valeur","Crédit","Fonds"],\
                              right_on = ["Valeur_ClientOps_2","Crédit_ClientOps","Fonds_ClientOps"]
                              ).drop(columns=["Valeur","Crédit"])
# The date that actually matched becomes the single Valeur_ClientOps column.
df_ClientOps2_2 = df_ClientOps2_2.drop(columns = "Valeur_ClientOps").rename(columns={"Valeur_ClientOps_2":"Valeur_ClientOps"})

In [None]:
# All matched rows: first date reading, second date reading, duplicate groups.
df_ClientOps2 = pd.concat([df_ClientOps2_1,df_ClientOps2_2,df_match])

In [None]:
# Duplicate-group rows that did not get a match.
df_ClientOps_doublons = df_ClientOps_doublons[~df_ClientOps_doublons["Id sys_ClientOps"].isin(df_ClientOps2["Id sys_ClientOps"])]

# NOTE: list_matched contains the matched ids AND the unmatched duplicate ids,
# so df_ClientOps_restant2 is the unmatched rows that are not in a duplicate group.
list_matched = list(df_ClientOps2["Id sys_ClientOps"]) +  list(df_ClientOps_doublons["Id sys_ClientOps"])
df_ClientOps_restant2 = df_ClientOps_initial[~df_ClientOps_initial["Id sys_ClientOps"].isin(list_matched)]

In [None]:
# Keep only the identifiers, the order references and the fund.
df_ClientOps2 = df_ClientOps2[['Id sys', 'Id sys_ClientOps',"Référence de l'ordre","Référence de l'ordre_origin",'Fonds']]

In [None]:
# One row per (ClientOps id, order reference).
df_ClientOps2 = df_ClientOps2.drop_duplicates(subset=["Id sys_ClientOps","Référence de l'ordre"])

In [None]:
# Control: number of rows kept for the period before 06/2021.
len(df_ClientOps2)

### Concat

In [None]:
# Stack both periods (since 06/2021 first, then before 06/2021).
df_ClientOps = pd.concat([df_ClientOps1,df_ClientOps2])

In [None]:
# Export the concatenated ClientOps follow-up.
df_ClientOps.to_excel(folder_path + "/transformed_data/3. ClientOps/Suivi ClientOps_concaténé.xlsx",sheet_name="ClientOps",index=False)