In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load train.csv; the file is parsed with ';' as the field separator.
data = pd.read_csv("train.csv", sep=";")
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,103822,10,CASH_OUT,56957.7,C847783725,5090.0,0.0,C1980189656,715016.0,1619320.0,0
1,218494,13,CASH_IN,88956.4,C1602372431,25836700.0,25925600.0,C1790096045,106039.0,17082.8,0
2,31215,8,CASH_IN,106888.0,C377554582,7640700.0,7747590.0,C796028148,642962.0,536074.0,0
3,634387,35,PAYMENT,10239.3,C826339167,12880.4,2641.17,M635490610,0.0,0.0,0
4,318808,16,PAYMENT,15767.4,C1112957158,107767.0,91999.6,M313416684,0.0,0.0,0


In [None]:
# NOTE(review): `dropna` is only referenced here, never called (no parentheses),
# so this cell neither drops nor reports missing values — confirm whether
# `data.dropna()` (or a NaN count) was intended.
data.dropna

In [3]:
# Count how often each transaction `type` occurs within each `isFraud` class.
data.groupby("isFraud")["type"].value_counts()

isFraud  type    
0        CASH_OUT    355205
         PAYMENT     323381
         CASH_IN     214501
         TRANSFER     80564
         DEBIT         6348
1        CASH_OUT      4040
         TRANSFER      4009
Name: count, dtype: int64

In [4]:
# Fraudulent CASH_OUT/TRANSFER rows where the origin account had funds, ends
# below 1, and the destination ends with a positive balance.
data.loc[
    data["type"].isin(["CASH_OUT", "TRANSFER"])
    & data["oldbalanceOrg"].gt(0)
    & data["newbalanceOrig"].lt(1)
    & data["newbalanceDest"].gt(0)
    & data["isFraud"].eq(1)
]

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
98,4785024,337,CASH_OUT,658637.0,C1494570595,658637.0,0.0,C975876326,0.0,658637.0,1
1142,6265002,613,CASH_OUT,86002.1,C1363187770,86002.1,0.0,C1320785064,0.0,86002.1,1
1374,6336148,691,CASH_OUT,32935.2,C2017409191,32935.2,0.0,C1772041191,0.0,32935.2,1
2015,5188061,368,CASH_OUT,199414.0,C1712549867,199414.0,0.0,C359165677,0.0,199414.0,1
2106,6221527,590,CASH_OUT,802484.0,C1430477508,802484.0,0.0,C1344272821,0.0,802484.0,1
...,...,...,...,...,...,...,...,...,...,...,...
986282,6273169,633,CASH_OUT,125107.0,C1400028457,125107.0,0.0,C1683701941,0.0,125107.0,1
986462,1932174,171,CASH_OUT,6501300.0,C1026263737,6501300.0,0.0,C45760834,0.0,6501300.0,1
986988,6175272,564,CASH_OUT,1650860.0,C138899312,1650860.0,0.0,C603288664,1295490.0,2946350.0,1
987845,6194207,572,CASH_OUT,161017.0,C1403093345,161017.0,0.0,C1655653682,92806.2,253823.0,1


In [5]:
# NOTE(review): the result of this first expression is neither assigned nor
# displayed (a cell only echoes its last expression), so it has no visible effect.
data["newbalanceOrig"].sort_values().unique().tolist()
# Show the rows whose `newbalanceOrig` is exactly 0.73.
data[data["newbalanceOrig"] == 0.73]

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
805936,329006,16,CASH_OUT,10620.3,C2084013643,10621.0,0.73,C1102995953,303304.0,1395910.0,0


In [6]:
from sklearn.preprocessing import FunctionTransformer # pour transformer les fonctions

# feature engineering

In [7]:
def difference(df):
    """Compute the balance change of the destination and origin accounts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``newbalanceDest``, ``oldbalanceDest``,
        ``newbalanceOrig`` and ``oldbalanceOrg``.

    Returns
    -------
    pandas.DataFrame
        Columns ``differenceDest`` and ``differenceOrig`` (new balance minus
        old balance), indexed like ``df``.
    """
    # Build a fresh frame instead of adding columns to ``df``: a transformer
    # should not modify its input, and writing into a slice handed over by a
    # pipeline triggers chained-assignment warnings.
    return pd.DataFrame(
        {
            "differenceDest": df["newbalanceDest"] - df["oldbalanceDest"],
            "differenceOrig": df["newbalanceOrig"] - df["oldbalanceOrg"],
        }
    )

# Wrap `difference` in a FunctionTransformer so it can be used as a scikit-learn step.
differences = FunctionTransformer(difference, validate=False)

In [8]:
def ratio(df):
    """Compute the new/old balance ratio for the destination and origin accounts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``newbalanceDest``, ``oldbalanceDest``,
        ``newbalanceOrig`` and ``oldbalanceOrg``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ratioDest`` and ``ratioOrig``, indexed like ``df``.
    """
    # Small offset added to the denominators to avoid division by zero.
    eps = 1e-6
    # Return a new frame rather than writing the ratios into the caller's ``df``.
    return pd.DataFrame(
        {
            "ratioDest": df["newbalanceDest"] / (df["oldbalanceDest"] + eps),
            "ratioOrig": df["newbalanceOrig"] / (df["oldbalanceOrg"] + eps),
        }
    )

# Wrap `ratio` in a FunctionTransformer so it can be used as a scikit-learn step.
ratios = FunctionTransformer(ratio, validate=False)

In [9]:
def montant(df):
    """Compute the absolute gap between destination and origin balance changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``newbalanceDest``, ``oldbalanceDest``,
        ``newbalanceOrig`` and ``oldbalanceOrg``.

    Returns
    -------
    pandas.DataFrame
        Single column ``transaction_amount`` equal to
        ``|(newbalanceDest - oldbalanceDest) - (newbalanceOrig - oldbalanceOrg)|``,
        indexed like ``df``.
    """
    dest_change = df["newbalanceDest"] - df["oldbalanceDest"]
    orig_change = df["newbalanceOrig"] - df["oldbalanceOrg"]
    # Return a new one-column frame; the caller's ``df`` is left untouched.
    return (dest_change - orig_change).abs().to_frame("transaction_amount")

# Wrap `montant` in a FunctionTransformer so it can be used as a scikit-learn step.
montants = FunctionTransformer(montant,validate=False)

In [10]:
def indicateur(df):
    """Flag transactions that match a rule-based suspicious pattern.

    A row is flagged (1) when all of the following hold, otherwise 0:
    ``type`` is ``CASH_OUT`` or ``TRANSFER``, ``oldbalanceOrg > 0``,
    ``newbalanceOrig < 1`` and ``newbalanceDest > 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``type``, ``oldbalanceOrg``, ``newbalanceOrig`` and
        ``newbalanceDest``.

    Returns
    -------
    pandas.DataFrame
        Single integer column ``suspicious_flag``, indexed like ``df``.
    """
    # NOTE(review): the ``< 1`` threshold presumably tolerates a small leftover
    # on an emptied origin account — confirm the intended cut-off.
    flag = (
        df["type"].isin(["CASH_OUT", "TRANSFER"])
        & (df["oldbalanceOrg"] > 0)
        & (df["newbalanceOrig"] < 1)
        & (df["newbalanceDest"] > 0)
    )
    # Return a new one-column frame; the caller's ``df`` is left untouched.
    return flag.astype(int).to_frame("suspicious_flag")

# Wrap `indicateur` in a FunctionTransformer so it can be used as a scikit-learn step.
indicateurs = FunctionTransformer(indicateur,validate=False)

In [11]:
def balance_diff(df):
    """Compute the normalised gap between origin and destination old balances.

    The value is ``(oldbalanceOrg - oldbalanceDest) /
    (oldbalanceOrg + oldbalanceDest + 1e-6)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``oldbalanceOrg`` and ``oldbalanceDest``.

    Returns
    -------
    pandas.DataFrame
        Single column ``balance_diff``, indexed like ``df``.
    """
    # Small offset added to the denominator to avoid division by zero.
    total = df["oldbalanceOrg"] + df["oldbalanceDest"] + 1e-6
    gap = df["oldbalanceOrg"] - df["oldbalanceDest"]
    # Return a new one-column frame; the caller's ``df`` is left untouched.
    return (gap / total).to_frame("balance_diff")

# Wrap `balance_diff` in a FunctionTransformer so it can be used as a scikit-learn step.
balance_diffs = FunctionTransformer(balance_diff,validate=False)

# Fonction pour supprimer des colonnes

In [37]:
def drop_colonne(df, colonne_drop):
    """Return ``df`` without the requested columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    colonne_drop : str or iterable of column labels
        Column(s) to remove. Labels that are not present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame without the matching columns.
    """
    # A bare string would otherwise be iterated character by character and
    # drop single-letter columns instead of the named one.
    if isinstance(colonne_drop, str):
        colonne_drop = [colonne_drop]
    present = [col for col in colonne_drop if col in df.columns]
    return df.drop(columns=present)



# NOTE(review): `drop_colonne` requires a `colonne_drop` argument, but no `kw_args`
# is passed here; unless it is set elsewhere, transforming would call
# `drop_colonne(X)` without it — confirm.
drop_colunms = FunctionTransformer(drop_colonne, validate=False) # column-dropping transformer

# Fonctions pour traiter (imputer ou supprimer) les NaN, s'il y en a

In [38]:
from sklearn.impute import SimpleImputer
# Module-level imputer configured to replace missing values with the most frequent value.
simply = SimpleImputer(strategy="most_frequent")

def drop_nan_numerique(df, plan):
    """Handle missing values according to ``plan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    plan : str
        ``"mean"`` or ``"median"`` fills NaNs of float/integer columns with
        that statistic of the column; other columns are left as they are.
        ``"drop"`` removes every row that holds a NaN in any column.
        Any other value leaves the data unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the strategy applied.
    """
    if plan == "drop":
        # A single pass is enough: dropna removes every row holding a NaN.
        # NOTE(review): dropping rows changes the row count, which can break
        # alignment with a separately held target — confirm this is intended.
        return df.dropna(axis=0)

    result = df.copy()
    if plan not in ("mean", "median"):
        return result

    for col in result.columns:
        column = result[col]
        if not column.isna().any():
            continue
        # Dtype helpers cover every float/integer width (float32, Int64, ...),
        # unlike a comparison against the Python ``float``/``int`` types.
        is_integer = pd.api.types.is_integer_dtype(column)
        if not (is_integer or pd.api.types.is_float_dtype(column)):
            continue
        if is_integer:
            # Integer columns with missing values get a fractional fill value,
            # so switch to float before filling.
            column = column.astype("float64")
        fill_value = column.mean() if plan == "mean" else column.median()
        # Assign the filled column back: ``df[col].fillna(..., inplace=True)``
        # acts on a temporary object and does not reliably update the frame.
        result[col] = column.fillna(fill_value)
    return result


def drop_nan_categorique(df):
    """Fill missing values of categorical columns with their most frequent value.

    Object, string and category columns are processed; every other column is
    left as it is. A column whose values are all missing is left unchanged,
    since it has no most frequent value to fill with.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the categorical gaps filled.
    """
    result = df.copy()
    for col in result.columns:
        column = result[col]
        dtype = column.dtype
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if not is_categorical or not column.isna().any():
            continue
        # ``mode`` ignores missing values; the first entry is used when
        # several values are tied.
        modes = column.mode(dropna=True)
        if modes.empty:
            continue
        result[col] = column.fillna(modes.iloc[0])
    return result



# Imputation strategy per column family.
# 'numeric' accepts "mean", "median" or "drop".
plans = dict(numeric='mean', categorical='most_frequent')

def imputer_transformer(df):
    """Apply the numeric missing-value plan, then the categorical imputation.

    The numeric strategy is read from ``plans['numeric']``; the frame returned
    by the numeric step is passed on to ``drop_nan_categorique``.
    """
    numeric_plan = plans['numeric']
    return drop_nan_categorique(drop_nan_numerique(df, numeric_plan))


# Wrap `imputer_transformer` in a FunctionTransformer so it can be used as a scikit-learn step.
transformer = FunctionTransformer(imputer_transformer, validate=False)

In [12]:
from sklearn.preprocessing import OneHotEncoder # encoder les categoriques
from sklearn.preprocessing import StandardScaler # normaliser les données numerique