In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

In [75]:
# Load the training transactions (semicolon-delimited CSV) and preview the first rows.
data = pd.read_csv("train.csv", delimiter=";")
data.head()

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,103822,10,CASH_OUT,56957.7,C847783725,5090.0,0.0,C1980189656,715016.0,1619320.0,0
1,218494,13,CASH_IN,88956.4,C1602372431,25836700.0,25925600.0,C1790096045,106039.0,17082.8,0
2,31215,8,CASH_IN,106888.0,C377554582,7640700.0,7747590.0,C796028148,642962.0,536074.0,0
3,634387,35,PAYMENT,10239.3,C826339167,12880.4,2641.17,M635490610,0.0,0.0,0
4,318808,16,PAYMENT,15767.4,C1112957158,107767.0,91999.6,M313416684,0.0,0.0,0


In [76]:
# Column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988048 entries, 0 to 988047
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   transactionId   988048 non-null  int64  
 1   step            988048 non-null  int64  
 2   type            988048 non-null  object 
 3   amount          988048 non-null  float64
 4   nameOrig        988048 non-null  object 
 5   oldbalanceOrg   988048 non-null  float64
 6   newbalanceOrig  988048 non-null  float64
 7   nameDest        988048 non-null  object 
 8   oldbalanceDest  988048 non-null  float64
 9   newbalanceDest  988048 non-null  float64
 10  isFraud         988048 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 82.9+ MB


In [77]:
def indicateur(df):
    """Return a copy of ``df`` with a binary ``suspicious_flag`` column added.

    The flag is 1 when all of the following hold for a row, else 0:

    * ``type`` is ``"CASH_OUT"`` or ``"TRANSFER"``
    * ``oldbalanceOrg`` is greater than 0
    * ``newbalanceOrig`` is less than 1
    * ``newbalanceDest`` is greater than 0

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched so that running the
        pipeline does not silently add a column to the caller's data.
    """
    type_matches = df["type"].isin(["CASH_OUT", "TRANSFER"])
    origin_had_balance = df["oldbalanceOrg"] > 0
    origin_below_one = df["newbalanceOrig"] < 1
    dest_positive = df["newbalanceDest"] > 0

    flag = (type_matches & origin_had_balance & origin_below_one & dest_positive).astype(int)

    # ``assign`` builds a new frame instead of writing into the caller's object.
    return df.assign(suspicious_flag=flag)

# Wrap ``indicateur`` so it can be used as a pipeline step; input validation is disabled.
indicateurs = FunctionTransformer(func=indicateur, validate=False)

In [78]:
def drop_colonne(df):
    """Return a copy of ``df`` without a fixed list of columns.

    The columns removed are ``transactionId``, ``step``, ``nameOrig``,
    ``nameDest``, ``oldbalanceDest`` and ``newbalanceOrig``. Columns from
    that list which are absent from ``df`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining columns; the input is not modified.
    """
    colonne_drop = ["transactionId", "step", "nameOrig", "nameDest", "oldbalanceDest", "newbalanceOrig"]
    # ``drop(..., inplace=True)`` returns None, so the previous version handed
    # None to the next pipeline step. Return the new frame instead;
    # ``errors="ignore"`` skips listed columns that are not present.
    return df.drop(columns=colonne_drop, errors="ignore")
# Wrap ``drop_colonne`` so it can be used as a pipeline step; input validation is disabled.
drop_colunms = FunctionTransformer(func=drop_colonne, validate=False)

In [79]:
def _build_preprocessor():
    """Create an unfitted ColumnTransformer for the feature frame.

    Numeric columns are scaled with ``MinMaxScaler`` and object-dtype columns
    are one-hot encoded (dense output). Columns are selected by dtype when
    the transformer is fitted, not when it is built.
    """
    return ColumnTransformer([
        ("numerical", MinMaxScaler(), make_column_selector(dtype_include=np.number)),
        ("categorical", OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
    ])


def transformer(df):
    """Fit a fresh preprocessor on ``df`` and return the transformed array.

    This is a one-shot helper: every call re-learns the scaling ranges and
    the category list from ``df`` itself, so it must not be used to transform
    data that has to be encoded consistently with a training set.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    numpy.ndarray
        Scaled numeric columns followed by the one-hot encoded columns.
    """
    return _build_preprocessor().fit_transform(df)


# Pipeline step. It is a real (stateful) ColumnTransformer rather than a
# FunctionTransformer around ``transformer``: FunctionTransformer is stateless,
# so the scaler/encoder would be re-fitted on whatever is passed at predict
# time, giving different scaling and possibly a different number of columns
# than the model was trained on.
transformers = _build_preprocessor()

In [80]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# Random forest classifier with a fixed seed (random_state=0).
model = RandomForestClassifier(random_state=0)

In [81]:
# Separate the target column from the feature columns.
y = data["isFraud"]
X = data.drop(columns="isFraud")

In [82]:
# Number of rows per target value.
y.value_counts()

isFraud
0    979999
1      8049
Name: count, dtype: int64

In [83]:
from sklearn.model_selection import train_test_split
# Stratify on the target: the positive class is under 1% of the rows
# (8049 of 988048), so an unstratified split can leave the two parts with
# noticeably different fraud rates.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [84]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as pi

In [93]:
# End-to-end pipeline: add the flag column, drop columns, scale/encode, classify.
model_final = Pipeline(steps=[
    ("indicateur", indicateurs),
    ("drop_colonnes", drop_colunms),
    ("transformers", transformers),
    ("model", model),
])


In [None]:
# The final step is a supervised classifier, so ``fit`` needs the target as
# well as the features. Fit on the training split so X_test / y_test stay
# unseen for evaluation.
model_final.fit(X_train, y_train)

In [157]:
# Resolve the file from the current user's Downloads folder instead of
# hard-coding one user's absolute Windows path.
# NOTE(review): assumes the file lives in ~/Downloads on the machine running this cell.
df = pd.read_csv(Path.home() / "Downloads" / "data_30000.csv", sep=";")

In [158]:
# Preview the first rows of the loaded sample.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,transactionId
0,403,PAYMENT,4478.36,C839471019,0.0,0.0,M1816671010,0.0,0.0,0,5904099
1,234,CASH_IN,376200.73,C1170853173,3809732.32,4185933.05,C398294488,571520.05,195319.32,0,3053641
2,282,PAYMENT,9855.71,C199562839,119286.0,109430.29,M841433582,0.0,0.0,0,3842444
3,328,CASH_IN,265209.16,C1887469617,123554.0,388763.16,C283241429,1327142.29,1061933.13,0,4596599
4,238,PAYMENT,15447.88,C76290743,7804.0,0.0,M685801254,0.0,0.0,0,3170119


In [159]:
# Number of rows per target value in this sample.
df["isFraud"].value_counts()

isFraud
0    29969
1       31
Name: count, dtype: int64

In [160]:
# Rebind X / y to the features and target of ``df``.
# Note: this replaces the X / y that were built from ``data`` in an earlier cell.
X = df.drop(columns="isFraud")
y = df["isFraud"]

In [162]:
from imblearn.over_sampling import RandomOverSampler

# Oversample class 1 up to the number of class-0 rows. The target count is
# computed from ``y`` rather than hard-coded, and ``random_state`` is fixed so
# that re-running the cell duplicates the same rows.
n_majority = int((y == 0).sum())
over = RandomOverSampler(sampling_strategy={1: n_majority}, random_state=0)
X_res, y_res = over.fit_resample(X, y)

In [164]:
# Put the resampled features and target back side by side in one frame.
# Note: this rebinds ``data``, which previously held the frame read from train.csv.
data = pd.concat(objs=[X_res, y_res], axis="columns")

In [166]:
# Write the resampled frame to a semicolon-delimited CSV without the index.
# NOTE(review): the file name mentions SMOTE, but the data was produced with
# RandomOverSampler — consider renaming the file if nothing else depends on it.
data.to_csv("data_modifier_par_smote.csv", sep=';', index=False)

In [5]:
# Forward slashes work on Windows, Linux and macOS; the previous backslash
# separator only resolved on Windows.
data = pd.read_csv("IA/test.csv", sep=";")

In [7]:
# Number of rows per target value in the test file.
data["isFraud"].value_counts()

isFraud
0    20001
1      164
Name: count, dtype: int64