# Cette experience avec smote "auto" et avec le balance_diff

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import FunctionTransformer

In [2]:
data = pd.read_csv("train.csv", sep=";")
data.head()

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,103822,10,CASH_OUT,56957.7,C847783725,5090.0,0.0,C1980189656,715016.0,1619320.0,0
1,218494,13,CASH_IN,88956.4,C1602372431,25836700.0,25925600.0,C1790096045,106039.0,17082.8,0
2,31215,8,CASH_IN,106888.0,C377554582,7640700.0,7747590.0,C796028148,642962.0,536074.0,0
3,634387,35,PAYMENT,10239.3,C826339167,12880.4,2641.17,M635490610,0.0,0.0,0
4,318808,16,PAYMENT,15767.4,C1112957158,107767.0,91999.6,M313416684,0.0,0.0,0


# feature engineering

In [3]:
def balance_diff(df):
    df["balance_diff"] = (df["oldbalanceOrg"] - df["oldbalanceDest"]) / (df["oldbalanceOrg"] + df["oldbalanceDest"] + 1e-6) # 1e-6 pour eviter la division sur 0
    return df[["balance_diff"]]

balance_diffs = FunctionTransformer(balance_diff,validate=False)

# fonction drop columns

In [4]:
def drop_colonne(df):
    colonne_drop = ["transactionId",'nameOrig','nameDest','oldbalanceDest', 'newbalanceDest','oldbalanceOrg','newbalanceOrig']
    colonne_drop = [col for col in colonne_drop if col in df.columns]
    df =  df.drop(colonne_drop, axis=1)
    return df

drop_colunms = FunctionTransformer(drop_colonne, validate=False) # ma fonction de supprimer 

In [5]:
from sklearn.impute import SimpleImputer
simply = SimpleImputer(strategy="most_frequent")

def drop_nan_numerique(df, plan):
    for col in df.columns:
        if df[col].isna().sum() > 0:
            if plan == "mean" and df[col].dtypes in [float, int]:
                df[col].fillna(np.mean(df[col]), inplace=True)
            elif plan == "median" and df[col].dtypes in [float, int]:
                df[col].fillna(df[col].median(), inplace=True)
            elif plan == "drop":
                df.dropna(axis=0, inplace=True)
                 
    return df


def drop_nan_categorique(df):
    for col in df.columns:
        if df[col].dtypes == object:
            df[col] = simply.fit_transform(df[[col]]).ravel()
    return df

# plan  : ["mean", "median" , "drop"]
plans = {
    'numeric': ['mean',"median" , "drop"],            
    'categorical': 'most_frequent'  
}
# 0 : mean
# 1 : medien
# 2 : drop

def imputer_transformer(df):
    df = drop_nan_numerique(df, plans['numeric'][1])
    df = drop_nan_categorique(df)

    return df

In [6]:
nan_transformer = FunctionTransformer(imputer_transformer, validate=False)

# Encoder 

In [7]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown="ignore") # encoder les categorical

# normaliser les données 

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False) # normaliser les données 

# equilibrer les classe

In [9]:
from imblearn.over_sampling import SMOTE
samply = SMOTE(sampling_strategy={0:600000, 1:400000},random_state=0) 

# les models choisi 

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [11]:
from imblearn.pipeline import Pipeline as pi
from sklearn.pipeline import make_pipeline 
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
rf = RandomForestClassifier(random_state=0)
svm = SVC(random_state=0)

In [13]:
liste_models = {"RandomForestClassifier":rf,
                "SVC":svm}

# definir la target et les features

In [14]:
y = data["isFraud"]
X = data.drop("isFraud", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((790438, 10), (197610, 10), (790438,), (197610,))

In [15]:
def pipeline_avec_smote(model):
    return pi([("balance_diffs", balance_diffs),
                                  ("drop_colunms",drop_colunms),
                                  ("nan_transformer",nan_transformer),
                                  ("encoder",encoder),
                                  ("scaler",scaler),
                                  ("smote",samply),
                                  ("model",model)])


def train_model(model):
    
    final_pipeline = pipeline_avec_smote(model)
        
    final_pipeline.fit(X_train, y_train)
    y_pred = final_pipeline.predict(X_test)
    print(model.__class__.__name__)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [16]:
from tqdm import tqdm
for name, model in tqdm(liste_models.items()):
    print(name)
    train_model(model)

  0%|          | 0/2 [00:00<?, ?it/s]

RandomForestClassifier


  0%|          | 0/2 [00:00<?, ?it/s]


ValueError: With over-sampling methods, the number of samples in a class should be greater or equal to the original number of samples. Originally, there is 784020 samples and 600000 samples are asked.

# tracer la courbe d'aprentisage 

In [None]:
N, train_score, val_score = learning_curve(# model choisi ,
     X_train, y_train, cv=4, train_sizes=np.linspace(0.1,1,10))

In [None]:
plt.figure(figsize=(18,10))
plt.plot(N, train_score.mean(axis=1), label="train score")
plt.plot(N, val_score.mean(axis=1), label="val score")
plt.legend()