In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

from imblearn.over_sampling import SMOTE

from imblearn.under_sampling import TomekLinks

In [2]:
data = pd.read_csv('datas.csv', sep=';')
data.head()

Unnamed: 0,transactionId,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,0,1,PAYMENT,983964,C1231006815,170136,16029636,M1979787155,0,0,0
1,1,1,PAYMENT,186428,C1666544295,21249,1938472,M2044282225,0,0,0
2,2,1,TRANSFER,181,C1305486145,181,0,C553264065,0,0,1
3,3,1,CASH_OUT,181,C840083671,181,0,C38997010,21182,0,1
4,4,1,PAYMENT,1166814,C2048537720,41554,2988586,M1230701703,0,0,0


In [3]:
data[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']] = data[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']].replace({',': '.'}, regex=True)
data[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']] = data[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']].astype(float)

In [4]:
data = data.drop(data[['nameOrig', 'step', 'newbalanceOrig', 'oldbalanceDest', 'transactionId', 'nameDest']], axis=1)

In [5]:
encodeur = OneHotEncoder(sparse_output=False)

data_to_encode = data[['type']]

encoded_data = encodeur.fit_transform(data_to_encode)

encoded_df = pd.DataFrame(encoded_data, columns=encodeur.get_feature_names_out(['type']))

data = data.drop(columns=['type'])
data = pd.concat([data, encoded_df], axis=1)



In [6]:
data.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,9839.64,170136.0,0.0,0,0.0,0.0,0.0,1.0,0.0
1,1864.28,21249.0,0.0,0,0.0,0.0,0.0,1.0,0.0
2,181.0,181.0,0.0,1,0.0,0.0,0.0,0.0,1.0
3,181.0,181.0,0.0,1,0.0,1.0,0.0,0.0,0.0
4,11668.14,41554.0,0.0,0,0.0,0.0,0.0,1.0,0.0


In [7]:
data['isFraud'].value_counts()

isFraud
0    1000000
1       8213
Name: count, dtype: int64

In [8]:
scaler = MinMaxScaler()

data_scaler = data[['amount', 'oldbalanceOrg', 'newbalanceDest']]

scaled_data = scaler.fit_transform(data_scaler)

scaled_df = pd.DataFrame(scaled_data, columns=['amount', 'oldbalanceOrg', 'newbalanceDest'])

data[['amount', 'oldbalanceOrg', 'newbalanceDest']] = scaled_df


In [9]:
data.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,0.000984,0.002855,0.0,0,0.0,0.0,0.0,1.0,0.0
1,0.000186,0.000357,0.0,0,0.0,0.0,0.0,1.0,0.0
2,1.8e-05,3e-06,0.0,1,0.0,0.0,0.0,0.0,1.0
3,1.8e-05,3e-06,0.0,1,0.0,1.0,0.0,0.0,0.0
4,0.001167,0.000697,0.0,0,0.0,0.0,0.0,1.0,0.0


In [10]:
from sklearn.model_selection import train_test_split

x = data[['amount', 'oldbalanceOrg', 'newbalanceDest', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT',	'type_PAYMENT', 'type_TRANSFER']]
y = data['isFraud']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# // SMOTE

In [11]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_smote_train, y_smote_train = smote.fit_resample(x_train, y_train)
x_smote_test, y_smote_test = smote.fit_resample(x_test, y_test)

In [12]:
x_smote_train.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,0.003763,0.358146,0.000134,1.0,0.0,0.0,0.0,0.0
1,0.00645,0.177587,0.001476,1.0,0.0,0.0,0.0,0.0
2,0.025567,0.0,0.043822,0.0,1.0,0.0,0.0,0.0
3,0.045541,0.000345,0.001556,0.0,1.0,0.0,0.0,0.0
4,0.000586,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
x_smote_train.shape[0]

1599878

In [15]:
y_smote_train.value_counts()

isFraud
0    799939
1    799939
Name: count, dtype: int64

In [16]:
x_smote_test.shape[0]

400122

In [17]:
y_smote_test.value_counts()

isFraud
0    200061
1    200061
Name: count, dtype: int64

In [44]:
# x_smote_train.to_csv('x_smote_train.csv', index=False, sep=';')
# y_smote_train.to_csv('y_smote_train.csv', index=False, sep=';')
# x_test.to_csv('x_smote_test.csv', index=False, sep=';')
# y_test.to_csv('y_smote_test.csv', index=False, sep=';')

# TOMEK LINKS

In [18]:
tomek = TomekLinks()

x_smote_tomek_train, y_smote_tomek_train = tomek.fit_resample(x_smote_train, y_smote_train)
x_smote_tomek_test, y_smote_tomek_test = tomek.fit_resample(x_smote_test, y_smote_test)

In [19]:
x_smote_tomek_train.shape[0]

1599477

In [20]:
y_smote_tomek_train.value_counts()

isFraud
0    799939
1    799538
Name: count, dtype: int64

In [22]:
x_smote_tomek_test.shape[0]

399994

In [23]:
y_smote_tomek_test.value_counts()

isFraud
0    200061
1    199933
Name: count, dtype: int64

In [24]:
x_smote_tomek_train.to_csv('x_smote_tomek_train.csv', index=False, sep=';')
y_smote_tomek_train.to_csv('y_smote_tomek_train.csv', index=False, sep=';')
x_smote_tomek_test.to_csv('x_smote_tomek_test.csv', index=False, sep=';')
y_smote_tomek_test.to_csv('y_smote_tomek_test.csv', index=False, sep=';')

# // NEAR MISS

In [16]:
from imblearn.under_sampling import NearMiss

near_miss = NearMiss()

x_nearmiss_train, y_nearmiss_train = near_miss.fit_resample(x_train, y_train)

In [17]:
x_nearmiss_train.shape

(13262, 8)

In [18]:
y_nearmiss_train.value_counts()

isFraud
0    6631
1    6631
Name: count, dtype: int64

In [19]:
x_nearmiss_train.to_csv('x_nearmiss_train.csv', index=False, sep=';')
y_nearmiss_train.to_csv('y_nearmiss_train.csv', index=False, sep=';')

# // SMOTEENN

In [21]:
from imblearn.combine import SMOTEENN

smoteenn = SMOTEENN()

x_train_smoteenn, y_train_smoteenn = smoteenn.fit_resample(x_train, y_train)
x_test_smoteenn, y_test_smoteenn = smoteenn.fit_resample(x_test, y_test)

KeyboardInterrupt: 

In [21]:
x_train_smoteenn.shape[0]

1589392

In [22]:
y_train_smoteenn.value_counts()

isFraud
1    797616
0    791776
Name: count, dtype: int64

In [None]:
tomek = TomekLinks()

x_smoteenn_tomek_train, y_smoteenn_tomek_train = tomek.fit_resample(x_smoteenn_train, y_smoteenn_train)
x_smoteenn_tomek_test, y_smoteenn_tomek_test = tomek.fit_resample(x_smoteenn_test, y_smoteenn_test)

In [23]:
x_smoteenn_tomek_train.to_csv('x_smoteenn_tomek_train.csv', index=False, sep=';')
y_smoteenn_tomek_train.to_csv('y_smoteenn_tomek_train.csv', index=False, sep=';')
x_smoteenn_tomek_test.to_csv('x_smoteenn_tomek_test.csv', index=False, sep=';')
y_smoteenn_tomek_test.to_csv('y_smoteenn_tomek_test.csv', index=False, sep=';')