In [3]:
import pandas as pd
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.preprocessing import StandardScaler
!pip install category_encoders
import category_encoders as ce

class Data():
    def __init__(self, path) -> None:
        self.df = pd.read_csv(path)
        self.X = []
        self.y = []

    def cleaner(self, df):
        df['Hour'] = df['Hour'].astype('float')

        # Scale the "Amount" column
        scaler = StandardScaler()
        df['Amount'] = scaler.fit_transform(df[['Amount']])

        # Binary encoding for categorical variables
        cat_col = ['Use Chip', 'Day of Week']
        for col in cat_col:
            if col in df.columns:
                be = ce.BinaryEncoder(drop_invariant=False)
                enc_df = pd.DataFrame(be.fit_transform(df[col]), dtype='int8')
                df = pd.concat([df, enc_df], axis=1)
                df.drop([col], axis=1, inplace=True)

        for col in df.columns:
            df[col] = df[col].astype(float)
        return df

    def clean_data(self):
        self.df["Amount"] = self.df["Amount"].str.replace("$","").astype(float)
        self.df["Hour"] = self.df["Time"].str [0:2]
        self.df["Minute"] = self.df["Time"].str [3:5]
        self.df = self.df.drop(['Time'],axis=1)
        self.df["Is Fraud?"] = self.df["Is Fraud?"].apply(lambda x: 1 if x == 'Yes' else 0)
        self.df['Date'] = pd.to_datetime(self.df[['Year', 'Month', 'Day']])

        # Extract day of the week and map it to its name
        days = {0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}
        self.df['Day of Week'] = self.df['Date'].dt.dayofweek.map(days)

        columns_to_select = ['Year', 'Day of Week', 'Hour', 'Amount', 'Use Chip', 'Merchant Name', 'MCC', 'Is Fraud?']
        self.df = self.df[columns_to_select]
        preprocessing_pipeline = Pipeline([
            ('cleaning', FunctionTransformer(self.cleaner, validate=False)),
        ], verbose=True)

        self.df = preprocessing_pipeline.fit_transform(self.df)


Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score

class RandomForest():
    def __init__(self, n_estimators = 100, random_state = 42) -> None:
        self.n_estimator = n_estimators
        self.random_state = random_state
        self.model = RandomForestClassifier(n_estimators=self.n_estimator, random_state=self.random_state)
        self.x_train = []
        self.y_train = []

    def fit(self, x_train, y_train):
        self.model.fit(x_train,y_train)

    def predict(self, X):
        return self.model.predict(X)

    def test(self, x_test, y_test):
        y_pred = self.model.predict(x_test)

        accuracy = accuracy_score(y_test, y_pred)
        classification_report_result = classification_report(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        return accuracy, classification_report_result, f1

    def param_optim(self, X, Y, n_estim = [50, 100, 200, 500], max_depth=[None, 10, 20, 30], min_samples_split=[2, 5, 10], min_samples_leaf=[1, 2, 4]):
        param_grid = {
            'n_estimators': n_estim,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf
        }

        self.x_train = X
        self.y_train = Y
        new_model = RandomForestClassifier(random_state=100)

        grid_search = GridSearchCV(new_model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X, Y)

        self.model = RandomForestClassifier(
                                            n_estimators=grid_search.best_params_['n_estimators'],
                                            max_depth=grid_search.best_params_['max_depth'],
                                            min_samples_split=grid_search.best_params_['min_samples_split'],
                                            min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                            random_state=self.random_state)




In [None]:
from google.colab import files
files.upload()

In [6]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!rm kaggle.json

In [7]:
!kaggle datasets download -d ealtman2019/credit-card-transactions

Downloading credit-card-transactions.zip to /content
 99% 262M/263M [00:01<00:00, 181MB/s]
100% 263M/263M [00:01<00:00, 174MB/s]


In [8]:
! mkdir train
! unzip credit-card-transactions.zip -d train

Archive:  credit-card-transactions.zip
  inflating: train/User0_credit_card_transactions.csv  
  inflating: train/credit_card_transactions-ibm_v2.csv  
  inflating: train/sd254_cards.csv   
  inflating: train/sd254_users.csv   


In [12]:
files.upload()

Saving knn_model.zip to knn_model.zip


In [14]:
! mkdir models
! unzip knn_model.zip -d models
! rm knn_model.zip

mkdir: cannot create directory ‘models’: File exists
unzip:  cannot find or open knn_model.zip, knn_model.zip.zip or knn_model.zip.ZIP.
rm: cannot remove 'knn_model.zip': No such file or directory


In [16]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import TomekLinks


dataFrame = Data('/content/train/credit_card_transactions-ibm_v2.csv')
dataFrame.clean_data()

X = dataFrame.df.drop(columns=['Is Fraud?'])
y = dataFrame.df['Is Fraud?']

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1613)

  self.df["Amount"] = self.df["Amount"].str.replace("$","").astype(float)


[Pipeline] .......... (step 1 of 1) Processing cleaning, total=  42.5s


In [None]:
tl = TomekLinks(sampling_strategy='auto', n_jobs=-1)
X_resampled, y_resampled = tl.fit_resample(X_train, y_train)

In [None]:
rf = RandomForest()


rf.fit(X_resampled, y_resampled)

acc, crr, f1 = rf.test(X_test, y_test)

print(crr)

6
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   7307107
         1.0       0.95      0.55      0.69      8963

    accuracy                           1.00   7316070
   macro avg       0.97      0.77      0.85   7316070
weighted avg       1.00      1.00      1.00   7316070



In [None]:
new_test_df = dataFrame.df[dataFrame.df['Is Fraud?'] == 1]

new_test_x = new_test_df.drop(columns=['Is Fraud?'])
new_test_y = new_test_df['Is Fraud?']

_, crr1 , _ = rf.test(new_test_x, new_test_y)

print(crr1)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.86      0.92     29757

    accuracy                           0.86     29757
   macro avg       0.50      0.43      0.46     29757
weighted avg       1.00      0.86      0.92     29757



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(len(X_train), len(X_resampled), sep = "\n")

X_deleted = X_train[~X_train.index.isin(X_resampled.index)]
Y_deleted = y_train[~y_train.index.isin(y_resampled.index)]
Y_deleted = Y_deleted.apply(lambda x: 2)

x_train_with_new_label = pd.concat([X_resampled, X_deleted])
y_train_with_new_label = pd.concat([y_resampled, Y_deleted])

17070830
17067434


In [None]:
from imblearn.under_sampling import RandomUnderSampler
desired_proportion = 0.2
total_samples = 40000
fraud_samples = int(total_samples * desired_proportion)

# Create RandomUnderSampler with the desired sampling strategy
rus = RandomUnderSampler(sampling_strategy={0: total_samples - fraud_samples, 1: fraud_samples, 2: int(fraud_samples/2)}, random_state=1613)
# Apply random undersampling to the original dataset
X_resampled2, y_resampled2 = rus.fit_resample(x_train_with_new_label, y_train_with_new_label)

In [None]:
new_model = RandomForest()
new_model.fit(X_resampled2, y_resampled2)

In [None]:
import joblib
file_path = "random_forest_model.joblib"
joblib.dump(new_model, file_path)

['random_forest_model.joblib']

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
desired_proportion = 0.2
total_samples = 40000
fraud_samples = int(total_samples * desired_proportion)

# Create RandomUnderSampler with the desired sampling strategy
rus = RandomUnderSampler(sampling_strategy={0: total_samples - fraud_samples, 1: fraud_samples}, random_state=1613)

# Apply random undersampling to the original dataset
X_resampled3, y_resampled3 = rus.fit_resample(X_resampled, y_resampled)
knn_model = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
knn_model.fit(X_resampled3, y_resampled3)

y_pred_knn = knn_model.predict(X_test)

print("k-Nearest Neighbors Classifier Results:")
print(classification_report(y_test, y_pred_knn))
print(confusion_matrix(y_test, y_pred_knn))

k-Nearest Neighbors Classifier Results:
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97   7307107
         1.0       0.02      0.73      0.03      8963

    accuracy                           0.95   7316070
   macro avg       0.51      0.84      0.50   7316070
weighted avg       1.00      0.95      0.97   7316070

[[6934674  372433]
 [   2403    6560]]


In [None]:
import joblib
file_path = "knn_model.joblib"
joblib.dump(knn_model, file_path)

['knn_model.joblib']

In [17]:
import joblib
rf_model_3_label = joblib.load("/content/models/random_forest_model.joblib")
knn_model = joblib.load("/content/models/knn_model.joblib")

In [20]:
def test(X, rf_model, knn_model):
    # First labels'ı predict et
    first_labels = rf_model.predict(X)

    # First labels içinde 2 olanları seç
    second_x = X[first_labels == 2]

    # Second labels'ı predict et
    second_labels = knn_model.predict(second_x)

    # First labels ve Second labels'ı birleştir
    combined_labels = pd.Series(first_labels)
    combined_labels[first_labels == 2] = second_labels

    return combined_labels

In [21]:
combined_labels_result = test(X_test, rf_model_3_label, knn_model)
print(combined_labels_result)

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
7316065    0.0
7316066    0.0
7316067    0.0
7316068    0.0
7316069    0.0
Length: 7316070, dtype: float64


In [22]:
print(classification_report(y_test, combined_labels_result))

              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99   7307107
         1.0       0.06      0.78      0.12      8963

    accuracy                           0.99   7316070
   macro avg       0.53      0.88      0.55   7316070
weighted avg       1.00      0.99      0.99   7316070



In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
