In [1]:
import pandas as pd
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.preprocessing import StandardScaler
!pip install category_encoders
import category_encoders as ce

class Data():
    def __init__(self, path) -> None:
        self.df = pd.read_csv(path)
        self.X = []
        self.y = []

    def cleaner(self, df):
        df['Hour'] = df['Hour'].astype('float')

        # Scale the "Amount" column
        scaler = StandardScaler()
        df['Amount'] = scaler.fit_transform(df[['Amount']])

        # Binary encoding for categorical variables
        cat_col = ['Use Chip', 'Day of Week']
        for col in cat_col:
            if col in df.columns:
                be = ce.BinaryEncoder(drop_invariant=False)
                enc_df = pd.DataFrame(be.fit_transform(df[col]), dtype='int8')
                df = pd.concat([df, enc_df], axis=1)
                df.drop([col], axis=1, inplace=True)

        for col in df.columns:
            df[col] = df[col].astype(float)
        return df

    def clean_data(self):
        self.df["Amount"] = self.df["Amount"].str.replace("$","").astype(float)
        self.df["Hour"] = self.df["Time"].str [0:2]
        self.df["Minute"] = self.df["Time"].str [3:5]
        self.df = self.df.drop(['Time'],axis=1)
        self.df["Is Fraud?"] = self.df["Is Fraud?"].apply(lambda x: 1 if x == 'Yes' else 0)
        self.df['Date'] = pd.to_datetime(self.df[['Year', 'Month', 'Day']])

        # Extract day of the week and map it to its name
        days = {0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}
        self.df['Day of Week'] = self.df['Date'].dt.dayofweek.map(days)

        columns_to_select = ['Year', 'Day of Week', 'Hour', 'Amount', 'Use Chip', 'Merchant Name', 'MCC', 'Is Fraud?']
        self.df = self.df[columns_to_select]
        preprocessing_pipeline = Pipeline([
            ('cleaning', FunctionTransformer(self.cleaner, validate=False)),
        ], verbose=True)

        self.df = preprocessing_pipeline.fit_transform(self.df)


Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score

class RandomForest():
    def __init__(self, n_estimators = 100, random_state = 42) -> None:
        self.n_estimator = n_estimators
        self.random_state = random_state
        self.model = RandomForestClassifier(n_estimators=self.n_estimator, random_state=self.random_state)
        self.x_train = []
        self.y_train = []

    def fit(self, x_train, y_train):
        self.model.fit(x_train,y_train)

    def predict(self, X):
        return self.model.predict(X)

    def test(self, x_test, y_test):
        y_pred = self.model.predict(x_test)

        accuracy = accuracy_score(y_test, y_pred)
        classification_report_result = classification_report(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        return accuracy, classification_report_result, f1

    def param_optim(self, X, Y, n_estim = [50, 100, 200, 500], max_depth=[None, 10, 20, 30], min_samples_split=[2, 5, 10], min_samples_leaf=[1, 2, 4]):
        param_grid = {
            'n_estimators': n_estim,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf
        }

        self.x_train = X
        self.y_train = Y
        new_model = RandomForestClassifier(random_state=100)

        grid_search = GridSearchCV(new_model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X, Y)

        self.model = RandomForestClassifier(
                                            n_estimators=grid_search.best_params_['n_estimators'],
                                            max_depth=grid_search.best_params_['max_depth'],
                                            min_samples_split=grid_search.best_params_['min_samples_split'],
                                            min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                            random_state=self.random_state)




In [None]:
from google.colab import files
files.upload()

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!rm kaggle.json

In [5]:
!kaggle datasets download -d ealtman2019/credit-card-transactions

Downloading credit-card-transactions.zip to /content
 99% 260M/263M [00:07<00:00, 42.1MB/s]
100% 263M/263M [00:07<00:00, 38.2MB/s]


In [6]:
! mkdir train
! unzip credit-card-transactions.zip -d train

Archive:  credit-card-transactions.zip
  inflating: train/User0_credit_card_transactions.csv  
  inflating: train/credit_card_transactions-ibm_v2.csv  
  inflating: train/sd254_cards.csv   
  inflating: train/sd254_users.csv   


In [7]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import TomekLinks


dataFrame = Data('/content/train/credit_card_transactions-ibm_v2.csv')
print("1")
dataFrame.clean_data()
print("2")

X = dataFrame.df.drop(columns=['Is Fraud?'])
y = dataFrame.df['Is Fraud?']
print("3")

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1613)

print("4")
tl = TomekLinks(sampling_strategy='auto', n_jobs=-1)
X_resampled, y_resampled = tl.fit_resample(X_train, y_train)
print("5")

rf = RandomForest()


rf.fit(X_resampled, y_resampled)
print("6")

acc, crr, f1 = rf.test(X_test, y_test)

print(crr)


1


  self.df["Amount"] = self.df["Amount"].str.replace("$","").astype(float)


[Pipeline] .......... (step 1 of 1) Processing cleaning, total=  38.9s
2
3
4
5
6
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   7307107
         1.0       0.95      0.55      0.69      8963

    accuracy                           1.00   7316070
   macro avg       0.97      0.77      0.85   7316070
weighted avg       1.00      1.00      1.00   7316070



In [9]:
new_test_df = dataFrame.df[dataFrame.df['Is Fraud?'] == 1]

new_test_x = new_test_df.drop(columns=['Is Fraud?'])
new_test_y = new_test_df['Is Fraud?']

_, crr1 , _ = rf.test(new_test_x, new_test_y)

print(crr1)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.86      0.92     29757

    accuracy                           0.86     29757
   macro avg       0.50      0.43      0.46     29757
weighted avg       1.00      0.86      0.92     29757



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
