In [None]:
# Create the Kaggle CLI config directory and an owner-only (mode 600) credentials
# file, then write the API credentials into it.
# NOTE: YOUR-USERNAME / YOUR-API-KEY are placeholders. Fill them in locally and never
# commit real values. The final `echo ... >` overwrites any existing kaggle.json.
!mkdir -p ~/.kaggle
!touch ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!echo '{"username":"YOUR-USERNAME","key":"YOUR-API-KEY"}' > ~/.kaggle/kaggle.json

In [None]:
# Download the kartik2112/fraud-detection dataset archive with the Kaggle CLI.
# As the captured output shows, an up-to-date local copy is skipped unless --force is given.
!kaggle datasets download -d kartik2112/fraud-detection

fraud-detection.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Extract fraudTrain.csv / fraudTest.csv; -o overwrites existing files without prompting.
!unzip -o fraud-detection.zip

Archive:  fraud-detection.zip
  inflating: fraudTest.csv           
  inflating: fraudTrain.csv          


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training split; the first CSV column is used as the row index.
df = pd.read_csv('fraudTrain.csv', index_col=0)
# Target labels. 'is_fraud' is not removed from df here, so the frame passed to the
# pipeline still carries it.
y = df['is_fraud']

In [None]:
# Source: https://stackoverflow.com/a/29546836/6948907

def haversine_np(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Great-circle distance in kilometres between two sets of points on a sphere.

    Coordinates are given in decimal degrees. Note the argument order:
    LONGITUDE first, then latitude, for each point.

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude and latitude of the first point(s).
    lon2, lat2 : scalar or array-like
        Longitude and latitude of the second point(s).
    radius_km : float, optional
        Sphere radius in kilometres. Defaults to 6367, the value this function
        has always used.

    Returns
    -------
    Distance(s) in kilometres. Inputs follow NumPy broadcasting, so scalars and
    arrays may be mixed; pandas Series inputs give a Series result.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

def is_weekend(txn_time):
    """Return 1 if ``txn_time`` falls on a Saturday or Sunday, otherwise 0."""
    day_of_week = txn_time.weekday()  # Monday == 0 ... Sunday == 6
    return 1 if day_of_week >= 5 else 0

def is_night(txn_time):
    """Return 1 if the hour of ``txn_time`` is 22 or later, or 6 or earlier; otherwise 0."""
    hour = txn_time.hour
    is_daytime = 6 < hour < 22
    return int(not is_daytime)

In [None]:
def get_txn_history(txns, windows=(7, 15, 30, 60), windows_size_in_days=None):
    """
    Add trailing-window spending features to one set of transactions.

    For each window of N days, and for each transaction, this looks at the
    transactions whose timestamp lies within the N days up to and including
    that transaction and records:

    * ``count_amt_{N}_days`` - how many transactions fall in the window
    * ``count_avg_{N}_days`` - their average ``amt`` (the ``count_avg`` name is
      kept for compatibility with existing consumers)

    Parameters
    ----------
    txns : pandas.DataFrame
        Must contain ``trans_date_trans_time`` (datetime, or strings parseable
        by ``pd.to_datetime``) and ``amt``. Normally the rows of a single card.
    windows : iterable of int
        Window lengths in days.
    windows_size_in_days : iterable of int, optional
        Alias for ``windows``; takes precedence when given.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by transaction time with a fresh 0..n-1 index and
        the feature columns appended. The input frame is not modified.
    """
    if windows_size_in_days is not None:
        windows = windows_size_in_days

    # Sort on parsed timestamps (stable, so ties keep their input order) and
    # give the result a clean positional index.
    timestamps = pd.to_datetime(txns['trans_date_trans_time']).to_numpy()
    order = np.argsort(timestamps, kind='stable')
    txns = txns.iloc[order].reset_index(drop=True)

    # Offset-based rolling needs a monotonic DatetimeIndex. An integer window
    # would count ROWS rather than days, which is not what the column names say.
    amounts = pd.Series(txns['amt'].to_numpy(), index=pd.DatetimeIndex(timestamps[order]))

    for window in windows:
        rolling = amounts.rolling(f'{window}D', min_periods=1)
        count_amt = rolling.count()
        avg_amt = rolling.sum() / count_amt

        # Assign by position: `amounts` is time-indexed, `txns` is 0..n-1.
        txns[f'count_amt_{window}_days'] = count_amt.to_numpy()
        txns[f'count_avg_{window}_days'] = avg_amt.to_numpy()

    return txns

In [None]:
def transform_columns(df):
    """
    Turn raw transaction columns into the numeric feature frame used by the model.

    Adds per-card transaction-history features (via ``get_txn_history`` with
    windows 1, 7, 15 and 30), the customer-to-merchant distance, the customer's
    age in whole years at transaction time, and night / weekend flags. It then
    drops the identifier, free-text, raw datetime and label columns.

    The rows of the result are in the SAME order, and carry the same index, as
    the rows of ``df``. This matters because the function runs inside an
    sklearn pipeline, where features are matched to ``y`` purely by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column referenced below, including ``is_fraud``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified.
    """
    # numpy's "Y" timedelta unit is 365.2425 days.
    seconds_per_year = 365.2425 * 24 * 60 * 60

    df = df.copy()
    original_index = df.index
    # Tag every row with its input position so the order can be restored after
    # the per-card processing, which sorts each card's rows by time.
    df['_row_position'] = np.arange(len(df))

    # Iterating the groups (instead of groupby.apply) keeps `cc_num` as a
    # regular column regardless of pandas version.
    per_card = [
        get_txn_history(card_txns, windows=[1, 7, 15, 30])
        for _, card_txns in df.groupby('cc_num', sort=False)
    ]
    df = pd.concat(per_card).sort_values('_row_position', kind='stable')
    df.index = original_index[df['_row_position'].to_numpy()]

    # haversine_np takes longitude first, then latitude.
    df['distance_bet_user_merchant'] = haversine_np(
        df['long'], df['lat'], df['merch_long'], df['merch_lat']
    )

    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['dob'] = pd.to_datetime(df['dob'])

    # Whole years between date of birth and the transaction. Computed from
    # seconds because pandas 2.x rejects astype('timedelta64[Y]').
    age = df['trans_date_trans_time'] - df['dob']
    df['user_age'] = np.floor(age.dt.total_seconds() / seconds_per_year)

    df['txn_at_night'] = df['trans_date_trans_time'].apply(is_night)
    df['txn_on_weekend'] = df['trans_date_trans_time'].apply(is_weekend)

    return df.drop(columns=[
        '_row_position',
        'trans_num', 'street', 'first', 'last', 'gender', 'unix_time', 'city', 'state',
        'trans_date_trans_time', 'dob', 'job', 'cc_num', 'is_fraud', 'category', 'merchant',
    ])

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# Wrap transform_columns as a pipeline step. Only the columns listed below are
# handed to the function.
# NOTE(review): `remainder` is not set, and scikit-learn documents its default as
# 'drop'. Any column not listed here, including the encoded columns CategoryEncoder
# adds earlier in the pipeline, is therefore absent from this step's output.
# Confirm that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('transform_columns', FunctionTransformer(func=transform_columns), ['cc_num', 'trans_date_trans_time', 'lat', 
                                                                            'long','merch_lat', 'merch_long', 'dob', 
                                                                            'amt',  'trans_num', 'street', 'first', 
                                                                            'last', 'gender', 'unix_time', 'city', 'state',
                                                                            'job', 'is_fraud', 'category', 'merchant']),
        
    ]
)

In [None]:
from sklearn.base import BaseEstimator
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

class CategoryEncoder(BaseEstimator):
    """
    Append numeric encodings of the categorical transaction columns.

    * ``category`` is one-hot encoded into columns named ``category_<value>``
      (string names, so the frame never mixes int and str column labels).
    * ``merchant`` (with the ``"fraud_"`` text removed), ``city``, ``state``
      and ``job`` are ordinal-encoded into ``merchant_oe``, ``city_oe``,
      ``state_oe`` and ``job_oe``. Values unseen during ``fit`` encode as -1.

    Neither ``fit`` nor ``transform`` modifies the DataFrame passed in.
    """

    def __init__(self):
        self.cat_ohe = OneHotEncoder(handle_unknown='ignore')
        self.merchant_oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.city_oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.state_oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.job_oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    @staticmethod
    def _merchant_names(df):
        """Return the merchant column with every "fraud_" occurrence removed."""
        return df['merchant'].str.replace("fraud_", "", regex=False)

    @staticmethod
    def _as_column(values):
        """Reshape a 1-D column into the (n_samples, 1) array the encoders take."""
        return values.to_numpy().reshape(-1, 1)

    def fit(self, df, y=None):
        """Fit every encoder on ``df``; ``y`` is accepted for pipeline compatibility and ignored."""
        self.cat_ohe.fit(self._as_column(df['category']))
        # Clean into a local Series so the caller's frame is left untouched.
        self.merchant_oe.fit(self._as_column(self._merchant_names(df)))
        self.city_oe.fit(self._as_column(df['city']))
        self.state_oe.fit(self._as_column(df['state']))
        self.job_oe.fit(self._as_column(df['job']))

        return self

    def transform(self, df):
        """Return a copy of ``df`` with the encoded columns appended and ``merchant`` cleaned."""
        merchant = self._merchant_names(df)

        onehot = self.cat_ohe.transform(self._as_column(df['category'])).toarray()
        onehot_columns = [f'category_{value}' for value in self.cat_ohe.categories_[0]]
        # Reuse df's index so rows line up whatever the index looks like
        # (a default RangeIndex would only match a 0..n-1 input).
        ohe_df = pd.DataFrame(onehot, index=df.index, columns=onehot_columns)
        df = pd.concat([df, ohe_df], axis=1)

        df['merchant'] = merchant
        df['merchant_oe'] = self.merchant_oe.transform(self._as_column(merchant)).ravel()

        df['city_oe'] = self.city_oe.transform(self._as_column(df['city'])).ravel()
        df['state_oe'] = self.state_oe.transform(self._as_column(df['state'])).ravel()
        df['job_oe'] = self.job_oe.transform(self._as_column(df['job'])).ravel()
        return df

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Iteration cap passed to LogisticRegression as max_iter.
MAX_ITER = 500
# Per-class weights for LogisticRegression: class 1 (is_fraud == 1) is weighted
# 75x relative to class 0.
CLASS_WEIGHTS = {
    0: 1,
    1: 75,
}

# Steps run in order: categorical encoding -> column transformation / feature
# engineering -> standardisation -> classifier.
pipeline = Pipeline(
    steps = [
        ('categorical_encoding', CategoryEncoder()),
        ('pre_processing', preprocessor),
        ('scaling', StandardScaler()),
        # ('debug', Debug()), 
        ('logistic_regression', LogisticRegression(max_iter=MAX_ITER, class_weight=CLASS_WEIGHTS))
    ]
)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Fit on the full training frame. df still contains 'is_fraud' at this point;
# transform_columns drops that column before the features reach the model.
pipeline.fit(df, y)
# Score on the same data the pipeline was fitted on (a training-set figure, not a held-out one).
pipeline.score(df, y)



0.979450517670195

In [None]:
# Confusion matrix on the training data: true labels first, predictions second.
confusion_matrix(y, pipeline.predict(df))

array([[1263777,   25392],
       [   1254,    6252]])

In [None]:
# Evaluate on the test split, loaded the same way as the training data.
test_df = pd.read_csv('fraudTest.csv', index_col=0)
test_y = test_df['is_fraud']
print(pipeline.score(test_df, test_y))
# True labels first, predictions second.
confusion_matrix(test_y, pipeline.predict(test_df))

0.9785809014987791


array([[542037,  11537],
       [   366,   1779]])

In [None]:
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred). With the arguments the other way
# round, the precision and recall columns trade places and `support` counts
# predicted rather than actual labels.
print(classification_report(test_y, pipeline.predict(test_df)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    542403
           1       0.83      0.13      0.23     13316

    accuracy                           0.98    555719
   macro avg       0.90      0.57      0.61    555719
weighted avg       0.98      0.98      0.97    555719

