In [1]:
import pandas as pd
from sklearn import preprocessing
import sweetviz as sv


In [2]:
# Column -> dtype mapping stored next to the interim transactions file
types = pd.read_csv("./data/interim/transactions_dtypes.csv", index_col=0)
types_dict = types["0"].to_dict()
# Read the interim transactions and cast each column back to its stored dtype
df = pd.read_csv("./data/interim/transactions.csv", index_col=0).astype(types_dict)


# Adding Features

In [3]:
# Columns available before any engineered features are added
df.columns


Index(['customerId', 'creditLimit', 'availableMoney', 'transactionDateTime',
       'transactionAmount', 'merchantName', 'acqCountry',
       'merchantCountryCode', 'posEntryMode', 'posConditionCode',
       'merchantCategoryCode', 'currentExpDate', 'accountOpenDate',
       'dateOfLastAddressChange', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
       'transactionType', 'currentBalance', 'cardPresent',
       'expirationDateKeyInMatch', 'isFraud'],
      dtype='object')

### Age of account

In [4]:
# Whole days between the account opening date and the transaction
df["accountAge"] = (df["transactionDateTime"] - df["accountOpenDate"]).dt.days



### Card CVV = entered CVV

In [5]:
# Share of fraud / non-fraud among rows where the entered CVV differs from the card's CVV
df.loc[df["cardCVV"] != df["enteredCVV"], "isFraud"].value_counts(normalize=True)


False    0.971429
True     0.028571
Name: isFraud, dtype: float64

In [6]:
# True where the CVV typed at checkout equals the CVV on the card
df["cvvMatch"] = df["cardCVV"].eq(df["enteredCVV"])


### Time since address was changed

In [7]:
# Whole days between the last address change and the transaction
df["sinceDateOfLastAddressChange"] = df["transactionDateTime"].sub(
    df["dateOfLastAddressChange"]
).dt.days


### If country codes match

In [8]:
# True where the acquirer country equals the merchant country code
df["countryMatch"] = df["acqCountry"].eq(df["merchantCountryCode"])


### General date properties

In [9]:
# Calendar components of the transaction timestamp
df["dayOfMonth"] = df.transactionDateTime.dt.day
df["month"] = df.transactionDateTime.dt.month
df["dayOfYear"] = df.transactionDateTime.dt.dayofyear
# `Series.dt.weekofyear` is deprecated and was removed in pandas 2.0; the ISO
# calendar week is the same value. `isocalendar()` yields a nullable UInt32,
# so cast back to the plain int64 the old accessor produced for a column
# without missing timestamps.
df["weekOfYear"] = df.transactionDateTime.dt.isocalendar().week.astype("int64")
df["dayOfWeek"] = df.transactionDateTime.dt.dayofweek
df["quarter"] = df.transactionDateTime.dt.quarter
df["hour"] = df.transactionDateTime.dt.hour


  df["weekOfYear"] = df.transactionDateTime.dt.weekofyear


### Weekday or weekend

In [10]:
# dayOfWeek comes from `.dt.dayofweek` (Monday = 0), so values below 5 are Monday-Friday
df["weekday"] = df["dayOfWeek"].lt(5)


### Time of day

I decided to split the day into three parts.


In [11]:
# Three fixed 8-hour blocks: 0 = hours 0-7, 1 = hours 8-15, 2 = hours 16-23.
# Explicit left-closed edges keep the encoding independent of the data;
# `bins=3` derived the edges from the observed min/max hour, so a subset that
# did not span 0..23 would have been binned differently.
df["timeOfDay"] = pd.cut(df.hour, bins=[0, 8, 16, 24], right=False, labels=[0, 1, 2])


### Average fraud of customer

Shifted so that each row only reflects what happened in the past.


In [12]:
# Running fraud rate per customer, using only that customer's earlier rows:
# shift() pushes each flag down one row so the current transaction is excluded,
# expanding().mean() averages everything seen so far, and a customer's first
# row (no history) is filled with 0.
# `transform` always returns a result indexed like `df`; a transform-like
# `groupby.apply` prepends the group key to the index on pandas >= 2.0, which
# breaks the column assignment.
df["avgFraud"] = (
    df.groupby("customerId")["isFraud"]
    .transform(lambda x: x.shift().expanding().mean())
    .fillna(0)
)


## Revisiting with more features
After running a basic logistic regression model, I think I need better features.

### Transactions that happen less than 3 minutes apart

In [13]:
# Per customer, count the rows that fall inside a trailing 3-minute window keyed
# on transactionDateTime (the current row is part of its own window). The
# rolling count is computed for the whole frame and only its "isFraud" column
# is kept as the feature.
# NOTE(review): the assignment relies on `groupby.apply` returning a frame
# indexed like `df`; that behaviour differs between pandas versions — confirm.
# NOTE(review): presumably rows are time-ordered within each customer, as a
# time-based rolling window expects — confirm.
df["quick_transaction"] = (
    df.groupby("customerId").apply(
        lambda x: x.rolling("3min", on="transactionDateTime").count()
    )
)["isFraud"]


### Transactions that happen in separate countries, but minutes apart

In [14]:
# Previous transaction's timestamp and acquirer country for the same customer
# (missing on each customer's first row)
df["time_shift_down1"] = df.groupby("customerId")["transactionDateTime"].shift()
df["country_shift_down1"] = df.groupby("customerId")["acqCountry"].shift()


In [15]:
# Country differs from the customer's previous transaction
ctry_diff = df["acqCountry"].ne(df["country_shift_down1"])
# Less than five minutes since the customer's previous transaction; rows with
# no previous transaction compare as False
in_window = df["transactionDateTime"].sub(df["time_shift_down1"]).lt(
    pd.Timedelta("5 minutes")
)
# Flag rows that satisfy both conditions
df["ctry_diff_five_min"] = (in_window & ctry_diff).fillna(False)


### Minimum time (minutes) between transactions in a 30 day span

In [16]:
# Difference in time per group
df["time_diff"] = (
    df.groupby("customerId")["transactionDateTime"].diff().fillna(pd.Timedelta(0))
)
df["time_diff"] = (df.time_diff.dt.total_seconds() / 60).astype("int")


In [17]:
# Per customer, the smallest `time_diff` inside a trailing 30-day window keyed
# on transactionDateTime.
# The reset_index / drop / sort_index / squeeze tail removes the customerId
# level and collapses the one-column frame back to a Series before assignment.
# NOTE(review): this presumes `apply` returns a (customerId, row label) index
# — confirm on the pandas version in use.
df["min_time_diff_month"] = (
    df.groupby("customerId")
    .apply(lambda x: x.rolling("30D", on="transactionDateTime")["time_diff"].min())
    .reset_index("customerId")
    .drop("customerId", axis=1)
    .sort_index()
    .squeeze()
)


### Average Amount spent per transaction over 1 month

In [18]:
# Per customer, the mean transactionAmount inside a trailing 30-day window
# keyed on transactionDateTime.
# The reset_index / drop / sort_index / squeeze tail removes the customerId
# level and collapses the one-column frame back to a Series before assignment.
# NOTE(review): this presumes `apply` returns a (customerId, row label) index
# — confirm on the pandas version in use.
df["avg_spent_month"] = (
    df.groupby("customerId")
    .apply(
        lambda x: x.rolling("30D", on="transactionDateTime")["transactionAmount"].mean()
    )
    .reset_index("customerId")
    .drop("customerId", axis=1)
    .sort_index()
    .squeeze()
)


### Total amount spent on the same day

In [19]:
# Per customer, the sum of transactionAmount inside a trailing 1-day window
# keyed on transactionDateTime (a rolling 24 hours ending at each transaction,
# not a calendar day).
# The reset_index / drop / sort_index / squeeze tail removes the customerId
# level and collapses the one-column frame back to a Series before assignment.
# NOTE(review): this presumes `apply` returns a (customerId, row label) index
# — confirm on the pandas version in use.
df["total_spent_one_day"] = (
    df.groupby("customerId")
    .apply(
        lambda x: x.rolling("1D", on="transactionDateTime")["transactionAmount"].sum()
    )
    .reset_index("customerId")
    .drop("customerId", axis=1)
    .sort_index()
    .squeeze()
)


In [20]:
# Column dtypes and non-null counts after all engineered features were added
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 776668 entries, 0 to 786362
Data columns (total 44 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   customerId                    776668 non-null  object        
 1   creditLimit                   776668 non-null  float64       
 2   availableMoney                776668 non-null  float64       
 3   transactionDateTime           776668 non-null  datetime64[ns]
 4   transactionAmount             776668 non-null  float64       
 5   merchantName                  776668 non-null  object        
 6   acqCountry                    776668 non-null  object        
 7   merchantCountryCode           776668 non-null  object        
 8   posEntryMode                  776668 non-null  object        
 9   posConditionCode              776668 non-null  object        
 10  merchantCategoryCode          776668 non-null  object        
 11  currentExpDat

# Encoding

In [22]:
# NOTE: this cell is slow to run. An earlier comment said it was commented out,
# but the code below is active and writes the HTML report on every run.

my_report = sv.analyze(df,)
my_report.show_html(filepath="2_SWEETVIZ_REPORT.html")


Done! Use 'show' commands to display/save.   |██████████| [100%]   00:02 -> (00:00 left)


Report 2_SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [None]:
# Per-column encoding plan, kept as a bare string; it does not modify `df`.
# NOTE(review): `cumFraud` is listed below, but no visible cell creates a
# column with that name — confirm whether the entry is stale.
"""Enconding List

customerId - nothing
creditLimit - nothing
availableMoney - cut
transactionDateTime - drop
transactionAmount - qcut
merchantName - ordinal
acqCountry - ordinal
merchantCountryCode - ordinal
posEntryMode - ordinal
posConditionCode - ordinal
merchantCategoryCode - ordinal
currentExpDate - drop
accountOpenDate - drop
dateOfLastAddressChange - drop
cardCVV - drop
enteredCVV - drop
cardLast4Digits - drop
transactionType - ordinal
currentBalance - nothing
cardPresent - boolean
expirationDateKeyInMatch - boolean
isFraud - boolean
accountAge - nothing
cvvMatch - boolean
sinceDateOfLastAddressChange - nothing
countryMatch - boolean
dayOfMonth - nothing
month - nothing
dayOfYear - nothing
weekOfYear - nothing
dayOfWeek - nothing
quarter - nothing
hour - nothing
weekday - boolean
timeOfDay - nothing
avgFraud - nothing
quick_transaction - nothing
time_shift_down1 - drop
country_shift_down1 - drop
ctry_diff_five_min - boolean
time_diff - nothing
min_time_diff_month - nothing
cumFraud - nothing
avg_spent_month - nothing
total_spent_one_day - nothing"""


## Binning

In [25]:
# availableMoney - nine hand-picked, right-closed intervals labelled 0..8;
# values at or below -5000 or above 50000 fall outside every bin and become NaN
df["availableMoney"] = pd.cut(
    df["availableMoney"],
    bins=[-5000, -1000, -500, -100, 0, 100, 500, 1000, 5000, 50000],
    labels=list(range(9)),
)

# transactionAmount - quartile bins labelled 0..3
df["transactionAmount"] = pd.qcut(df["transactionAmount"], q=4, labels=list(range(4)))


## Ordinal

In [26]:
# merchantName - strip the per-location suffix (e.g. "AMC #010101" -> "AMC");
# this choice could be revisited later
df["merchantName"] = df["merchantName"].str.replace(r"\s#.*$", "", regex=True)

# Columns that get an ordinal code
ordinal_encode = [
    "acqCountry",
    "merchantCountryCode",
    "posEntryMode",
    "posConditionCode",
    "merchantCategoryCode",
    "transactionType",
    "merchantName",
]

# Fit a single encoder over the listed columns and overwrite them with the codes
ordinal = preprocessing.OrdinalEncoder()
df[ordinal_encode] = ordinal.fit_transform(df[ordinal_encode])
# Total number of missing values across the encoded columns
df[ordinal_encode].isna().sum().sum()


0

## Booleans

In [27]:
# Flag columns that are converted to integers
boolean_encode = [
    "cardPresent",
    "expirationDateKeyInMatch",
    "isFraud",
    "cvvMatch",
    "countryMatch",
    "weekday",
    "ctry_diff_five_min",
]

# Cast every listed column to "int", leaving the other columns untouched
df = df.astype(dict.fromkeys(boolean_encode, "int"))


## Drop

In [28]:
# Remove the raw date, card-number and helper shift columns
df = df.drop(
    columns=[
        "transactionDateTime",
        "currentExpDate",
        "accountOpenDate",
        "dateOfLastAddressChange",
        "cardCVV",
        "enteredCVV",
        "cardLast4Digits",
        "time_shift_down1",
        "country_shift_down1",
    ]
)


In [29]:
# Column dtypes and non-null counts after binning, encoding and dropping columns
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 776668 entries, 0 to 786362
Data columns (total 35 columns):
 #   Column                        Non-Null Count   Dtype   
---  ------                        --------------   -----   
 0   customerId                    776668 non-null  object  
 1   creditLimit                   776668 non-null  float64 
 2   availableMoney                776668 non-null  category
 3   transactionAmount             776668 non-null  category
 4   merchantName                  776668 non-null  float64 
 5   acqCountry                    776668 non-null  float64 
 6   merchantCountryCode           776668 non-null  float64 
 7   posEntryMode                  776668 non-null  float64 
 8   posConditionCode              776668 non-null  float64 
 9   merchantCategoryCode          776668 non-null  float64 
 10  transactionType               776668 non-null  float64 
 11  currentBalance                776668 non-null  float64 
 12  cardPresent                   

In [31]:
# Write the column dtypes to their own CSV so they can be re-applied with
# `astype` when the data is loaded again
df.dtypes.to_csv("./data/processed/transactions_dtypes.csv")
# Write the processed transactions (the index is written as the first column)
df.to_csv("./data/processed/transactions.csv")
