## Generate Transactions Data

Starting from the original transactions dataset from Kaggle, this notebook generates our own version of the transactions dataset.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Seed NumPy's global RNG so the np.random draws in later cells (random dates,
# random customer ids) are repeatable when the notebook is run top to bottom
np.random.seed(42)

In [4]:
# Read the raw Kaggle transactions file and print its column/dtype summary
raw_transactions_path = 'original-datasets-from-kaggle/financial-transactions/transactions_data.csv'
trans_df = pd.read_csv(raw_transactions_path)
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13305915 entries, 0 to 13305914
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   id              int64  
 1   date            object 
 2   client_id       int64  
 3   card_id         int64  
 4   amount          object 
 5   use_chip        object 
 6   merchant_id     int64  
 7   merchant_city   object 
 8   merchant_state  object 
 9   zip             float64
 10  mcc             int64  
 11  errors          object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.2+ GB


In [61]:
# Keep only the transactions with no recorded error, then discard the columns that
# are not used afterwards (merchant location fields, the error column itself, client_id)
unused_columns = ['merchant_city', 'merchant_state', 'zip', 'errors', 'client_id']
trans_df = trans_df.loc[trans_df['errors'].isna()].drop(columns=unused_columns)
trans_df.info()
trans_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 13094522 entries, 0 to 13305914
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   id           int64 
 1   date         object
 2   card_id      int64 
 3   amount       object
 4   use_chip     object
 5   merchant_id  int64 
 6   mcc          int64 
dtypes: int64(4), object(3)
memory usage: 799.2+ MB


Unnamed: 0,id,date,card_id,amount,use_chip,merchant_id,mcc
0,7475327,2010-01-01 00:01:00,2972,$-77.00,Swipe Transaction,59935,5499
1,7475328,2010-01-01 00:02:00,4575,$14.57,Swipe Transaction,67570,5311
2,7475329,2010-01-01 00:02:00,102,$80.00,Swipe Transaction,27092,4829
3,7475331,2010-01-01 00:05:00,2860,$200.00,Swipe Transaction,27092,4829
4,7475332,2010-01-01 00:06:00,3915,$46.41,Swipe Transaction,13051,5813
5,7475333,2010-01-01 00:07:00,165,$4.81,Swipe Transaction,20519,5942
6,7475334,2010-01-01 00:09:00,2972,$77.00,Swipe Transaction,59935,5499
7,7475335,2010-01-01 00:14:00,2140,$26.46,Online Transaction,39021,4784
8,7475336,2010-01-01 00:21:00,5131,$261.58,Online Transaction,50292,7801
9,7475337,2010-01-01 00:21:00,1112,$10.74,Swipe Transaction,3864,5813


In [62]:
# Load the card reference table, keeping only the card key and the card type,
# so the type of card used for each transaction can be looked up
cards_path = 'original-datasets-from-kaggle/financial-transactions/cards_data.csv'
card_df = pd.read_csv(cards_path).loc[:, ['id', 'card_type']]
card_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6146 entries, 0 to 6145
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         6146 non-null   int64 
 1   card_type  6146 non-null   object
dtypes: int64(1), object(1)
memory usage: 96.2+ KB


In [63]:
# Left join the card table onto the transactions to get card_type (Debit, Credit, Debit Prepaid).
# The card key is renamed to 'card_id' before joining so the result has no id_x/id_y pair to
# clean up, and validate='many_to_one' makes pandas raise a MergeError if a card id appears
# more than once in card_df instead of silently duplicating transactions.
df = (
    trans_df
    .merge(
        card_df.rename(columns={'id': 'card_id'}),
        how='left',
        on='card_id',
        validate='many_to_one',
    )
    .rename(columns={'id': 'transaction_id', 'use_chip': 'transaction_type'})
)
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13094522 entries, 0 to 13094521
Data columns (total 8 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   transaction_id    int64 
 1   date              object
 2   card_id           int64 
 3   amount            object
 4   transaction_type  object
 5   merchant_id       int64 
 6   mcc               int64 
 7   card_type         object
dtypes: int64(4), object(4)
memory usage: 799.2+ MB


Unnamed: 0,transaction_id,date,card_id,amount,transaction_type,merchant_id,mcc,card_type
0,7475327,2010-01-01 00:01:00,2972,$-77.00,Swipe Transaction,59935,5499,Debit (Prepaid)
1,7475328,2010-01-01 00:02:00,4575,$14.57,Swipe Transaction,67570,5311,Credit
2,7475329,2010-01-01 00:02:00,102,$80.00,Swipe Transaction,27092,4829,Debit
3,7475331,2010-01-01 00:05:00,2860,$200.00,Swipe Transaction,27092,4829,Debit
4,7475332,2010-01-01 00:06:00,3915,$46.41,Swipe Transaction,13051,5813,Debit
5,7475333,2010-01-01 00:07:00,165,$4.81,Swipe Transaction,20519,5942,Debit (Prepaid)
6,7475334,2010-01-01 00:09:00,2972,$77.00,Swipe Transaction,59935,5499,Debit (Prepaid)
7,7475335,2010-01-01 00:14:00,2140,$26.46,Online Transaction,39021,4784,Debit (Prepaid)
8,7475336,2010-01-01 00:21:00,5131,$261.58,Online Transaction,50292,7801,Debit
9,7475337,2010-01-01 00:21:00,1112,$10.74,Swipe Transaction,3864,5813,Debit (Prepaid)


In [64]:
# Convert amount from text such as "$-77.00" to a float by removing the literal dollar sign
amount_text = df['amount'].str.replace('$', '', regex=False)
df['amount'] = amount_text.astype(float)

# Every row gets the same currency symbol
df['currency'] = '£'

In [65]:
# Generate new transaction dates (2021 through 2023; the end bound is exclusive)
def random_dates(start, end, n=None):
    """Draw ``n`` random timestamps, at one-second resolution, from [start, end).

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the sampling window; ``end`` itself is never returned.
    n : int, optional
        Number of timestamps to draw. When omitted, the current ``len(df)`` is
        used, looked up at call time rather than frozen when the function is
        defined.

    Returns
    -------
    pandas.DatetimeIndex
        ``n`` timestamps drawn with NumPy's global RNG (``np.random.randint``).
    """
    if n is None:
        n = len(df)

    # Timestamp.value is in nanoseconds since the epoch; reduce to whole seconds
    start_un = start.value // 10**9
    end_un = end.value // 10**9

    return pd.to_datetime(np.random.randint(start_un, end_un, n), unit='s')

# Sampling window passed to random_dates: from 2021-01-01 up to 2024-01-01
start, end = pd.to_datetime('2021-01-01'), pd.to_datetime('2024-01-01')
# Replace the original date column with one randomly drawn timestamp per row
df['date'] = random_dates(start=start, end=end)

In [66]:
# Give every transaction a customer_id drawn at random (with replacement) from the
# distinct customer_id values in the project's customer file
# NOTE(review): the draw is independent per row, so rows sharing a card_id can receive
# different customer_id values — confirm this is intended
train_df = pd.read_csv('project-dataset/customer.csv')
customer_ids = train_df['customer_id'].unique()
n_transactions = len(df)
df['customer_id'] = np.random.choice(customer_ids, size=n_transactions, replace=True)

In [67]:
# Print the column/dtype summary after the amount, date, currency and customer_id changes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13094522 entries, 0 to 13094521
Data columns (total 10 columns):
 #   Column            Dtype         
---  ------            -----         
 0   transaction_id    int64         
 1   date              datetime64[ns]
 2   card_id           int64         
 3   amount            float64       
 4   transaction_type  object        
 5   merchant_id       int64         
 6   mcc               int64         
 7   card_type         object        
 8   currency          object        
 9   customer_id       int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(3)
memory usage: 999.0+ MB


In [68]:
# Count rows by the sign of amount: positive amounts are counted as withdrawals and
# negative amounts as deposits (rows with amount == 0 fall into neither count)
is_positive = df['amount'] > 0
is_negative = df['amount'] < 0
withdrawals = int(is_positive.sum())
deposits = int(is_negative.sum())

print(withdrawals, deposits)

12433446 650506


In [69]:
# Rows whose amount is negative are labelled Deposits; all other rows are labelled Withdrawals.
# Zero-amount rows are grouped with Withdrawals: with separate "< 0" and "> 0" assignments they
# matched neither condition and kept their original use_chip value (e.g. "Swipe Transaction"),
# leaving transaction_type with a mix of unrelated categories.
# These features will serve the predictions of product-specific models (loans, credit cards products)
df['transaction_type'] = np.where(df['amount'] < 0, "Deposit", "Withdrawal")
# The sign is now carried by transaction_type, so store the magnitude only
df['amount'] = df['amount'].abs()

In [70]:
# Sanity check after taking absolute values: count strictly positive and strictly
# negative amounts (the second number is expected to be 0)
amount_col = df['amount']
positive_amount = int((amount_col > 0).sum())
negative_amount = int((amount_col < 0).sum())

print(positive_amount, negative_amount)

13083952 0


In [71]:
# Export the generated dataset to the project folder as CSV.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('project-dataset/transactions.csv', index=False)