# Data Sampling

Data from [Kaggle user ealtman](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions)

The original dataset is massive, with just one file being over 2 gigs.
In addition, it is unbalanced with their being mostly non-fraudulant transactions.

To remedy this, and make the data under 100 MB for the GitHub upload cap, we'll conserve all fraudulent records but reduce the overall dataset so that 70% are non-fraudulent. This means about 30,000 fraudulent credit transactions and 70,000 random non-fraudulent transactions.

In [1]:
import pandas as pd

df_users = pd.read_csv('../archive/sd254_users.csv', dtype={'merchant_zip':str})
df_cards = pd.read_csv('../archive/sd254_cards.csv')
df_transactions = pd.read_csv('../archive/credit_card_transactions-ibm_v2.csv')

In [2]:
print(df_cards.columns)
df_transactions['Is Fraud?'].value_counts()

Index(['User', 'CARD INDEX', 'Card Brand', 'Card Type', 'Card Number',
       'Expires', 'CVV', 'Has Chip', 'Cards Issued', 'Credit Limit',
       'Acct Open Date', 'Year PIN last Changed', 'Card on Dark Web'],
      dtype='object')


Is Fraud?
No     24357143
Yes       29757
Name: count, dtype: int64

In [3]:
df_yes = df_transactions.loc[df_transactions['Is Fraud?'] == "Yes"]  # ~     30_000 records
df_no = df_transactions.loc[df_transactions['Is Fraud?'] == "No"]    # ~ 20_000_000 records

df : pd.DataFrame = df_yes.append(df_no.sample(70_000)
)


AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
df_cards.rename(columns={'CARD INDEX' : 'Card'})
df_cards.drop(columns=['User'], inplace=True)

df = df.join(df_cards, on='Card')
df.head()

: 

In [None]:
df = df.join(df_users, on='User', rsuffix='_u')
df.head()
df.columns

: 

# Renaming Columns

In [None]:
df.drop(columns=
        ['Card', 
         'CARD INDEX', 'Card Number', 'CVV', 'Has Chip', 'Cards Issued', 'Card on Dark Web', 
         'Person', 'Birth Year', 'Birth Month', 'Per Capita Income - Zipcode']
         ,errors='ignore', inplace=True)

df.rename(columns={
    'User'                          : 'user_id', 
#    'Card'                          : 'card_id', 
    'Year'                          : 'year', 
    'Month'                         : 'month', 
    'Day'                           : 'day', 
    'Time'                          : 'time', 
    'Amount'                        : 'amount', 
    'Use Chip'                      : 'card_method',
    'Merchant Name'                 : 'merchant',
    'Merchant City'                 : 'merchant_city',
    'Merchant State'                : 'merchant_state',
    'Zip'                           : 'merchant_zip',
    'MCC'                           : 'merchant_code',
    'Errors?'                       : 'errors', 
    'Is Fraud?'                     : 'flagged_as_fraud', 
    
#    'CARD INDEX'                    : 'card_id', 
    'Card Brand'                    : 'card_brand',
    'Card Type'                     : 'card_type',
#    'Card Number'                   , 
    'Expires'                       : 'card_expiry_date', 
#    'CVV'                           , 
#    'Has Chip'                      , 
#    'Cards Issued'                  ,
    'Credit Limit'                  : 'card_limit', 
    'Acct Open Date'                : 'card_issue_date', 
    'Year PIN last Changed'         : 'card_last_pin',
#    'Card on Dark Web'              : , 
    
#    'Person'                        : , 
    'Current Age'                   : 'user_age', 
    'Retirement Age'                : 'user_retirement',
#    'Birth Year'                    : , 
#    'Birth Month'                   : , 
    'Gender'                        : 'user_gender', 
    'Address'                       : 'user_address', 
    'Apartment'                     : 'user_apartment', 
    'City'                          : 'user_city',
    'State'                         : 'user_state', 
    'Zipcode'                       : 'user_zip', 
    'Latitude'                      : 'user_lat', 
    'Longitude'                     : 'user_long',
#    'Per Capita Income - Zipcode'   : '', 
    'Yearly Income - Person'        : 'user_income', 
    'Total Debt'                    : 'user_debt',
    'FICO Score'                    : 'user_rating', 
    'Num Credit Cards'              : 'user_number_of_cards'
}, inplace=True)

df.head()

: 

In [None]:
df.to_csv('../data/credit_transactions.csv', index=False)

: 