# Data Sampling

Data from [Kaggle user ealtman](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions)

The original dataset is massive, with just one file being over 2 gigs.
In addition, it is unbalanced with their being mostly non-fraudulant transactions.

To remedy this, and make the data under 100 MB for the GitHub upload cap, we'll conserve all fraudulent records but reduce the overall dataset so that 70% are non-fraudulent. This means about 30,000 fraudulent credit transactions and 70,000 random non-fraudulent transactions.

In [1]:
import pandas as pd

df_users = pd.read_csv('../archive/sd254_users.csv', dtype={'merchant_zip':str})
df_cards = pd.read_csv('../archive/sd254_cards.csv')
df_transactions = pd.read_csv('../archive/credit_card_transactions-ibm_v2.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../archive/sd254_users.csv'

In [None]:
print(df_cards.columns)
df_transactions['Is Fraud?'].value_counts()

Index(['User', 'CARD INDEX', 'Card Brand', 'Card Type', 'Card Number',
       'Expires', 'CVV', 'Has Chip', 'Cards Issued', 'Credit Limit',
       'Acct Open Date', 'Year PIN last Changed', 'Card on Dark Web'],
      dtype='object')


No     24357143
Yes       29757
Name: Is Fraud?, dtype: int64

In [None]:
df_yes = df_transactions.loc[df_transactions['Is Fraud?'] == "Yes"]  # ~     30_000 records
df_no = df_transactions.loc[df_transactions['Is Fraud?'] == "No"]    # ~ 20_000_000 records

df : pd.DataFrame = df_yes.append(df_no.sample(70_000)
)


In [None]:
df_cards.rename(columns={'CARD INDEX' : 'Card'})
df_cards.drop(columns=['User'], inplace=True)

df = df.join(df_cards, on='Card')
df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,...,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
4099,0,0,2015,11,15,12:55,$287.13,Online Transaction,-8194607650924472520,ONLINE,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
4100,0,0,2015,11,15,13:19,$2.41,Online Transaction,-7759074308363763111,ONLINE,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
4101,0,0,2015,11,16,09:41,$50.81,Online Transaction,-551332107213382088,ONLINE,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
4102,0,0,2015,11,16,09:46,$248.36,Online Transaction,4872340518840476610,ONLINE,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
4103,0,0,2015,11,16,11:20,$473.00,Online Transaction,-8566951830324093739,ONLINE,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No


In [None]:
df = df.join(df_users, on='User', rsuffix='_u')
df.head()
df.columns

Index(['User', 'Card', 'Year', 'Month', 'Day', 'Time', 'Amount', 'Use Chip',
       'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
       'Errors?', 'Is Fraud?', 'CARD INDEX', 'Card Brand', 'Card Type',
       'Card Number', 'Expires', 'CVV', 'Has Chip', 'Cards Issued',
       'Credit Limit', 'Acct Open Date', 'Year PIN last Changed',
       'Card on Dark Web', 'Person', 'Current Age', 'Retirement Age',
       'Birth Year', 'Birth Month', 'Gender', 'Address', 'Apartment', 'City',
       'State', 'Zipcode', 'Latitude', 'Longitude',
       'Per Capita Income - Zipcode', 'Yearly Income - Person', 'Total Debt',
       'FICO Score', 'Num Credit Cards'],
      dtype='object')

# Renaming Columns

In [None]:
df.drop(columns=
        ['Card', 
         'CARD INDEX', 'Card Number', 'CVV', 'Has Chip', 'Cards Issued', 'Card on Dark Web', 
         'Person', 'Per Capita Income - Zipcode']
         ,errors='ignore', inplace=True)

df.rename(columns={
    'User'                          : 'user_id', 
#    'Card'                          : 'card_id', 
    'Year'                          : 'year', 
    'Month'                         : 'month', 
    'Day'                           : 'day', 
    'Time'                          : 'time', 
    'Amount'                        : 'amount', 
    'Use Chip'                      : 'card_method',
    'Merchant Name'                 : 'merchant',
    'Merchant City'                 : 'merchant_city',
    'Merchant State'                : 'merchant_state',
    'Zip'                           : 'merchant_zip',
    'MCC'                           : 'merchant_code',
    'Errors?'                       : 'errors', 
    'Is Fraud?'                     : 'flagged_as_fraud', 
    
#    'CARD INDEX'                    : 'card_id', 
    'Card Brand'                    : 'card_brand',
    'Card Type'                     : 'card_type',
#    'Card Number'                   , 
    'Expires'                       : 'card_expiry_date', 
#    'CVV'                           , 
#    'Has Chip'                      , 
#    'Cards Issued'                  ,
    'Credit Limit'                  : 'card_limit', 
    'Acct Open Date'                : 'card_issue_date', 
    'Year PIN last Changed'         : 'card_last_pin',
#    'Card on Dark Web'              : , 
    
#    'Person'                        : , 
#    'Current Age'                   : 'user_age', 
    'Retirement Age'                : 'user_retirement',
    'Birth Year'                    : 'user_birth_year', 
    'Birth Month'                   : 'user_birth_month', 
    'Gender'                        : 'user_gender', 
    'Address'                       : 'user_address', 
    'Apartment'                     : 'user_apartment', 
    'City'                          : 'user_city',
    'State'                         : 'user_state', 
    'Zipcode'                       : 'user_zip', 
    'Latitude'                      : 'user_lat', 
    'Longitude'                     : 'user_long',
#    'Per Capita Income - Zipcode'   : '', 
    'Yearly Income - Person'        : 'user_income', 
    'Total Debt'                    : 'user_debt',
    'FICO Score'                    : 'user_rating', 
    'Num Credit Cards'              : 'user_number_of_cards'
}, inplace=True)

print([n for n in df.columns if 'user' in n])

df.head()


['user_id', 'user_retirement', 'user_gender', 'user_address', 'user_apartment', 'user_city', 'user_state', 'user_zip', 'user_lat', 'user_long', 'user_income', 'user_debt', 'user_rating', 'user_number_of_cards']


Unnamed: 0,user_id,year,month,day,time,amount,card_method,merchant,merchant_city,merchant_state,...,user_apartment,user_city,user_state,user_zip,user_lat,user_long,user_income,user_debt,user_rating,user_number_of_cards
4099,0,2015,11,15,12:55,$287.13,Online Transaction,-8194607650924472520,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
4100,0,2015,11,15,13:19,$2.41,Online Transaction,-7759074308363763111,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
4101,0,2015,11,16,09:41,$50.81,Online Transaction,-551332107213382088,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
4102,0,2015,11,16,09:46,$248.36,Online Transaction,4872340518840476610,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
4103,0,2015,11,16,11:20,$473.00,Online Transaction,-8566951830324093739,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5


In [None]:
df.to_csv('../data/credit_transactions.csv', index=False)