## Training dataset cleaning

In [378]:
import pandas as pd
import seaborn as sns

In [379]:
df = pd.read_csv('data/train.csv')
pd.options.display.max_columns = None
df.head()

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43 AUD,14610.61 AUD,£ 1050.01,2023-03-12,12:25:57,AU$ 258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1
1,9645,3386,34,Male,Student,High School,Married,4,AU$ 54919.07,39169.49 AUD,£ 4969.71,2023-03-05,18:27:24,34.94 AUD,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1
2,1145,2971,25,Male,Unemployed,Master,Married,2,AU$ 74728.57,55873.76 AUD,£ 1149.85,2023-11-10,17:16:56,AU$ 323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0
3,15308,2925,25,Male,Professional,High School,Married,3,AU$ 55712.62,AED 89649.04,AU$ 4335.7,2023-10-07,00/34/17,AED 32.49,M001,Purchase,Darwin,iphone 15,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7 AUD,AED 43601.02,4763.48 AUD,2023-09-22,06:40:08 PM,1140.75 AED,M001,Withdrawal,MLB,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0


In [380]:
df.describe()

Unnamed: 0,TransactionNumber,UserID,Age,NumDependents,Latitude,Longitude,UserTenure,IsFraud
count,10966.0,10966.0,10966.0,10966.0,10923.0,10923.0,10966.0,10966.0
mean,9117.500091,2483.885282,2751.845887,1.995714,-30.363255,141.254786,60.786157,0.364308
std,5293.66964,1429.402615,9730.988917,1.408035,6.962819,11.268395,34.254477,0.481257
min,1.0,1.0,-68.0,0.0,-41.640079,-112.02605,1.0,0.0
25%,4516.25,1253.0,26.0,1.0,-37.0201,142.702789,31.0,0.0
50%,9120.5,2471.0,34.0,2.0,-31.840233,144.9646,61.0,0.0
75%,13710.75,3727.0,43.0,3.0,-25.042261,145.612793,90.0,1.0
max,18277.0,5000.0,67000.0,4.0,57.85158,149.012375,119.0,1.0


It's seems thath the Age parameter has unnatural values - both negative age and age in the thousands.

In [381]:
df.isna().sum() # Only coordinates are missing

TransactionNumber       0
UserID                  0
Age                     0
Gender                  0
Occupation              0
EducationLevel          0
MaritalStatus           0
NumDependents           0
Income                  0
Expenditure             0
GiftsTransaction        0
TransactionDate         0
TransactionTime         0
TransactionAmount       0
MerchantID              0
TransactionType         0
TransactionLocation     0
DeviceType              0
Latitude               43
Longitude              43
EmailDomain             0
Terrorism               0
UserTenure              0
IsFraud                 0
dtype: int64

In [382]:
df = df.dropna()

### Transaction Number

In [383]:
# TransactionNubmber

df.TransactionNumber # Integer, no negative values

0         8765
1         9645
2         1145
3        15308
4        14967
         ...  
10961    11284
10962    11964
10963     5390
10964      860
10965    15795
Name: TransactionNumber, Length: 10923, dtype: int64

In [384]:
df.drop_duplicates(subset=['TransactionNumber'], inplace=True)

### User ID

In [385]:
df.UserID #integer, no negative values

0          70
1        3386
2        2971
3        2925
4        2339
         ... 
10961    3632
10962    3925
10963    4811
10964    1110
10965    3608
Name: UserID, Length: 10923, dtype: int64

In [386]:
df.drop_duplicates(subset=['UserID'], inplace=True)

### Age

In [387]:
# Checking the age column

df.Age[df.Age < 18]

13      -24
43      -43
47      -29
60      -47
72      -49
         ..
9925    -47
10048   -46
10056   -48
10484   -38
10906   -46
Name: Age, Length: 349, dtype: int64

In [388]:
df.Age[df.Age > 80]

38       29000
41       24000
57       27000
83       42000
110      36000
         ...  
10437    56000
10569    40000
10610    45000
10734    20000
10831    30000
Name: Age, Length: 349, dtype: int64

In [389]:
# Checking the age column

df.Age[(df.Age < 18) & (df.Age > 0)]


Series([], Name: Age, dtype: int64)

In [390]:
df = df.drop(df[df.Age < 0].index) # Drop unreasonable values
df = df.drop(df[df.Age > 100].index)

### Gender

In [391]:
# Check Gender column

df.Gender.value_counts()

Gender
Male           1617
Female         1537
he               98
man              94
woman            80
isnotfemale      78
fem              72
she              70
isnotmale        62
Name: count, dtype: int64

Unordinary labels are present and should be replaced. The isnotmale and isnotfemale are ambigious, and dhould be removed

In [392]:
df.Gender[(df['Gender'] == 'man') | (df["Gender"] == 'he')] = 'Male'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.Gender[(df['Gender'] == 'man') | (df["Gender"] == 'he')] = 'Male'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [393]:
df.Gender[(df['Gender'] == 'woman') | (df["Gender"] == 'fem') | (df["Gender"] == 'she')] = 'Female'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.Gender[(df['Gender'] == 'woman') | (df["Gender"] == 'fem') | (df["Gender"] == 'she')] = 'Female'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

In [394]:
df = df.drop(df[(df.Gender  == 'isnotfemale') | (df.Gender == 'isnotmale')].index)

In [395]:
# Check Gender column

df.Gender.value_counts()

Gender
Male      1809
Female    1759
Name: count, dtype: int64

### Occupation

In [396]:
# Check occupation

df.Occupation.value_counts() # all good

Occupation
Professional    1715
Student         1110
Unemployed       387
Retired          356
Name: count, dtype: int64

### Education Level

In [397]:
df.EducationLevel.value_counts() # all good

EducationLevel
Bachelor       1419
High School    1379
Master          562
PhD             208
Name: count, dtype: int64

### Number of Dependents

In [398]:
df.NumDependents # int no negative

0        3
1        4
2        2
3        3
4        4
        ..
10921    4
10923    0
10929    3
10952    1
10957    3
Name: NumDependents, Length: 3568, dtype: int64

In [399]:
df.head()

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43 AUD,14610.61 AUD,£ 1050.01,2023-03-12,12:25:57,AU$ 258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1
1,9645,3386,34,Male,Student,High School,Married,4,AU$ 54919.07,39169.49 AUD,£ 4969.71,2023-03-05,18:27:24,34.94 AUD,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1
2,1145,2971,25,Male,Unemployed,Master,Married,2,AU$ 74728.57,55873.76 AUD,£ 1149.85,2023-11-10,17:16:56,AU$ 323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0
3,15308,2925,25,Male,Professional,High School,Married,3,AU$ 55712.62,AED 89649.04,AU$ 4335.7,2023-10-07,00/34/17,AED 32.49,M001,Purchase,Darwin,iphone 15,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7 AUD,AED 43601.02,4763.48 AUD,2023-09-22,06:40:08 PM,1140.75 AED,M001,Withdrawal,MLB,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0


### Income

Income has the name of the currency in it which should be removed

In [400]:
income_tmp = df.Income
income_tmp = income_tmp.replace('AUD','', regex=True) # Only in AUD
income_tmp = income_tmp.replace('AU\$','', regex=True)
income_tmp = income_tmp.astype(float)
income_tmp

  income_tmp = income_tmp.replace('AU\$','', regex=True)


0        28884.43
1        54919.07
2        74728.57
3        55712.62
4        53004.70
           ...   
10921    74835.82
10923    86312.26
10929    50836.99
10952    64400.78
10957    32580.76
Name: Income, Length: 3568, dtype: float64

In [401]:
df.Income = income_tmp

### Expenditure

Expenditure also has currency name which should be removed. It uses two currencies - AED and AUD, and another column should be made for those

In [402]:
exp_tmp = df.Expenditure
exp_tmp = exp_tmp.replace('\d+', '', regex=True) # Remove numbers to see formatting
exp_tmp.value_counts() # Only used AUD and AED

  exp_tmp = exp_tmp.replace('\d+', '', regex=True) # Remove numbers to see formatting


Expenditure
AU$ .    1400
. AUD    1290
. AED     457
AED .     421
Name: count, dtype: int64

In [403]:
# Move currency to new column and remove
df['Expenditure'] = df['Expenditure'].replace('\$', 'D', regex=True)

df.loc[df['Expenditure'].str.contains('AUD'), 'ExpenditureCurrency'] = 'AUD'
df.loc[df['Expenditure'].str.contains('AED'), 'ExpenditureCurrency'] = 'AED'

df['Expenditure'] = df['Expenditure'].replace('AUD', '', regex=True)
df['Expenditure'] = df['Expenditure'].replace('AED', '', regex=True)

df['Expenditure'] = df['Expenditure'].astype(float)

  df['Expenditure'] = df['Expenditure'].replace('\$', 'D', regex=True)


In [404]:
df.Expenditure # Successfully converted to float

0        14610.61
1        39169.49
2        55873.76
3        89649.04
4        43601.02
           ...   
10921    45101.32
10923    48357.47
10929    29800.75
10952    30010.69
10957    14882.96
Name: Expenditure, Length: 3568, dtype: float64

In [405]:
df.ExpenditureCurrency.value_counts()

ExpenditureCurrency
AUD    2690
AED     878
Name: count, dtype: int64

### Gifts Transaction

Also have several currencies, which should be moved to another column

In [406]:
# Gifts transactions - seem to be in different currencies

gift_tmp = df.GiftsTransaction
gift_tmp = gift_tmp.replace('\d+', '', regex=True)
gift_tmp.value_counts() # Only used AUD and GBP

  gift_tmp = gift_tmp.replace('\d+', '', regex=True)


GiftsTransaction
. GBP    1373
£ .      1317
AU$ .     443
. AUD     435
Name: count, dtype: int64

In [407]:
GB_list = ['GBP', '£']
AD_list = ['AUD']

df['GiftsTransaction'] = df['GiftsTransaction'].replace('\$', 'D', regex=True)

df.loc[df['GiftsTransaction'].str.contains('|'.join(GB_list)), 'GiftsCurrency'] = 'GBP'
df.loc[df['GiftsTransaction'].str.contains('|'.join(AD_list)), 'GiftsCurrency'] = 'AUD'

  df['GiftsTransaction'] = df['GiftsTransaction'].replace('\$', 'D', regex=True)


In [408]:
df.head()

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud,ExpenditureCurrency,GiftsCurrency
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,£ 1050.01,2023-03-12,12:25:57,AU$ 258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1,AUD,GBP
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,£ 4969.71,2023-03-05,18:27:24,34.94 AUD,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1,AUD,GBP
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,£ 1149.85,2023-11-10,17:16:56,AU$ 323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0,AUD,GBP
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,89649.04,AUD 4335.7,2023-10-07,00/34/17,AED 32.49,M001,Purchase,Darwin,iphone 15,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1,AED,AUD
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,43601.02,4763.48 AUD,2023-09-22,06:40:08 PM,1140.75 AED,M001,Withdrawal,MLB,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0,AED,AUD


In [409]:
df.GiftsCurrency.value_counts()

GiftsCurrency
GBP    2690
AUD     878
Name: count, dtype: int64

In [410]:
GB_list = ['GBP', '£']
AD_list = ['AUD', 'AU\$']

gft_tmp = df.GiftsTransaction
gft_tmp = gft_tmp.replace(GB_list,'', regex=True)
gft_tmp = gft_tmp.replace(AD_list,'', regex=True)
gft_tmp = gft_tmp.astype(float)
gft_tmp # converted to float

  AD_list = ['AUD', 'AU\$']


0        1050.01
1        4969.71
2        1149.85
3        4335.70
4        4763.48
          ...   
10921    1270.87
10923    2009.35
10929    1291.68
10952     203.90
10957     862.94
Name: GiftsTransaction, Length: 3568, dtype: float64

In [411]:
df.GiftsTransaction = gft_tmp

In [412]:
df.head()

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud,ExpenditureCurrency,GiftsCurrency
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,1050.01,2023-03-12,12:25:57,AU$ 258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1,AUD,GBP
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,4969.71,2023-03-05,18:27:24,34.94 AUD,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1,AUD,GBP
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,1149.85,2023-11-10,17:16:56,AU$ 323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0,AUD,GBP
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,89649.04,4335.7,2023-10-07,00/34/17,AED 32.49,M001,Purchase,Darwin,iphone 15,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1,AED,AUD
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,43601.02,4763.48,2023-09-22,06:40:08 PM,1140.75 AED,M001,Withdrawal,MLB,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0,AED,AUD


### Transaction Date

In [413]:
time_tmp = df.TransactionDate
time_tmp = time_tmp.replace('\d+', '', regex=True)
time_tmp.value_counts() # Sae format, all good

  time_tmp = time_tmp.replace('\d+', '', regex=True)


TransactionDate
--    3568
Name: count, dtype: int64

### Transaction Time

In [414]:
time_tmp = df.TransactionTime
time_tmp = time_tmp.replace('\d+', '', regex=True)
time_tmp.value_counts() # Has to be converted to same format

  time_tmp = time_tmp.replace('\d+', '', regex=True)


TransactionTime
::       2848
//        357
:: PM     187
:: AM     176
Name: count, dtype: int64

In [415]:
df.TransactionTime = df.TransactionTime.replace('/', ':', regex=True)
df.TransactionTime =  pd.to_datetime(df.TransactionTime).dt.strftime('%H:%M:%S')

  df.TransactionTime =  pd.to_datetime(df.TransactionTime).dt.strftime('%H:%M:%S')


In [416]:
time_tmp = df.TransactionTime
time_tmp = time_tmp.replace('\d+', '', regex=True)
time_tmp.value_counts() # Converted

  time_tmp = time_tmp.replace('\d+', '', regex=True)


TransactionTime
::    3568
Name: count, dtype: int64

In [417]:
# Create datetime column

df['TransactionDatetime'] = df['TransactionDate'] + ' ' + df['TransactionTime']
df['TransactionDatetime'] = pd.to_datetime(df['TransactionDatetime'])

In [418]:
df.head()

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud,ExpenditureCurrency,GiftsCurrency,TransactionDatetime
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,1050.01,2023-03-12,12:25:57,AU$ 258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1,AUD,GBP,2023-03-12 12:25:57
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,4969.71,2023-03-05,18:27:24,34.94 AUD,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1,AUD,GBP,2023-03-05 18:27:24
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,1149.85,2023-11-10,17:16:56,AU$ 323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0,AUD,GBP,2023-11-10 17:16:56
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,89649.04,4335.7,2023-10-07,00:34:17,AED 32.49,M001,Purchase,Darwin,iphone 15,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1,AED,AUD,2023-10-07 00:34:17
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,43601.02,4763.48,2023-09-22,18:40:08,1140.75 AED,M001,Withdrawal,MLB,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0,AED,AUD,2023-09-22 18:40:08


### Transaction 

Also two currencies, should be moved.

In [419]:
trans_tmp = df.TransactionAmount
trans_tmp = trans_tmp.replace('\d+', '', regex=True)
trans_tmp.value_counts()

  trans_tmp = trans_tmp.replace('\d+', '', regex=True)


TransactionAmount
AU$ .    1363
. AUD    1327
AED .     446
. AED     432
Name: count, dtype: int64

In [420]:
df['TransactionAmount'] = df['TransactionAmount'].replace('\$', 'D', regex=True)

df.loc[df['TransactionAmount'].str.contains('AUD'), 'TransactionCurrency'] = 'AUD'
df.loc[df['TransactionAmount'].str.contains('AED'), 'TransactionCurrency'] = 'AED'

df['TransactionAmount'] = df['TransactionAmount'].replace('AUD', '', regex=True)
df['TransactionAmount'] = df['TransactionAmount'].replace('AED', '', regex=True)

df['TransactionAmount'] = df['TransactionAmount'].astype(float)

  df['TransactionAmount'] = df['TransactionAmount'].replace('\$', 'D', regex=True)


In [421]:
df.head()

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud,ExpenditureCurrency,GiftsCurrency,TransactionDatetime,TransactionCurrency
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,1050.01,2023-03-12,12:25:57,258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1,AUD,GBP,2023-03-12 12:25:57,AUD
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,4969.71,2023-03-05,18:27:24,34.94,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1,AUD,GBP,2023-03-05 18:27:24,AUD
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,1149.85,2023-11-10,17:16:56,323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0,AUD,GBP,2023-11-10 17:16:56,AUD
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,89649.04,4335.7,2023-10-07,00:34:17,32.49,M001,Purchase,Darwin,iphone 15,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1,AED,AUD,2023-10-07 00:34:17,AED
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,43601.02,4763.48,2023-09-22,18:40:08,1140.75,M001,Withdrawal,MLB,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0,AED,AUD,2023-09-22 18:40:08,AED


In [422]:
df.TransactionCurrency.value_counts()

TransactionCurrency
AUD    2690
AED     878
Name: count, dtype: int64

### Merchant ID

In [423]:
df.MerchantID.value_counts() # ok

MerchantID
M006    487
M007    464
M008    460
M004    454
M005    447
M002    435
M001    424
M003    397
Name: count, dtype: int64

### Transaction Type

In [424]:
df.TransactionType.value_counts() #ok

TransactionType
Purchase      1725
Transfer       758
Withdrawal     710
Payment        375
Name: count, dtype: int64

### Transaction Location

In [425]:
df.TransactionLocation.value_counts()

TransactionLocation
Darwin           462
Sydney           451
Adelaide         440
Hobart           431
Canberra         429
Brisbane         428
Perth            422
Melbourne        316
Mel               32
Melb              31
melbourne         23
MLB               22
Melburn           18
Bne                8
hobart             6
Pth                5
perth              5
brisbane           5
BNE                5
canberra           4
Syd                3
adl                3
DRW                3
Drw                2
sydney             2
Cbr                2
PTH                2
Hbt                2
Adelaide City      2
Adl                1
CBR                1
darwin             1
SYD                1
Name: count, dtype: int64

Non-standardised values, should be replaced

In [426]:
SYD = ['SYD', 'Syd', 'sydney']
BNE = ['BNE', 'Bne', 'brisbane']
PTH = ['Pth', 'perth', 'PTH']
MLB = ['melbourne', 'Melb', 'Mel', 'Melburn', 'MLB']
ADL = ['adl', 'Adl']
DRW = ['DRW', 'Drw', 'darwin']
HBT = ['hobart', 'HBT', 'Hbt']
CBR = ['canberra', 'Cbr', 'CBR']

df.loc[df['TransactionLocation'].isin(SYD), 'TransactionLocation'] = 'Sydney'
df.loc[df['TransactionLocation'].isin(BNE), 'TransactionLocation'] = 'Brisbane'
df.loc[df['TransactionLocation'].isin(PTH), 'TransactionLocation'] = 'Perth'
df.loc[df['TransactionLocation'].isin(MLB), 'TransactionLocation'] = 'Melbourne'
df.loc[df['TransactionLocation'].isin(ADL), 'TransactionLocation'] = 'Adelaide City'
df.loc[df['TransactionLocation'].isin(DRW), 'TransactionLocation'] = 'Darwin'
df.loc[df['TransactionLocation'].isin(HBT), 'TransactionLocation'] = 'Hobart'
df.loc[df['TransactionLocation'].isin(CBR), 'TransactionLocation'] = 'Canberra'

In [427]:
df.TransactionLocation.value_counts()

TransactionLocation
Darwin           468
Sydney           457
Brisbane         446
Melbourne        442
Adelaide         440
Hobart           439
Canberra         436
Perth            434
Adelaide City      6
Name: count, dtype: int64

In [428]:
df.head()

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud,ExpenditureCurrency,GiftsCurrency,TransactionDatetime,TransactionCurrency
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,1050.01,2023-03-12,12:25:57,258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1,AUD,GBP,2023-03-12 12:25:57,AUD
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,4969.71,2023-03-05,18:27:24,34.94,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1,AUD,GBP,2023-03-05 18:27:24,AUD
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,1149.85,2023-11-10,17:16:56,323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0,AUD,GBP,2023-11-10 17:16:56,AUD
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,89649.04,4335.7,2023-10-07,00:34:17,32.49,M001,Purchase,Darwin,iphone 15,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1,AED,AUD,2023-10-07 00:34:17,AED
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,43601.02,4763.48,2023-09-22,18:40:08,1140.75,M001,Withdrawal,Melbourne,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0,AED,AUD,2023-09-22 18:40:08,AED


### Device Type

Non-standard, should be replaced

In [429]:
df.DeviceType.value_counts()

DeviceType
Mobile        1639
Desktop       1078
Tablet         342
android        109
iphone 15      105
galaxys7       104
smartphone     103
mob             88
Name: count, dtype: int64

In [430]:
Mob = ['galaxys7', 'iphone 15', 'android', 'mob', 'smartphone']

df.loc[df['DeviceType'].isin(Mob), 'DeviceType'] = 'Mobile'

In [431]:
df.DeviceType.value_counts()

DeviceType
Mobile     2148
Desktop    1078
Tablet      342
Name: count, dtype: int64

### EMail Domain

In [432]:
df.EmailDomain # ok

0               jon44@disposable.com
1               emilyreese@gmail.com
2                 fordevan@gmail.com
3         kathleenlewis@tempmail.com
4            kristinawhite@gmail.com
                    ...             
10921         melaniereyes@gmail.com
10923            anthony35@gmail.com
10929             mdixon@outlook.com
10952    jasminecastillo@outlook.com
10957             fryharry@yahoo.com
Name: EmailDomain, Length: 3568, dtype: object

### Terrorism

In [433]:
df.Terrorism.value_counts() # ok

Terrorism
False    2469
True     1099
Name: count, dtype: int64

### User Tenure

In [434]:
df.UserTenure # Integer, positive

0        113
1        104
2        105
3         70
4         27
        ... 
10921     67
10923     63
10929     12
10952     52
10957     11
Name: UserTenure, Length: 3568, dtype: int64

### Is Fraud

In [435]:
df.IsFraud.value_counts() #ok

IsFraud
0    2259
1    1309
Name: count, dtype: int64

In [436]:
df.head() #done

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud,ExpenditureCurrency,GiftsCurrency,TransactionDatetime,TransactionCurrency
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,1050.01,2023-03-12,12:25:57,258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1,AUD,GBP,2023-03-12 12:25:57,AUD
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,4969.71,2023-03-05,18:27:24,34.94,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1,AUD,GBP,2023-03-05 18:27:24,AUD
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,1149.85,2023-11-10,17:16:56,323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0,AUD,GBP,2023-11-10 17:16:56,AUD
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,89649.04,4335.7,2023-10-07,00:34:17,32.49,M001,Purchase,Darwin,Mobile,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1,AED,AUD,2023-10-07 00:34:17,AED
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,43601.02,4763.48,2023-09-22,18:40:08,1140.75,M001,Withdrawal,Melbourne,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0,AED,AUD,2023-09-22 18:40:08,AED


In [437]:
df.head() #done

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,GiftsTransaction,TransactionDate,TransactionTime,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure,IsFraud,ExpenditureCurrency,GiftsCurrency,TransactionDatetime,TransactionCurrency
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,1050.01,2023-03-12,12:25:57,258.14,M006,Withdrawal,Adelaide,Mobile,-31.840233,145.612793,jon44@disposable.com,False,113,1,AUD,GBP,2023-03-12 12:25:57,AUD
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,4969.71,2023-03-05,18:27:24,34.94,M002,Withdrawal,Canberra,Mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104,1,AUD,GBP,2023-03-05 18:27:24,AUD
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,1149.85,2023-11-10,17:16:56,323.82,M008,Purchase,Brisbane,Mobile,-31.840233,145.612793,fordevan@gmail.com,False,105,0,AUD,GBP,2023-11-10 17:16:56,AUD
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,89649.04,4335.7,2023-10-07,00:34:17,32.49,M001,Purchase,Darwin,Mobile,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70,1,AED,AUD,2023-10-07 00:34:17,AED
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,43601.02,4763.48,2023-09-22,18:40:08,1140.75,M001,Withdrawal,Melbourne,Tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27,0,AED,AUD,2023-09-22 18:40:08,AED


In [438]:
# Save to file
df.to_csv('data_clean/train.csv', index=False)