In [2]:
# Libraries
import pandas as pd
import numpy as np

# Load datasets
fraud_df = pd.read_csv("../data/raw/Fraud_Data.csv")
credit_df = pd.read_csv("../data/creditcard/creditcard.csv")

# Quick overview
display(fraud_df.head())
display(credit_df.head())


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Fraud dataset
fraud_df.drop_duplicates(inplace=True)
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])
fraud_df['ip_address'] = fraud_df['ip_address'].astype(str)

# Convert credit card 'Amount' and 'Time' types if necessary
credit_df['Amount'] = credit_df['Amount'].astype(float)
credit_df['Time'] = credit_df['Time'].astype(float)

# Confirm changes
fraud_df.info()
credit_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  object        
 6   browser         151112 non-null  object        
 7   sex             151112 non-null  object        
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  object        
 10  class           151112 non-null  int64         
dtypes: datetime64[ns](2), int64(4), object(5)
memory usage: 12.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Nu

In [11]:
# ================================
# Cell 4: Feature Engineering
# ================================

# ----------------
# Time-based features
# ----------------
fraud_df['purchase_hour'] = fraud_df['purchase_time'].dt.hour
fraud_df['purchase_day'] = fraud_df['purchase_time'].dt.dayofweek

fraud_df['signup_to_purchase_days'] = (
    fraud_df['purchase_time'] - fraud_df['signup_time']
).dt.total_seconds() / (24 * 3600)

# ----------------
# Transaction value features
# ----------------
fraud_df['log_purchase_value'] = np.log1p(fraud_df['purchase_value'])

# Avoid divide-by-zero
fraud_df['value_per_day'] = fraud_df['purchase_value'] / (
    fraud_df['signup_to_purchase_days'].replace(0, np.nan)
)

# ----------------
# IP-based feature
# ----------------
# IPs are stored as floats in this dataset → truncate decimals safely
fraud_df['ip_int'] = fraud_df['ip_address'].apply(
    lambda x: int(float(x)) if pd.notnull(x) else np.nan
)

# ----------------
# Device-based behavioral features (BEST PRACTICE)
# ----------------
fraud_df['device_tx_count'] = fraud_df.groupby('device_id')['user_id'].transform('count')
fraud_df['device_user_count'] = fraud_df.groupby('device_id')['user_id'].transform('nunique')

# ----------------
# Categorical encoding (safe for EDA & baseline models)
# ----------------
fraud_df['browser_code'] = pd.factorize(fraud_df['browser'])[0]
fraud_df['source_code'] = pd.factorize(fraud_df['source'])[0]
fraud_df['sex_code'] = pd.factorize(fraud_df['sex'])[0]

# ----------------
# Final sanity check
# ----------------
fraud_df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,...,purchase_day,signup_to_purchase_days,log_purchase_value,ip_int,browser_code,value_per_day,device_tx_count,device_user_count,source_code,sex_code
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368.79972,...,5,52.160671,3.555348,732758368,0,0.6518321,1,1,0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387.865908,...,0,0.207685,2.833213,350311387,0,77.03968,1,1,1,1
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820.11095,...,3,1.2e-05,2.772589,2621473820,1,1296000.0,12,12,0,0
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443.91396,...,0,5.695428,3.806662,3840542443,2,7.725495,1,1,0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117.452712,...,2,50.479873,3.688879,415583117,2,0.7725851,1,1,1,0


In [12]:
# Log-transform transaction amount to reduce skew
credit_df['log_amount'] = np.log1p(credit_df['Amount'])

# Create time-based features (e.g., hour of transaction)
credit_df['transaction_hour'] = (credit_df['Time'] % (24*3600)) // 3600

# Optional: normalize PCA features if needed
from sklearn.preprocessing import StandardScaler

pca_cols = [col for col in credit_df.columns if col.startswith('V')]
scaler = StandardScaler()
credit_df[pca_cols] = scaler.fit_transform(credit_df[pca_cols])

# Quick check
credit_df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V24,V25,V26,V27,V28,Amount,Class,log_amount,transaction_hour,amount_to_mean
0,0.0,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,...,0.110507,0.246585,-0.39217,0.330892,-0.063781,149.62,0,5.01476,0.0,1.693499
1,0.0,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,...,-0.561131,0.320694,0.261069,-0.022256,0.044608,2.69,0,1.305626,0.0,0.030447
2,1.0,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,...,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,378.66,0,5.939276,0.0,4.285927
3,1.0,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,...,-1.941027,1.241904,-0.460217,0.155396,0.186189,123.5,0,4.824306,0.0,1.397855
4,2.0,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,...,0.23325,-0.395202,1.041611,0.54362,0.651816,69.99,0,4.262539,0.0,0.792194


In [13]:
# Fraud dataset: interaction feature example
fraud_df['value_per_day'] = fraud_df['purchase_value'] / (fraud_df['signup_to_purchase_days'] + 1)

# Credit dataset: aggregated features example
credit_df['amount_to_mean'] = credit_df['Amount'] / credit_df['Amount'].mean()

# Check
fraud_df[['purchase_value','signup_to_purchase_days','value_per_day']].head()
credit_df[['Amount','amount_to_mean']].head()


Unnamed: 0,Amount,amount_to_mean
0,149.62,1.693499
1,2.69,0.030447
2,378.66,4.285927
3,123.5,1.397855
4,69.99,0.792194


In [18]:
# ================================
# Encode country feature (Fraud dataset)
# ================================

if 'country' not in fraud_df.columns:
    print(" 'country' column not found. Skipping country encoding.")
else:
    fraud_df['country'] = fraud_df['country'].fillna('Unknown')

    fraud_df = pd.get_dummies(
        fraud_df,
        columns=['country'],
        prefix='country',
        drop_first=True
    )

    print(" Country feature encoded successfully")

# ================================
# Credit card dataset check
# ================================

credit_df.head()


 'country' column not found. Skipping country encoding.


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V24,V25,V26,V27,V28,Amount,Class,log_amount,transaction_hour,amount_to_mean
0,0.0,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,...,0.110507,0.246585,-0.39217,0.330892,-0.063781,149.62,0,5.01476,0.0,1.693499
1,0.0,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,...,-0.561131,0.320694,0.261069,-0.022256,0.044608,2.69,0,1.305626,0.0,0.030447
2,1.0,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,...,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,378.66,0,5.939276,0.0,4.285927
3,1.0,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,...,-1.941027,1.241904,-0.460217,0.155396,0.186189,123.5,0,4.824306,0.0,1.397855
4,2.0,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,...,0.23325,-0.395202,1.041611,0.54362,0.651816,69.99,0,4.262539,0.0,0.792194


In [8]:
# Fraud dataset features
print("Fraud dataset features:")
print(fraud_df.columns.tolist())

# Credit dataset features
print("Credit dataset features:")
print(credit_df.columns.tolist())

# Ready for modeling


Fraud dataset features:
['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class', 'purchase_hour', 'purchase_day', 'signup_to_purchase_days', 'log_purchase_value', 'ip_int', 'browser_code', 'value_per_day']
Credit dataset features:
['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class', 'log_amount', 'transaction_hour', 'amount_to_mean']
