In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/Users/nbakhati/Library/CloudStorage/OneDrive-UniversityofNebraskaatOmaha/Fraud Detection/Code/train_id&T.csv')

After doing some feature analysis on the EDA side, we will do some feature engineering before model selection by adding new columns.

In [3]:
# Prepare a single script block to add engineered features to the dataset
def engineer_features(df):

    # Log transformation of transaction amount
    df['TransactionAmt_log'] = np.log1p(df['TransactionAmt'])

    # Frequency encoding
    df['card1_freq'] = df['card1'].map(df['card1'].value_counts())
    df['card2_freq'] = df['card2'].map(df['card2'].value_counts())

    # Extract hour from TransactionDT
    df['TransactionHour'] = (df['TransactionDT'] // 3600) % 24

    # US address inference
    df['is_US'] = df['addr1'].between(100, 999)

    # US email domains
    us_domains = [
        'gmail.com', 'yahoo.com', 'hotmail.com', 'aol.com', 'outlook.com', 'mail.com',
        'comcast.net', 'att.net', 'verizon.net', 'icloud.com', 'live.com', 'ymail.com',
        'msn.com', 'sbcglobal.net', 'bellsouth.net', 'me.com', 'optonline.net',
        'cox.net', 'charter.net', 'rocketmail.com'
    ]

    # Domain category mapping function
    def map_email_type(domain):
        if pd.isna(domain):
            return 'missing'
        elif domain in us_domains:
            return 'us'
        elif '.mx' in domain:
            return 'mx'
        elif '.es' in domain:
            return 'es'
        elif '.de' in domain:
            return 'de'
        elif '.co.uk' in domain:
            return 'uk'
        elif '.co.jp' in domain:
            return 'jp'
        elif 'protonmail' in domain:
            return 'private'
        elif domain in ['anonymous.com', 'gmail']:
            return 'suspicious'
        else:
            return 'other'

    df['P_email_cat'] = df['P_emaildomain'].apply(map_email_type)
    df['R_email_cat'] = df['R_emaildomain'].apply(map_email_type)

    return df

# Apply feature engineering
df = engineer_features(df)

# Show updated columns
df[['TransactionAmt_log', 'card1_freq', 'card2_freq', 'TransactionHour', 'is_US', 'P_email_cat', 'R_email_cat']].head()


Unnamed: 0,TransactionAmt_log,card1_freq,card2_freq,TransactionHour,is_US,P_email_cat,R_email_cat
0,4.241327,43,,0,True,missing,missing
1,3.401197,683,3056.0,0,True,us,missing
2,4.094345,1108,38145.0,0,True,us,missing
3,3.931826,4209,6137.0,0,True,us,missing
4,3.931826,18,14541.0,0,True,us,missing


Dropped Columns Based on Missingness

In [4]:
# Calculate missing value percentage
missing_percent = df.isnull().mean()

# Identify columns to drop (>30% missing)
cols_over_30 = missing_percent[missing_percent > 0.3].index.tolist()

# Exclude specific columns from being dropped
exclude_cols = ['id_30', 'id_31', 'DeviceInfo']

cols_over_30 = [col for col in cols_over_30 if col not in exclude_cols]
df.drop(columns=cols_over_30, inplace=True)


In [5]:
# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include='object').columns

# Impute numeric columns with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Impute categorical columns with 'UNK'
df[cat_cols] = df[cat_cols].fillna('UNK')

# Confirm imputation
df[num_cols].isnull().sum().sum(), df[cat_cols].isnull().sum().sum()


(0, 0)

In [6]:
df.head

<bound method NDFrame.head of         TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  \
0             2987000        0          86400           68.50         W   
1             2987001        0          86401           29.00         W   
2             2987002        0          86469           59.00         W   
3             2987003        0          86499           50.00         W   
4             2987004        0          86506           50.00         H   
...               ...      ...            ...             ...       ...   
590535        3577535        0       15811047           49.00         W   
590536        3577536        0       15811049           39.50         W   
590537        3577537        0       15811079           30.95         W   
590538        3577538        0       15811088          117.00         W   
590539        3577539        0       15811131          279.95         W   

        card1  card2  card3       card4  card5  ...        id_30  \
0

In [7]:
output_path = "/Users/nbakhati/Library/CloudStorage/OneDrive-UniversityofNebraskaatOmaha/Fraud Detection/Code/new_fraud_dataset.csv"
df.to_csv(output_path, index=False)