# Preprocessing Script




## Import Libraries

In [3]:
import pandas as pd
import numpy as np

In [2]:
#preprocessing using sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import joblib


## Load dataset

In [3]:
# Read the transactions CSV into the working frame `df`.
# NOTE(review): the feature lists defined later in this notebook name
# 'log_amount', 'dayofweek' and 'time_of_day', none of which appear among
# this file's columns — confirm this is the intended input file.
df = pd.read_csv(filepath_or_buffer='../data/kenya_fraud_detection_dataset.csv')

## Check data

In [4]:
# Quick sanity check: print the frame's (rows, columns) tuple right after loading.
print("shape:", df.shape)

shape: (10000, 14)


In [5]:
# Print every column label as a plain list so the schema can be read at a glance.
print("\ncolumns:", df.columns.tolist())



columns: ['Unnamed: 0', 'transaction_id', 'user_id', 'transaction_type', 'amount', 'location', 'device_type', 'network_provider', 'user_type', 'time_of_day(morning, afternoon, evening, night)', 'is_foreign_number', 'is_sim_recently_swapped', 'has_multiple_accounts', 'datetime']


In [6]:
# Print the dtype pandas assigned to each column when reading the CSV.
print("\ndtypes:\n", df.dtypes)


dtypes:
 Unnamed: 0                                           int64
transaction_id                                      object
user_id                                             object
transaction_type                                    object
amount                                             float64
location                                            object
device_type                                         object
network_provider                                    object
user_type                                           object
time_of_day(morning, afternoon, evening, night)    float64
is_foreign_number                                    int64
is_sim_recently_swapped                              int64
has_multiple_accounts                                int64
datetime                                            object
dtype: object


In [7]:
# Render the first rows of the frame (DataFrame.head defaults to 5 rows)
# through the notebook's rich display.
display(df.head())

Unnamed: 0.1,Unnamed: 0,transaction_id,user_id,transaction_type,amount,location,device_type,network_provider,user_type,"time_of_day(morning, afternoon, evening, night)",is_foreign_number,is_sim_recently_swapped,has_multiple_accounts,datetime
0,0,TX100000,user_8270,Withdraw Cash,2646.35,Nakuru,Feature Phone,Telkom Kenya,individual,,0,0,0,2024-06-16 21:45:13
1,1,TX100001,user_1860,Send Money,2844.69,Garissa,iOS,Safaricom,agent,,0,0,0,2024-06-05 0:49:25
2,2,TX100002,user_6390,Deposit Cash,2384.46,Nyeri,Feature Phone,Telkom Kenya,agent,,0,0,1,2024-06-13 15:54:02
3,3,TX100003,user_6191,Withdraw Cash,1846.01,Nairobi,iOS,Safaricom,individual,,0,0,1,2024-06-10 1:05:49
4,4,TX100004,user_6734,Send Money,1017.6,Machakos,Feature Phone,Telkom Kenya,individual,,0,0,0,2024-06-27 2:28:53


## Drop unnecessary columns

In [8]:
# Remove the columns listed in `drop_cols`; none of them appear in the feature
# lists used by the preprocessor. Labels that are absent from the frame are
# skipped via errors='ignore', so re-running this cell is harmless.
drop_cols = ['transaction_id', 'user_id', 'datetime', 'date', 'hour', 'amount']
df = df.drop(columns=drop_cols, errors='ignore')

# Report what is left after the drop.
print("After drop — shape:", df.shape)
print("Remaining columns:", df.columns.tolist())

After drop — shape: (10000, 10)
Remaining columns: ['Unnamed: 0', 'transaction_type', 'location', 'device_type', 'network_provider', 'user_type', 'time_of_day(morning, afternoon, evening, night)', 'is_foreign_number', 'is_sim_recently_swapped', 'has_multiple_accounts']


## Define which columns are numerical or categorical

In [9]:
# Define the feature groups the preprocessor is expected to consume.
numeric_features = [
    'log_amount',      # numeric (created during EDA)
    'dayofweek',       # numeric 0-6
    'is_foreign_number',
    'is_sim_recently_swapped',
    'has_multiple_accounts'
]

categorical_features = [
    'transaction_type',
    'location',
    'device_type',
    'network_provider',
    'user_type',
    'time_of_day'
]

# Keep only the features that actually exist in df (order preserved) ...
numeric_present = [c for c in numeric_features if c in df.columns]
categorical_present = [c for c in categorical_features if c in df.columns]

# ... and track the expected features that are absent, so they are not
# silently left out of the preprocessing pipeline.
numeric_missing = [c for c in numeric_features if c not in df.columns]
categorical_missing = [c for c in categorical_features if c not in df.columns]

print("Numeric present:", numeric_present)
print("Categorical present:", categorical_present)

# Make the gap explicit: anything listed here will NOT be scaled/encoded and
# will not appear in the saved preprocessor or the preprocessed CSV.
if numeric_missing or categorical_missing:
    print(
        "WARNING: expected features missing from df and excluded from preprocessing:",
        numeric_missing + categorical_missing,
    )

Numeric present: ['is_foreign_number', 'is_sim_recently_swapped', 'has_multiple_accounts']
Categorical present: ['transaction_type', 'location', 'device_type', 'network_provider', 'user_type']


In [10]:
# Print the number of distinct (non-null) values in each categorical column
# that is present; this helps decide how each one should be encoded.
for column, n_unique in df[categorical_present].nunique().items():
    print(column, "unique:", n_unique)


transaction_type unique: 14
location unique: 10
device_type unique: 3
network_provider unique: 3
user_type unique: 2


## Build a preprocessing pipeline

In [12]:
# numeric transformer: standardise each numeric column
num_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# categorical transformer: one-hot encode; categories not seen during fit
# are encoded as all zeros at transform time instead of raising
# NOTE(review): the one-hot output recorded in this notebook contains several
# transaction_type columns that differ only in whitespace (e.g.
# 'transaction_type_ Buy Airtime' vs 'transaction_type_Buy Airtime') —
# consider normalising category strings before fitting.
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# build column transformer; columns in neither group are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_present),
        ('cat', cat_transformer, categorical_present)
    ],
    remainder='drop'
)

# fit + transform
X_pre = preprocessor.fit_transform(df)

# ColumnTransformer only returns a scipy sparse matrix when the stacked output
# is sparse enough (see its `sparse_threshold`); otherwise the result is
# already a dense ndarray, which has no .toarray(). Densify only when needed.
X_dense = X_pre.toarray() if hasattr(X_pre, "toarray") else np.asarray(X_pre)

# feature names: numeric columns keep their names, one-hot columns are expanded
# to '<column>_<category>'. Output column order follows the transformer order
# above (numeric first, then categorical).
num_out = numeric_present
if categorical_present:
    cat_out = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_present)
else:
    # With no categorical columns the 'cat' pipeline is never fitted, so
    # asking it for feature names would raise.
    cat_out = []
feature_names = list(num_out) + list(cat_out)

# make dataframe, aligned to the original row index
X_df = pd.DataFrame(X_dense, columns=feature_names, index=df.index)

print("Preprocessed feature shape:", X_df.shape)
display(X_df.head())

Preprocessed feature shape: (10000, 35)


Unnamed: 0,is_foreign_number,is_sim_recently_swapped,has_multiple_accounts,transaction_type_ Withdraw Cash,transaction_type_ Buy Airtime,transaction_type_ Send Money,transaction_type_ Buy Airtime.1,transaction_type_Buy Airtime,transaction_type_Deposit Cash,transaction_type_Lipa na M-Pesa,...,location_Nyeri,location_Thika,device_type_Android,device_type_Feature Phone,device_type_iOS,network_provider_Airtel,network_provider_Safaricom,network_provider_Telkom Kenya,user_type_agent,user_type_individual
0,-0.184443,-0.23373,-0.326063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-0.184443,-0.23373,-0.326063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,-0.184443,-0.23373,3.066892,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,-0.184443,-0.23373,3.066892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,-0.184443,-0.23373,-0.326063,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


## Save Outputs

In [13]:
# Write the preprocessed feature frame to CSV; index=False omits the row index.
X_df.to_csv(path_or_buf="../data/X_preprocessed.csv", index=False)

In [14]:
# Serialise the fitted preprocessor to disk with joblib.
# joblib.dump returns the list of file names it wrote, which the notebook
# shows as this cell's output.
joblib.dump(value=preprocessor, filename="../models/preprocessor.joblib")

['../models/preprocessor.joblib']

In [15]:
# Confirmation message only; it does not verify that either file exists on disk.
print("Saved X_preprocessed.csv and preprocessor.joblib")

Saved X_preprocessed.csv and preprocessor.joblib


In [4]:

# Scratch check: load the CSV at ../data/cleaned_Kenya_Fraud_data.csv and
# list its column names.
testing = pd.read_csv(filepath_or_buffer="../data/cleaned_Kenya_Fraud_data.csv")
print(list(testing.columns))


['transaction_id', 'user_id', 'transaction_type', 'amount', 'location', 'device_type', 'network_provider', 'user_type', 'is_foreign_number', 'is_sim_recently_swapped', 'has_multiple_accounts', 'datetime', 'date', 'hour', 'dayofweek', 'time_of_day', 'log_amount']


In [5]:
# Scratch check: load the CSV at ../data/transactions_with_anomalies.csv and
# list its column names.
testing2 = pd.read_csv(filepath_or_buffer="../data/transactions_with_anomalies.csv")
print(list(testing2.columns))

['is_foreign_number', 'is_sim_recently_swapped', 'has_multiple_accounts', 'transaction_type_   Withdraw   Cash', 'transaction_type_  Buy    Airtime', 'transaction_type_  Send   Money', 'transaction_type_ Buy   Airtime  ', 'transaction_type_Buy Airtime', 'transaction_type_Deposit Cash', 'transaction_type_Lipa   na M-Pesa', 'transaction_type_Lipa na    M-Pesa', 'transaction_type_Lipa na M-Pesa', 'transaction_type_Pay    Bill', 'transaction_type_Pay Bill', 'transaction_type_Send   Money', 'transaction_type_Send Money', 'transaction_type_Withdraw Cash', 'location_Eldoret', 'location_Garissa', 'location_Kisumu', 'location_Machakos', 'location_Meru', 'location_Mombasa', 'location_Nairobi', 'location_Nakuru', 'location_Nyeri', 'location_Thika', 'device_type_Android', 'device_type_Feature Phone', 'device_type_iOS', 'network_provider_Airtel', 'network_provider_Safaricom', 'network_provider_Telkom Kenya', 'user_type_agent', 'user_type_individual']
