In [1]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#

In [65]:
train_dataset = pd.read_csv("../dataset/fraudTrain.csv")
test_dataset = pd.read_csv("../dataset/fraudTest.csv")
train_dataset.shape, test_dataset.shape
df = pd.concat([train_dataset, test_dataset])
df.to_csv("../dataset/CreditCardData.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 555718
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  object 
 1   cc_num                 int64  
 2   merchant               object 
 3   category               object 
 4   amt                    float64
 5   first                  object 
 6   last                   object 
 7   gender                 object 
 8   street                 object 
 9   city                   object 
 10  state                  object 
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    object 
 16  dob                    object 
 17  trans_num              object 
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
dtypes: float64(5), int64(5),

In [66]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
# Extract date and time separately
df['trans_date'] = df['trans_date_trans_time'].dt.strftime("%Y-%m-%d")
df['trans_date'] = pd.to_datetime(df['trans_date'])
df['dob']=pd.to_datetime(df['dob'])
df['trans_month'] = pd.DatetimeIndex(df['trans_date']).month
df['trans_year'] = pd.DatetimeIndex(df['trans_date']).year
df['latitude_distance'] = abs(round(df['merch_lat'] - df['lat'], 2))
df['longitude_distance'] = abs(round(df['merch_long'] - df['long'], 2))
df['gender'] = df['gender'].replace({'F': 0, 'M': 1}).astype("int64")

In [67]:
df_copy = df.copy()

In [68]:
drop_cols = ['trans_date_trans_time','cc_num','merchant','first','last','street','zip','trans_num','unix_time' ,'city','lat','long','job','dob','merch_lat','merch_long','trans_date', 'state']
df.drop(drop_cols, axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 555718
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   category            object 
 2   amt                 float64
 3   gender              int64  
 4   city_pop            int64  
 5   is_fraud            int64  
 6   trans_month         int32  
 7   trans_year          int32  
 8   latitude_distance   float64
 9   longitude_distance  float64
dtypes: float64(3), int32(2), int64(4), object(1)
memory usage: 141.3+ MB


In [69]:
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 555718
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   category            object 
 1   amt                 float64
 2   gender              int64  
 3   city_pop            int64  
 4   is_fraud            int64  
 5   trans_month         int32  
 6   trans_year          int32  
 7   latitude_distance   float64
 8   longitude_distance  float64
dtypes: float64(3), int32(2), int64(3), object(1)
memory usage: 127.2+ MB


In [49]:
df = pd.get_dummies(df, columns=['category'], drop_first=True)
df[df.filter(like='category_').columns] = df[df.filter(like='category_').columns].astype(int)
df

Unnamed: 0,amt,gender,city_pop,is_fraud,trans_month,trans_year,latitude_distance,longitude_distance,category_food_dining,category_gas_transport,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,4.97,0,3495,0,1,2019,0.07,0.87,0,0,...,0,0,0,0,1,0,0,0,0,0
1,107.23,0,149,0,1,2019,0.27,0.02,0,0,...,1,0,0,0,0,0,0,0,0,0
2,220.11,1,4154,0,1,2019,0.97,0.11,0,0,...,0,0,0,0,0,0,0,0,0,0
3,45.00,1,1939,0,1,2019,0.80,0.45,0,1,...,0,0,0,0,0,0,0,0,0,0
4,41.96,1,99,0,1,2019,0.25,0.83,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,43.77,1,519,0,12,2020,0.55,0.56,0,0,...,0,1,0,0,0,0,0,0,0,0
555715,111.84,1,28739,0,12,2020,0.62,0.75,0,0,...,0,0,0,1,0,0,0,0,0,0
555716,86.88,0,3684,0,12,2020,0.46,0.81,0,0,...,0,0,0,1,0,0,0,0,0,0
555717,7.99,1,129,0,12,2020,0.15,0.63,0,0,...,0,0,0,0,0,0,0,0,0,1


In [70]:
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

In [71]:
num_features = [ column for column in X.columns if X[column].dtype != 'object']
cat_features = [ column for column in X.columns if column not in num_features ]

In [79]:
df.category.value_counts()

category
gas_transport     188029
grocery_pos       176191
home              175460
shopping_pos      166463
kids_pets         161727
shopping_net      139322
entertainment     134118
food_dining       130729
personal_care     130085
health_fitness    122553
misc_pos          114229
misc_net           90654
grocery_net        64878
travel             57956
Name: count, dtype: int64

In [72]:
num_features, cat_features

(['amt',
  'gender',
  'city_pop',
  'trans_month',
  'trans_year',
  'latitude_distance',
  'longitude_distance'],
 ['category'])

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from category_encoders import TargetEncoder

In [87]:
# Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

from category_encoders.target_encoder import TargetEncoder  

# Initialize Target Encoder
target_encoder = TargetEncoder()

# Fit and transform on training data
X_train['category'] = target_encoder.fit_transform(X_train['category'], y_train)

# Transform test data
X_test['category'] = target_encoder.transform(X_test['category'])
X_train.head()

Unnamed: 0,category,amt,gender,city_pop,trans_month,trans_year,latitude_distance,longitude_distance
477236,0.001925,75.72,0,2504700,12,2020,0.24,0.54
60683,0.004221,79.74,0,2383912,2,2019,0.37,0.59
1062243,0.001462,74.92,1,1446,3,2020,0.18,0.82
341727,0.004221,44.3,0,2135,10,2020,0.54,0.31
279211,0.001462,28.02,1,626,10,2020,0.13,0.82


In [None]:
# Preprocessing Pipelines
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', TargetEncoder())
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ("cat", cat_transformer, cat_features)
])

# Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train, y_train)
X_test_processed = preprocessor.transform(X_test)

# Apply SMOTE
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_processed, y_train = smote.fit_resample(X_train_processed, y_train)



In [88]:
# Apply SMOTE
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Preprocessing Pipelines
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features)
])

# Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [61]:
X_train_processed.shape, y_train.shape

((2948388, 20), (2948388,))

In [85]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train_processed, y_train)

KeyboardInterrupt: 

In [89]:
# # Convert transformed NumPy array to DataFrame
# X_train_resampled = pd.DataFrame(X_train_resampled, columns=[f'feature_{i}' for i in range(X_train_resampled.shape[1])])

# # Transform test data (without SMOTE)
# X_test_transformed = preprocessor.transform(X_test)
# X_test_transformed = pd.DataFrame(X_test_transformed, columns=[f'feature_{i}' for i in range(X_test_transformed.shape[1])])

# Train Logistic Regression separately
model = LogisticRegression()
model.fit(X_train_processed, y_train)

In [90]:
# Predict
y_pred = model.predict(X_train_processed)
from sklearn.metrics import classification_report

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85   1474194
           1       0.92      0.75      0.82   1474194

    accuracy                           0.84   2948388
   macro avg       0.85      0.84      0.84   2948388
weighted avg       0.85      0.84      0.84   2948388



In [91]:
# Predict
y_pred = model.predict(X_test_processed)
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96    368549
           1       0.05      0.73      0.10      1930

    accuracy                           0.93    370479
   macro avg       0.53      0.83      0.53    370479
weighted avg       0.99      0.93      0.96    370479

