In [2]:
#IMPORT LIBRARIES 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


#from mpl_toolkits.basemap import Basemap
#from sklearn.feature_selection import mutual_info_classif


In [3]:
#LOADING DATA
data = pd.read_csv('credit_card_transactions.csv')
data = data.dropna()

x = data.drop(columns=['is_fraud'])
y = data['is_fraud']

# Set a custom sampling strategy
# Example: Maintain 2:1 ratio of majority to minority samples
# For instance, if minority class has 100 samples, we want majority class to have 200 samples
majority_count = y.value_counts().max()  # Total samples in the majority class
minority_count = y.value_counts().min()  # Total samples in the minority class

print(f"Original training set shape: {x.shape}, {y.shape}")

# Define sampling_strategy based on the desired ratio
desired_majority_count = 10 * minority_count  # Desired majority samples (2:1 ratio)
sampling_strategy = {0: desired_majority_count, 1: minority_count}  # Map class to desired counts

# Apply RandomUnderSampler with the custom strategy
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
x, y = rus.fit_resample(x, y)

# Check the shape of the new training set
print(f"Resampled training set shape: {x.shape}, {y.shape}")


Original training set shape: (1100702, 23), (1100702,)
Resampled training set shape: (69993, 23), (69993,)


In [4]:
#FEATURE ENGINEERING AND PREPROCESSING
categorical_features = ['merchant', 'category', 'gender', 'city', 'state', 'job']
numeric_features = ['amt', 'city_pop']

# Preprocessor: OneHotEncoding categorical variables and scaling numeric ones
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features), 
        ('num', StandardScaler(), numeric_features)
    ])

In [5]:
# Preprocessing pipeline (without SMOTE)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [6]:
#TRAIN AND TEST SLIP
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

#LOGISTIC REGRESSION
# Preprocess the data
x_train_preprocessed = pipeline.fit_transform(x_train)
x_test_preprocessed = pipeline.transform(x_test)

# Apply SMOTE separately after preprocessing
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_preprocessed, y_train)

# Define the parameter distribution
param_dist_logreg = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga', 'lbfgs'],
    'max_iter': [500, 1000, 2000]
}

# Initialize LogisticRegression
logreg = LogisticRegression(random_state=42)

# Use RandomizedSearchCV to search for best parameters
random_search_logreg = RandomizedSearchCV(
    logreg, param_distributions=param_dist_logreg, n_iter=10, cv=5, random_state=42, n_jobs=-1
)

# Fit the model on training data
random_search_logreg.fit(x_train_resampled, y_train_resampled)

# Print the best parameters
print("Best Parameters for Logistic Regression:", random_search_logreg.best_params_)

# Use the best estimator for predictions
best_logreg = random_search_logreg.best_estimator_
y_pred = best_logreg.predict(x_test_preprocessed)

acc_score = accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f}".format(acc_score))
rec_score = recall_score(y_test, y_pred)
print("Recall: {:.3f}".format(rec_score))
prec_score = precision_score(y_test, y_pred)
print("Precision Score: {:.3f}".format(prec_score))
F1_score = f1_score(y_test, y_pred)
print("F1-Score: {:.3f}".format(F1_score))
kappa = cohen_kappa_score(y_test, y_pred)
print("Kappa Score: {:.3f}".format(kappa))

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for Logistic Regression: {'solver': 'liblinear', 'max_iter': 1000, 'C': 100}
Accuracy: 0.890
Recall: 0.786
Precision Score: 0.445
F1-Score: 0.569
Kappa Score: 0.511
