### Importing Libraries

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score

### Read and Check Data

In [None]:
# Load the raw transactions file into a DataFrame and preview the first rows.
transactions_csv = 'Final Transactions.csv'
df = pd.read_csv(transactions_csv)
df.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of missing values in each column (isnull is the pandas alias of isna).
df.isnull().sum()

### Understanding and Analyzing the Data

In [None]:
# Class balance of the target column.
# value_counts() orders by frequency, so the positional labels/colours below
# would silently swap if the majority class ever changed. Sorting by class
# label pins the order to 0 (legitimate) then 1 (fraudulent).
# NOTE(review): assumes TX_FRAUD is coded 0/1, as the labels imply.
fraud_distribution = df['TX_FRAUD'].value_counts().sort_index()

# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(5, 5))
fraud_distribution.plot.pie(
    ax=ax,
    labels=['Legitimate Transaction (TARGET=0)', 'Fraudulent Transaction (TARGET=1)'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['#1f77b4', '#d62728'],
)
ax.set_title('Allocation of TX_FRAUD')
ax.set_ylabel('')  # hide the default y-label (the Series name)
plt.show()

Since the data is quite unbalanced, with 13.5% fraudulent transactions and 86.5% legitimate transactions, we use SMOTE (Synthetic Minority Oversampling Technique) to address the imbalance. 

However, before applying SMOTE, we split the data to prevent oversampling from affecting the validation dataset, ensuring the model is evaluated on real, unaltered data.

In [28]:
# Columns used for modelling. TX_FRAUD is the target; it stays in this frame
# because X and y are later derived from `feature`.
feature = df[['CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TX_FRAUD']]

# Standardize the predictor columns only. The target must not go through the
# scaler: standardizing a class label is meaningless and mixes the label into
# the fitted statistics.
# NOTE(review): this scaler is fitted on every row, including rows that later
# end up in the test split. Use it for exploration only; for model evaluation
# fit a scaler on the training split.
scaler = StandardScaler()
standardized_features = scaler.fit_transform(feature.drop(columns=['TX_FRAUD']))

We select specific features to feed into the model. Then, we use StandardScaler to standardize the numerical features, which helps improve model performance.

In [29]:
# Separate predictors from the target.
X = feature.drop(columns=['TX_FRAUD'])
y = feature['TX_FRAUD']

# Stratify on the target so the minority (fraud) class keeps the same share in
# both splits; with an imbalanced target a plain random split can distort it.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize using statistics learned from the training split only, then apply
# the same transform to the test split. Fitting on the training rows alone
# keeps test-set information out of preprocessing. The results are wrapped back
# into DataFrames so column names and row indices are preserved.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

In [30]:
# Oversample the minority class in the training split only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Rebuild a single frame (features + target) from the resampled data, then
# shuffle the rows reproducibly and renumber the index.
resampled_features = pd.DataFrame(X_train_resampled, columns=X_train.columns)
resampled_target = pd.DataFrame(y_train_resampled, columns=['TX_FRAUD'])
df_train_resampled = (
    pd.concat([resampled_features, resampled_target], axis=1)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Class balance of the training data after SMOTE.
# After oversampling the two classes have equal counts, so the frequency
# ordering of value_counts() no longer identifies which class comes first.
# Sorting by class label guarantees slice 0 is class 0 (legitimate) and
# slice 1 is class 1 (fraudulent), matching the labels and colours below.
# NOTE(review): assumes TX_FRAUD is coded 0/1, as the labels imply.
fraud_distribution_resampled = df_train_resampled['TX_FRAUD'].value_counts().sort_index()

# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(5, 5))
fraud_distribution_resampled.plot.pie(
    ax=ax,
    labels=['Legitimate Transaction (TARGET=0)', 'Fraudulent Transaction (TARGET=1)'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['#1f77b4', '#d62728'],
)
ax.set_title('Allocation of TX_FRAUD After SMOTE')
ax.set_ylabel('')  # hide the default y-label (the Series name)
plt.show()

### Modeling

In [None]:
# Candidate classifiers. Seeds are fixed on every estimator that accepts one
# so that a Restart & Run All reproduces the same scores.
# max_iter is raised for LogisticRegression because the default iteration
# budget may not be enough for the solver to converge on this data.
models = {"Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
          "K-Nearest Neighbors": KNeighborsClassifier(),
          "Decision Tree": DecisionTreeClassifier(random_state=42),
          "Random Forest": RandomForestClassifier(random_state=42),
          "Gradient Boosting": GradientBoostingClassifier(random_state=42)}

for name, model in models.items():
    # Train on the SMOTE-balanced training data; evaluate on the untouched,
    # real test split.
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test)
    # Probability of the positive class (second column of predict_proba),
    # needed for the threshold-free average-precision score.
    y_score = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    print(f"{name} accuracy: {acc:.3f}")

    # Accuracy alone is misleading on an imbalanced target (always predicting
    # the majority class already scores high), so also report metrics that
    # focus on the fraud class.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    ap = average_precision_score(y_test, y_score)
    print(f"    precision: {prec:.3f} | recall: {rec:.3f} | f1: {f1:.3f} | average precision: {ap:.3f}")

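So, Random Forest and Gradient Boosting give the highest accuracy. Next, we use GridSearchCV to check whether hyperparameter tuning can improve performance. The cell below tunes a Decision Tree, the tree model underlying both ensembles, and scores candidates with macro-averaged F1.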

### Model Optimization

In [None]:
# Base estimator to tune; seeded so the search is reproducible.
dt_clf = DecisionTreeClassifier(random_state=42)

# Hyperparameter values explored exhaustively by the grid search.
param_grid = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10]
}

# 5-fold cross-validated search, ranked by macro-averaged F1.
grid_search = GridSearchCV(dt_clf, param_grid, cv=5, scoring='f1_macro')

# Tune on the training split only. Fitting the search on the test split would
# select hyperparameters using the very data meant for the final evaluation,
# which makes any later test score optimistic. The original (non-oversampled)
# training split is used so each CV fold contains only real transactions.
grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print(f"Best cross-validated macro-F1: {grid_search.best_score_:.3f}")

# Final check on the held-out test split. GridSearchCV refits the best
# configuration on the whole training split by default (refit=True).
test_f1_macro = f1_score(y_test, grid_search.predict(X_test), average='macro')
print(f"Held-out test macro-F1: {test_f1_macro:.3f}")