In [2]:
import pandas as pd

In [3]:
from constants import DATA_DIR
# Read Base.csv from the "2" sub-folder of the project data directory.
base_csv_path = DATA_DIR / "2" / "Base.csv"
df = pd.read_csv(base_csv_path)

In [None]:
# Lift the column display limit so wide previews are not truncated
# (the setting is global and stays active for later cells).
pd.options.display.max_columns = None
df.head(n=10)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded frame.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

### Dealing with missing values

In [None]:
# Report how often each column holds -1, a value commonly used as a missing-value placeholder.
print("The following columns have values of -1, which are often used as a NaN (missing value):\n")

# Count the -1 entries of all columns at once, then list only the columns that have any.
sentinel_counts = df.eq(-1).sum()
for column, n_missing in sentinel_counts[sentinel_counts > 0].items():
    print(f"{column}: {n_missing}")


In [None]:
# For the features with comparatively few missing values, replace the -1 placeholders
# with the median of the feature. The median is taken over the valid entries only:
# leaving the -1 placeholders in would pull the fill value towards the sentinel.
# NOTE(review): -1 may be a legitimate value for features that can be negative
# (e.g. a score) — confirm against the dataset documentation.
for column in ["credit_risk_score", "device_distinct_emails_8w", "session_length_in_minutes", "current_address_months_count"]:
    mask = df[column] == -1
    valid_values = df.loc[~mask, column]

    # Skip columns with nothing to impute, or with nothing to impute from.
    if not mask.any() or valid_values.empty:
        continue

    fill_value = valid_values.median()
    # Integer columns receive a rounded median so their dtype is preserved;
    # float columns keep the exact median instead of a truncated one.
    if pd.api.types.is_integer_dtype(df[column]):
        fill_value = int(round(fill_value))

    df.loc[mask, column] = fill_value

In [None]:
# device_fraud_count is dropped because all of its values are 0, so it carries no signal.
# errors="ignore" keeps the cell re-runnable: `df` is rebound here, so a second run
# would otherwise raise a KeyError for the already removed column.
df = df.drop(columns=["device_fraud_count"], errors="ignore")

### Dealing with categorical values

In [None]:
# Print the distinct values of every object-typed (categorical) column.
# Some of these columns appear to contain integers; this is caused by rows
# that hold one and the same integer value in every column.

categorical_cols = df.dtypes[df.dtypes == 'object'].index
df_categorical = df.loc[:, categorical_cols]

for column, series in df_categorical.items():
    print(f"{column}:")
    print(series.unique())

In [None]:
# Some rows seem to hold one single value (one of 1, 5, 52, 122) in every column.
# They are located with a vectorised comparison on the underlying numpy array.
import numpy as np

# Compare every cell with the first cell of its own row; a row is flagged only
# when all comparisons succeed. NaN never compares equal, so a row that
# contains a NaN is never flagged.
df_np = df.to_numpy()
first_column = df_np[:, :1]
same_value_mask = (df_np == first_column).all(axis=1)
rows_with_same_value = pd.Series(data=same_value_mask, index=df.index)

df[rows_with_same_value].head()

In [None]:
# Collect the index labels of the flagged rows, report how many there are, and drop them.
rows_to_remove = df.loc[rows_with_same_value].index.tolist()
n_rows_to_remove = len(rows_to_remove)

print(f"Rows where all columns have the same value: {n_rows_to_remove}")

df = df.drop(labels=rows_to_remove, axis=0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overall fraud rate, drawn as a reference line in every chart.
mean_fraud_rate = df['fraud_bool'].mean()

# One bar chart per categorical feature: bar height is the mean fraud rate of a
# category, and the number of rows in that category is written inside the bar.
for col in df.select_dtypes(include='object').columns:
    fraud_rate = df.groupby(col)['fraud_bool'].mean()
    counts = df[col].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=fraud_rate.index, y=fraud_rate.values, color='skyblue', ax=ax)

    # Dashed reference line at the overall fraud rate.
    ax.axhline(mean_fraud_rate, color='red', linestyle='--', label=f'Mean fraud rate ({mean_fraud_rate:.4f})')

    # Both series are sorted by category, so position i is the i-th bar.
    for position, (label, count) in enumerate(counts.items()):
        bar_height = fraud_rate[label]
        # Dark text on low bars, light text on tall bars, for contrast.
        if bar_height < 0.5:
            text_color = "black"
        else:
            text_color = "white"
        ax.text(
            position,
            bar_height / 2,  # vertically centred inside the bar
            f"n={count}",
            ha='center',
            va='center',
            fontsize=10,
            color=text_color,
        )

    ax.set_title(f'Fraud Ratio and Counts for {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Mean fraud_bool')
    ax.set_ylim(0, 1.1 * max(fraud_rate.max(), mean_fraud_rate))
    ax.legend()
    fig.tight_layout()
    plt.show()


## Turn the categorical columns into one-hot encoded features

In [None]:
# One-hot encode every object-typed column; the first level of each feature is
# dropped (drop_first=True), and the column count is reported before and after.
categorical_types = df.select_dtypes(include=['object']).columns

n_columns_before = df.shape[1]
print(f"Number of columns before one-hot encoding: {n_columns_before}")

df = pd.get_dummies(df, columns=categorical_types, drop_first=True)

n_columns_after = df.shape[1]
print(f"Number of columns after one-hot encoding: {n_columns_after}")

In [None]:
# Fraud rate and number of rows for every distinct income value.
income_fraud_rate = df.groupby('income')['fraud_bool'].mean()
income_counts = df['income'].value_counts().sort_index()
mean_fraud_rate = df['fraud_bool'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
income_labels = [f"{x:.2f}" for x in income_fraud_rate.index]
bar = sns.barplot(x=income_labels, y=income_fraud_rate.values, color='skyblue', ax=ax)

# Write the row count in the middle of each bar; the text colour depends on the
# bar height so that it stays readable.
for position, (income, count) in enumerate(income_counts.items()):
    rate = income_fraud_rate[income]
    if rate < 0.5:
        text_color = "black"
    else:
        text_color = "white"
    ax.text(position, rate / 2, f"n={count}", ha='center', va='center', fontsize=10, color=text_color)

# Dashed reference line at the overall fraud rate.
ax.axhline(mean_fraud_rate, color='red', linestyle='--', label=f'Mean fraudster rate ({mean_fraud_rate:.4f})')
ax.set_title('Mean Fraudster Ratio per Income Value')
ax.set_xlabel('Income')
ax.set_ylabel('Mean fraud_bool')
ax.set_ylim(0, 1.1 * max(income_fraud_rate.max(), mean_fraud_rate))
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Preview the first rows of the frame in its current state.
df.head()

In [None]:
# Column dtypes and non-null counts of the frame in its current state.
df.info()

In [None]:
# Project helpers from load_data; their implementations are not visible in this notebook.
from load_data import split_data, preprocess_data, normalize
# NOTE(review): `df` has been modified by the cells above before it reaches
# preprocess_data — confirm the helper expects that input rather than the raw CSV frame.
df_preprocessed = preprocess_data(df)

# Preview the result of the preprocessing step.
df_preprocessed.head()


In [None]:
# split_data (from load_data) is unpacked into six objects: features and labels for
# presumably the train / validation / test partitions.
# NOTE(review): split ratios, shuffling and stratification live in load_data — confirm there.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_preprocessed)
# Show all columns when previewing the training features.
pd.set_option('display.max_columns', None)
X_train.head(20)

In [None]:
# normalize (from load_data) returns one transformed frame per split plus a `scaler` object.
# NOTE(review): presumably the scaler is fitted on X_train only and then applied to the
# validation and test splits — confirm in load_data.normalize.
X_train_norm, X_val_norm, X_test_norm, scaler = normalize(X_train, X_val, X_test)
# Show all columns when previewing the normalized training features.
pd.set_option('display.max_columns', None)
X_train_norm.head(20)

In [None]:
# Train an XGBoost classifier and report its quality on the validation split.
from sklearn.metrics import classification_report
import xgboost as xgb

# Class imbalance is addressed with scale_pos_weight, a global weight for the
# positive class (label=1): (weight for 1) / (weight for 0) = 99 / 1 = 99.
scale_pos_weight = 99

# For finer control, per-row weights can be passed to fit() instead, e.g.
#   sample_weight = np.where(y_train == 1, 99, 1)
#   model.fit(X_train, y_train, sample_weight=sample_weight)
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
)
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split (the un-normalized features are used here).
model.fit(X_train, y_train)

# Predict the validation split and print precision / recall / F1 per class.
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
import matplotlib.pyplot as plt

# Scatter of the credit risk score against the fraud label, followed by their
# Pearson correlation. The axis labels and title name the plotted columns.
plt.scatter(
    df["credit_risk_score"],
    df["fraud_bool"],
    alpha=0.4,
)

plt.xlabel("Credit risk score")
plt.ylabel("fraud_bool")
plt.title("Credit Risk Score vs Fraud Label")

plt.grid(True, linestyle="--", linewidth=0.5)
plt.tight_layout()
plt.show()

# Pearson correlation between the score and the fraud label.
corr = df["credit_risk_score"].corr(df["fraud_bool"])
print(f"Pearson correlation: {corr:.4f}")