<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [None]:
def identify_commission_rate_groups(df, transaction_col='transaction_amount',
                                     commission_col='commission_fee',
                                     min_rate_diff=0.001,  # 0.1% minimum difference
                                     min_cluster_size=5):
    """
    Identify groups of transactions by commission rates using DBSCAN.

    The commission rate (fee / amount) of every usable transaction is
    clustered in one dimension. Rows that cannot yield a usable rate are
    dropped from the result: non-positive or missing transaction amounts,
    and rows whose rate is NaN or infinite (e.g. a missing fee).

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing transaction data (never modified in place)
    transaction_col : str
        Column name for transaction amounts
    commission_col : str
        Column name for commission fees
    min_rate_diff : float
        Minimum expected difference between commission rates
        (default: 0.001 = 0.1%). Must be strictly positive.
    min_cluster_size : int
        Minimum number of transactions to form a valid cluster

    Returns:
    --------
    pandas.DataFrame
        Copy of the usable rows of the original DataFrame with added columns:
        - 'commission_rate': calculated ratio
        - 'cluster_id': cluster assignment (-1 for outliers)
        - 'cluster_commission_rate': median rate for the cluster
          (NaN for outliers)
        If no usable rows remain, an empty DataFrame carrying the same
        columns is returned and no clustering is attempted.

    Raises:
    -------
    ValueError
        If min_rate_diff is not strictly positive.
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN.
    if not min_rate_diff > 0:
        raise ValueError(
            f"min_rate_diff must be strictly positive, got {min_rate_diff!r}"
        )

    # Filter BEFORE dividing so that zero amounts never produce inf/NaN rates.
    # Comparisons with NaN are False, so missing amounts are dropped as well.
    positive_amount = df[transaction_col] > 0
    result_df = df.loc[positive_amount].copy()

    # Calculate commission rates (ratios); treat +/-inf like missing values.
    rates = result_df[commission_col] / result_df[transaction_col]
    rates = rates.replace([np.inf, -np.inf], np.nan)
    result_df['commission_rate'] = rates

    # DBSCAN rejects non-finite input, so drop rows without a usable rate.
    result_df = result_df.loc[rates.notna()].copy()

    # Nothing left to cluster: return the documented schema instead of
    # letting DBSCAN fail on a zero-sample array.
    if result_df.empty:
        result_df['cluster_id'] = np.array([], dtype=np.int64)
        result_df['cluster_commission_rate'] = np.array([], dtype=float)
        return result_df

    # Prepare data for clustering (DBSCAN expects a 2D array)
    X = result_df['commission_rate'].to_numpy(dtype=float).reshape(-1, 1)

    # eps is the neighbourhood radius. Half of min_rate_diff keeps points of
    # two rates that are min_rate_diff apart out of each other's
    # neighbourhood. NOTE: DBSCAN clusters are density-connected, so a dense
    # chain of intermediate rates can still bridge two groups.
    eps = min_rate_diff / 2

    # min_samples: minimum points to form a dense region (cluster)
    dbscan = DBSCAN(eps=eps, min_samples=min_cluster_size)
    result_df['cluster_id'] = dbscan.fit_predict(X)

    # Median commission rate per real cluster (label -1 marks outliers).
    is_clustered = result_df['cluster_id'] != -1
    cluster_rates = (
        result_df.loc[is_clustered]
        .groupby('cluster_id')['commission_rate']
        .median()
    )

    # Labels absent from cluster_rates (i.e. -1) map to NaN automatically.
    result_df['cluster_commission_rate'] = result_df['cluster_id'].map(cluster_rates)

    return result_df


def summarize_commission_groups(df_with_clusters):
    """
    Build a per-cluster summary table of the commission rate groups.

    Parameters:
    -----------
    df_with_clusters : pandas.DataFrame
        DataFrame returned by identify_commission_rate_groups; must contain
        the 'cluster_id', 'commission_rate' and 'cluster_commission_rate'
        columns.

    Returns:
    --------
    pandas.DataFrame
        One row per cluster id in ascending order (outliers, id -1, first
        when present) with the group label, row count, the cluster's
        commission rate in percent (NaN for outliers), the standard
        deviation of the raw rates, and the min / max rate in percent.
    """
    ids = df_with_clusters['cluster_id']

    def describe(cid):
        # Summary row for the transactions assigned to cluster `cid`.
        members = df_with_clusters[ids == cid]
        rates = members['commission_rate']
        is_noise = cid == -1

        if is_noise:
            group_rate_pct = np.nan
        else:
            # All rows of a cluster share one value here; take the first.
            group_rate_pct = members['cluster_commission_rate'].iloc[0] * 100

        return {
            'Group': 'Outliers' if is_noise else f'Group {cid}',
            'Cluster_ID': cid,
            'Count': len(members),
            'Commission_Rate_%': group_rate_pct,
            'Rate_Std_Dev': rates.std(),
            'Min_Rate_%': rates.min() * 100,
            'Max_Rate_%': rates.max() * 100,
        }

    return pd.DataFrame([describe(cid) for cid in sorted(ids.unique())])


def visualize_commission_groups(df_with_clusters, figsize=(14, 6),
                                transaction_col='transaction_amount',
                                commission_col='commission_fee'):
    """
    Visualize the identified commission rate groups.

    Draws two side-by-side plots and shows the figure:
    a histogram of commission rates (in percent) per cluster, and a scatter
    plot of transaction amount versus commission fee per cluster.

    Parameters:
    -----------
    df_with_clusters : pandas.DataFrame
        DataFrame returned by identify_commission_rate_groups
    figsize : tuple
        Figure size (width, height)
    transaction_col : str
        Column name for transaction amounts; pass the same name that was
        given to identify_commission_rate_groups
    commission_col : str
        Column name for commission fees; pass the same name that was
        given to identify_commission_rate_groups
    """
    _fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

    # One pass per cluster feeds both axes, so each cluster is added to the
    # two plots in the same order.
    for cluster_id in sorted(df_with_clusters['cluster_id'].unique()):
        cluster_data = df_with_clusters[df_with_clusters['cluster_id'] == cluster_id]

        if cluster_id == -1:
            hist_label = 'Outliers'
            scatter_label = 'Outliers'
            hist_alpha, scatter_alpha = 0.5, 0.5
            marker = 'x'
        else:
            hist_label = f'Group {cluster_id} ({cluster_data["cluster_commission_rate"].iloc[0]*100:.2f}%)'
            scatter_label = f'Group {cluster_id}'
            hist_alpha, scatter_alpha = 0.7, 0.6
            marker = 'o'

        # Plot 1: histogram of commission rates (as percentages)
        ax1.hist(cluster_data['commission_rate'] * 100,
                 bins=50, alpha=hist_alpha, label=hist_label)

        # Plot 2: transaction amount vs commission fee
        ax2.scatter(cluster_data[transaction_col],
                    cluster_data[commission_col],
                    alpha=scatter_alpha, label=scatter_label, marker=marker, s=20)

    ax1.set_xlabel('Commission Rate (%)')
    ax1.set_ylabel('Frequency')
    ax1.set_title('Distribution of Commission Rates by Group')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    ax2.set_xlabel('Transaction Amount')
    ax2.set_ylabel('Commission Fee')
    ax2.set_title('Transaction Amount vs Commission Fee')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Example usage: build a synthetic data set of 1000 transactions made of
# three noisy commission-rate groups plus a set of random-rate outliers.

# Fixed seed so that the generated sample is reproducible
np.random.seed(42)

n_transactions = 1000


def _noisy_group(rate, size):
    """Return `size` transactions charged at `rate` plus Gaussian fee noise."""
    frame = pd.DataFrame({
        'transaction_amount': np.random.uniform(5, 1000, size),
    })
    fee_noise = np.random.normal(0, 0.5, size)
    frame['commission_fee'] = frame['transaction_amount'] * rate + fee_noise
    return frame


# Group 1: 3.5% commission (350 transactions)
group1 = _noisy_group(0.035, 350)

# Group 2: 3.8% commission (300 transactions)
group2 = _noisy_group(0.038, 300)

# Group 3: 4.1% commission (250 transactions)
group3 = _noisy_group(0.041, 250)

# Outliers: 100 transactions, each with its own rate drawn from 1% to 6%
outliers = pd.DataFrame({
    'transaction_amount': np.random.uniform(5, 1000, 100),
})
random_rates = np.random.uniform(0.01, 0.06, 100)
outliers['commission_fee'] = outliers['transaction_amount'] * random_rates

# Stack all parts, then shuffle the rows and renumber the index
df = (
    pd.concat([group1, group2, group3, outliers], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

print("Original data shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

In [None]:
# Cluster the sample transactions by commission rate
# (rates at least 0.1% apart, clusters of at least 10 transactions)
df_clustered = identify_commission_rate_groups(df, min_rate_diff=0.001, min_cluster_size=10)

# Print the per-group summary table between two rules
summary = summarize_commission_groups(df_clustered)
rule = "="*70
print("\n" + rule)
print("COMMISSION RATE GROUPS SUMMARY")
print(rule)
print(summary.to_string(index=False))

# Rows labelled -1 are the ones DBSCAN left unassigned
is_outlier = df_clustered['cluster_id'] == -1
n_outliers = is_outlier.sum()
print(f"\nTotal outliers detected: {n_outliers} ({n_outliers/len(df)*100:.1f}%)")

# Plot the histogram and scatter views of the groups
visualize_commission_groups(df_clustered)

# Print the first few outliers, if there are any
if n_outliers > 0:
    print("\nExample outliers:")
    outlier_examples = df_clustered[is_outlier].head()
    print(outlier_examples[['transaction_amount', 'commission_fee', 'commission_rate']])