In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact
from sklearn.impute import KNNImputer
import tensorflow as tf
from scipy import stats
import math
from math import sqrt, pi
import matplotlib.cm as cm
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN


In [None]:
# Location of the raw transactions CSV.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where this exact file exists.
DATA_PATH = r'C:\Users\engmo\OneDrive\Desktop\python_trials\bank_transactions_data_2.csv'
df = pd.read_csv(DATA_PATH)

# Getting to know the data

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Preview the last rows of the raw frame.
df.tail()

In [None]:
# Summarise the columns of the raw frame (dtypes and non-null counts).
df.info()

In [None]:
# Quick look at the categorical columns: list the distinct values of the
# low-cardinality ones and only count the distinct values of the ID columns.
channel_values = df["Channel"].unique()
transaction_type_values = df["TransactionType"].unique()
# dropna=False keeps a missing value counted as one distinct value, exactly
# like len(Series.unique()) does.
account_id_count = df["AccountID"].nunique(dropna=False)
merchant_id_count = df["MerchantID"].nunique(dropna=False)

print(f'Channel unique values: {channel_values}')
print(f'Account ID unique values: {account_id_count}')
print(f'Transaction Type values: {transaction_type_values}')
print(f'Merchant Id Unique values: {merchant_id_count}')

# Feature Engineering

In [None]:
# Parse both timestamp columns so they can be subtracted from each other.
for date_col in ("TransactionDate", "PreviousTransactionDate"):
    df[date_col] = pd.to_datetime(df[date_col])

# Whole-day gap between the two timestamps, stored as float64.
# NOTE(review): computed as previous minus current, so the value is negative
# whenever the previous transaction is the earlier one - confirm the intended sign.
transaction_gap = df["PreviousTransactionDate"] - df["TransactionDate"]
df["DaysBetweenTransaction"] = transaction_gap.dt.days.astype("float64")

In [None]:
# Categorical inputs that are one-hot encoded for the model.
categorical_cols = ["TransactionType", "Channel", "CustomerOccupation"]
# Every float64 / int64 column of df is carried over as-is.
other_cols = df.select_dtypes(include=["float64", "int64"]).columns.tolist()

cols = categorical_cols + other_cols

# x is the working frame; l is a second, independent copy of the same
# one-hot encoded data kept alongside it.
x = pd.get_dummies(df[cols], columns=categorical_cols)
l = x.copy()

In [None]:
# Transaction amount as a fraction of the account balance, added to both
# frames (l is the frame meant to be used for comparison later).
for frame in (x, l):
    frame['percentage of transaction from account'] = frame['TransactionAmount'] / frame['AccountBalance']

In [None]:
# Preview the comparison frame after encoding and the added ratio column.
l.head()

In [None]:
# Preview the working frame after encoding and the added ratio column.
x.head()

In [None]:
# Preview the last rows of the working frame.
x.tail()

# univariate Analysis

In [None]:
# One box plot per numeric column of x, each on its own figure, to spot outliers.
numeric_columns = x.select_dtypes(include="number").columns
for column_name in numeric_columns:
    fig, ax = plt.subplots()
    sns.boxplot(data=x, x=column_name, ax=ax)
    plt.show()

In [None]:
# One count histogram per numeric column of x, each drawn on its own wide figure.
for column_name in x.select_dtypes(include="number").columns:
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.histplot(x[column_name], stat="count", ax=ax)
    plt.show()

# Multivariate Analysis


In [None]:
# Pairwise correlations between all columns of x.
correlation_matrix = x.corr()

# The heatmap is drawn with square cells, so give it a square canvas whose
# side grows with the number of columns; a fixed 100x8 inch figure leaves the
# annotated cells too small to read inside a huge, mostly empty image.
heatmap_side = max(8, 0.6 * len(correlation_matrix.columns))
plt.figure(figsize=(heatmap_side, heatmap_side))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})
plt.title('Heat Map of Correlation Matrix')
plt.show()


In [None]:
# Disabled cell: the code below sits inside a string literal, so it is never
# executed. It refers to `y`, which no visible cell defines.
'''''
correlation_matrix = y.corr()

plt.figure(figsize=(100, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})
plt.title('Heat Map of Correlation Matrix')
plt.show()
'''''


## important notes
1. As age increases, account balance increases but transaction amount decreases.

2. Credit is used more online or in branch, while debit is used more at ATMs.

3. The number of login attempts increases slightly with transaction duration.

4. Although students have lower account balances, they spend larger amounts in transactions than other occupations.

5. Doctors have the highest account balances yet the lowest spending.

6. Except for the correlation between age and the transaction balance (0.32) and the correlations between channels and types of payment, only weak correlations are found among the other features,
so clustering is needed to find complex patterns based on related features and to try to detect anomalies per cluster as possible fraud.

# Dataset preparation

## Transforming the features chosen in the x data frame (not the one-hot encoded features) toward a normal distribution using SciPy to fix skewness, and checking with the log-likelihood and the Shapiro-Wilk test

In [None]:
# Disabled cell: everything below sits inside a string literal, so none of it
# is executed. It sketches a Box-Cox transform of selected columns of x,
# followed by histogram, Shapiro-Wilk and Q-Q checks of the result.
# NOTE(review): if re-enabled, the first loop drops rows with non-positive
# values from x only, so x and l would no longer hold the same rows.
'''''
def log_likelihood(data, lambda_value):
    n = len(data)
    mean = np.mean(data)
    variance = np.var(data)
    log_likelihood_value= -0.5 * (np.log(2 * np.pi * variance) + (1 / variance) * np.sum((data - mean) ** 2))*n
    return log_likelihood_value

features_to_transform=[
          'TransactionAmount',
          'DaysBetweenTransaction',
          'AccountBalance',
          'TransactionDuration',
          'percentage of transaction from account',
          'CustomerAge'
]
# Loop through each feature
for feature in features_to_transform:
    # Check for non-positive values
    if (x[feature] <= 0).any():
        # Option 1: Remove non-positive values
        x = x[x[feature] > 0]
    
    # Reset index after filtering
    x.reset_index(drop=True, inplace=True)

# Display the filtered DataFrame
print(x)



from scipy import stats

# Convert to NumPy array and apply Box-Cox transformation
for i in features_to_transform:
    original=x[i]
    transformed_values, lambda_value = stats.boxcox(x[i].to_numpy())

# Add transformed values back to the DataFrame
    x[i] = transformed_values #updated

    print(x[i])
    plt.figure(figsize=(15, 6))  # Optional: Set figure size for better visibility
    sns.histplot(x[i], stat="count")

    # Calculate log-likelihood for transformed data
    print(f'Optimal lambda for {i} data is: {lambda_value}')
    ll_value = log_likelihood(x[i].dropna().to_numpy(), lambda_value)
    print(f'Log-Likelihood of transformed {i} is: {ll_value}')
    plt.figure(figsize=(15, 6))

    # Original Data Histogram
    plt.subplot(1, 2, 1)
    sns.histplot(original, kde=True, stat="count")
    plt.title(f'Original  Data {i} Distribution')
    plt.xlabel(f'Values of {i}')
    plt.ylabel('Frequency')

    # Transformed Data Histogram
    plt.subplot(1, 2, 2)
    sns.histplot(x[i], kde=True, stat="count")  # Using the updated Diphtheria column
    plt.title(f'Transformed {i} Data Distribution (Box-Cox)')
    plt.xlabel(f'Transformed Values of {i}')
    plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()
    plt.tight_layout()
    plt.show()

    # Shapiro-Wilk test for transformed data
    shapiro_test = stats.shapiro(x[i].dropna())
    print(f'Shapiro-Wilk Test: Statistic={shapiro_test.statistic}, p-value={shapiro_test.pvalue}')

    # Q-Q plot for transformed data
    plt.figure(figsize=(10, 6))
    stats.probplot(x[i].dropna(), dist="norm", plot=plt)
    plt.title(f'Q-Q Plot of Transformed {i} Data')
    plt.show()

'''''

## Standardization

In [None]:


## Standardizing the data in both data frames (x and l)


def _zscore(frame):
    """Return a float64 copy of `frame` with every column z-scored.

    Each column has its mean subtracted and is divided by its sample standard
    deviation (pandas default, ddof=1). Missing values are ignored when the
    statistics are computed and stay missing in the result.

    A column with zero standard deviation is only centred (it becomes all
    zeros) instead of being divided by zero, which would turn it into NaN and
    make the clustering cells further down fail.
    """
    values = frame.astype("float64")  # one-hot columns may be bool
    center = values.mean()
    spread = values.std()
    spread = spread.mask(spread == 0, 1.0)
    return (values - center) / spread


# Column names shared by both frames.
features = x.columns.tolist()

x = _zscore(x)
l = _zscore(l)

# Show the standardized values of every column for both frames.
for i in features:
    print(f" standatized  Transformed values for {i}:\n{x[i]}")
    print(f"standaruzed original values for {i}:\n{l[i]}")


# Clustering the data into groups based on specific features to try to find complex patterns and remove the farthest points in each cluster
The features to cluster on are:

**transaction type**

**percentage of transaction from account total**

**transaction channel**

**TransactionAmount**

**DaysBetweenTransaction**

**AccountBalance**

**TransactionDuration**

**CustomerAge**

**LoginAttempts**

**Customer_occupation**

In [None]:
# Preview the working frame as it is passed to the clustering cells below.
x.head()

## K-means clustering

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm


# Candidate numbers of clusters, shared by the elbow and silhouette analyses.
range_n_clusters = range(2, 11)
inertia = []
kmeans_labels = {}  # n_clusters -> labels, so every k is fitted only once

# Fit K-means once per k: keep the inertia for the elbow plot and the labels
# for the silhouette plots below (random_state is fixed, so refitting would
# give the same labels anyway).
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, init="k-means++", n_init='auto', random_state=1)
    kmeans_labels[n_clusters] = clusterer.fit_predict(x)
    inertia.append(clusterer.inertia_)

# Plotting the Elbow Method
plt.figure(figsize=(8, 6))
plt.plot(range_n_clusters, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.xticks(range_n_clusters)
plt.grid()
plt.show()

# Silhouette analysis: one plot per candidate k.
for n_clusters in range_n_clusters:
    cluster_labels = kmeans_labels[n_clusters]

    fig, ax1 = plt.subplots(1, 1, figsize=(8, 6))
    ax1.set_xlim([-0.1, 1])
    # Leave a 10-row gap between the silhouette bands of consecutive clusters.
    ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])

    silhouette_avg = silhouette_score(x, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    sample_silhouette_values = silhouette_samples(x, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Cluster number in the middle of its band; x=-0.05 lies inside the
        # x-limits set above (x=-1 would put the label outside the axes).
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title(f"The silhouette plot for n_clusters = {n_clusters}.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line marking the average silhouette score over all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.show()


## Hierarchical clustering

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm
from sklearn.metrics import pairwise_distances


# Candidate numbers of clusters (2 through 10), the same range as the K-means cell.
n_clusters_range = range(2, 11)

# Plain float matrix of the features, used for the centroid computations.
feature_matrix = np.asarray(x, dtype=float)

# Within-cluster sum of squares for every candidate k.
inertia = []
agglomerative_labels_by_k = {}  # n_clusters -> labels, so every k is fitted only once

for n_clusters in n_clusters_range:
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
    agglomerative_labels = agglomerative.fit_predict(x)
    agglomerative_labels_by_k[n_clusters] = agglomerative_labels

    # Inertia computed by hand as the sum of squared distances of every point
    # to the centroid of its cluster - the quantity the elbow plot's axis
    # label names and the one K-means reports, so both elbow plots are
    # comparable. (Summing raw pairwise distances instead is neither squared
    # nor centroid-based and needs an n-by-n matrix per cluster.)
    inertia_value = 0.0
    for i in range(n_clusters):
        cluster_points = feature_matrix[agglomerative_labels == i]
        if len(cluster_points) > 0:
            centroid = cluster_points.mean(axis=0)
            inertia_value += np.sum((cluster_points - centroid) ** 2)

    inertia.append(inertia_value)

# Plotting the Elbow Method
plt.figure(figsize=(8, 6))
plt.plot(n_clusters_range, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.xticks(n_clusters_range)
plt.grid()
plt.show()

# Silhouette analysis: one plot per candidate k.
for n_clusters in n_clusters_range:
    agglomerative_labels = agglomerative_labels_by_k[n_clusters]
    found_clusters = len(set(agglomerative_labels))

    # The silhouette score needs at least two clusters.
    if found_clusters > 1:
        silhouette_avg = silhouette_score(x, agglomerative_labels)
        sample_silhouette_values = silhouette_samples(x, agglomerative_labels)

        print(f"For n_clusters = {n_clusters}, The average silhouette_score is : {silhouette_avg}")

        fig, ax1 = plt.subplots(1, 1, figsize=(8, 6))
        # Same axis limits as the K-means silhouette plots, so the two sets
        # of figures can be compared side by side.
        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])

        y_lower = 10
        for i in range(found_clusters):
            # Sorted silhouette values of the samples assigned to cluster i.
            ith_cluster_silhouette_values = sample_silhouette_values[agglomerative_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / found_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Cluster number in the middle of its band.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title(f"The silhouette plot for n_clusters = {n_clusters}.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # Vertical line marking the average silhouette score over all samples.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        plt.show()
    else:
        print(f"For n_clusters = {n_clusters}, there are not enough clusters to compute silhouette score.")


## Conclusion: from both clustering techniques (hierarchical and k-means), and by checking both the elbow method and the silhouette score, a good number of clusters was determined to be 6: in the silhouette analysis no big differences occur beyond 6 clusters, and by the elbow method 6 clusters already provide a good inertia

   Although more clusters give a better inertia and a better silhouette score, this can also mean overfitting to the data.

## Fraud detection by considering the farthest points in each cluster as anomalies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA # will be used just to be able to visualize 


# Number of clusters used for the final hierarchical clustering.
n_clusters = 6

# Perform hierarchical clustering
agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
final_labels = agglomerative.fit_predict(x)

# Calculate silhouette score (needs at least two clusters)
if len(set(final_labels)) > 1:
    silhouette_avg = silhouette_score(x, final_labels)
    print(f"The average silhouette_score for {n_clusters} clusters is : {silhouette_avg}")
else:
    print("Not enough clusters to compute silhouette score.")

# Copy of the features together with the cluster label of every row
clustered_data = x.copy()
clustered_data['Cluster'] = final_labels

# Plain float matrix of the features for the distance computations.
feature_matrix = np.asarray(x, dtype=float)

# Flag the farthest 5% of points of each cluster as potential frauds.
fraud_indices = []  # row positions (into x) of the flagged points
fraud_counts = np.zeros(n_clusters, dtype=int)  # flagged points per cluster

for i in range(n_clusters):
    # Row positions of the points that belong to the current cluster
    member_positions = np.flatnonzero(final_labels == i)
    cluster_points = feature_matrix[member_positions]

    # Euclidean distance of every member to the cluster centroid
    centroid = cluster_points.mean(axis=0)
    distances = np.linalg.norm(cluster_points - centroid, axis=1)

    # Positions from the 95% mark onwards in the sorted distances are the
    # farthest 5% of the cluster
    threshold_index = int(len(distances) * 0.95)
    farthest_points_indices = np.argsort(distances)[threshold_index:]

    # Map positions inside the cluster back to row positions in x
    current_fraud_indices = member_positions[farthest_points_indices]
    fraud_indices.extend(current_fraud_indices)

    fraud_counts[i] = len(current_fraud_indices)

# Integer array so it can be used directly to index rows
fraud_indices = np.array(fraud_indices, dtype=np.intp)

# 2-D PCA projection, used only for visualisation. The seed keeps the plot
# reproducible when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=2, random_state=1)
x_pca = pca.fit_transform(x)

# Visualization of each cluster with potential frauds
plt.figure(figsize=(10, 8))

# Plot all points with their clusters
for i in range(n_clusters):
    cluster_points = x_pca[final_labels == i]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {i}', alpha=0.5)

# Plot potential frauds on top
fraud_points = x_pca[fraud_indices]
plt.scatter(fraud_points[:, 0], fraud_points[:, 1], color='red', marker='x', s=100, label='Potential Fraud')

plt.title('Clusters with Potential Frauds (PCA Reduced)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.grid()
plt.show()

# Print the row positions of the potential frauds and the per-cluster counts
print("\nIndices of potential frauds:", fraud_indices)
print("Number of frauds in each cluster:", fraud_counts)
