In [112]:
import os
os.environ["JAVA_HOME"] = "/usr"

In [113]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.sql("Select 'spark' as hello ")

df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [114]:
df = spark.read.csv("../data/raw/Base.csv", header=True, inferSchema=True)
df.show()

+----------+------------------+---------------------+-------------------------+----------------------------+------------+------------------+----------------------+------------+------------+------------------+------------------+------------------+--------------------+--------------------------------+-----------------+-----------------+-------------+--------------+----------------+------------------+-----------------+---------------+---------------------+---------------+--------+-------------------------+---------+------------------+-------------------------+------------------+-----+
|fraud_bool|            income|name_email_similarity|prev_address_months_count|current_address_months_count|customer_age|days_since_request|intended_balcon_amount|payment_type|zip_count_4w|       velocity_6h|      velocity_24h|       velocity_4w|bank_branch_count_8w|date_of_birth_distinct_emails_4w|employment_status|credit_risk_score|email_is_free|housing_status|phone_home_valid|phone_mobile_valid|bank_month

                                                                                

### Normality tests on original DF

We will be testing the following columns:

- income
- name_email_similarity
- prev_address_months_count
- current_address_months_count
- customer_age
- days_since_request
- intended_balcon_amount
- zip_count_4w
- velocity_6h
- velocity_24h
- velocity_4w
- bank_branch_count_8w
- date_of_birth_distinct_emails_4w
- credit_risk_score
- bank_months_count
- proposed_credit_limit
- session_length_in_minutes
- device_distinct_emails
- month

The only numeric column that will be ignored is "device_fraud_count" because all values are the same.

In [115]:
sampled_df = df.sample(0.1, seed=42)
pdf = sampled_df.toPandas()
pdf

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.8,0.723322,23,9,30,0.038436,-1.764405,AB,859,...,0,1000.0,0,INTERNET,1.260981,windows,1,1,0,0
1,0,0.6,0.772867,96,124,40,0.015487,-0.959755,AB,4033,...,0,1500.0,0,INTERNET,4.607511,other,1,1,0,0
2,1,0.9,0.570800,-1,157,60,0.005908,35.337538,AA,4079,...,0,1500.0,0,INTERNET,4.036394,windows,1,1,0,0
3,0,0.8,0.842606,152,5,30,0.007110,-1.130478,AB,2395,...,1,1500.0,0,INTERNET,3.984538,other,1,1,0,0
4,0,0.8,0.858070,-1,243,50,0.024083,-1.081313,AB,1116,...,0,200.0,0,INTERNET,7.071571,windows,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99982,0,0.1,0.101774,-1,151,20,0.034714,-0.372048,AC,2162,...,0,200.0,0,INTERNET,2.911697,linux,1,1,0,7
99983,0,0.7,0.580939,-1,32,40,0.027216,23.489870,AA,850,...,1,200.0,0,INTERNET,4.880790,other,0,1,0,7
99984,0,0.9,0.737479,-1,58,40,0.001053,-1.270928,AB,2131,...,0,200.0,0,INTERNET,5.500124,other,1,1,0,7
99985,0,0.1,0.267651,53,12,40,0.024352,-1.281570,AC,756,...,0,200.0,0,INTERNET,36.401462,linux,1,1,0,7


In [116]:
import pandas as pd
from scipy.stats import shapiro

# List of columns to test for normality
columns = [
    'income',
    'name_email_similarity',
    'prev_address_months_count',
    'current_address_months_count',
    'customer_age',
    'days_since_request',
    'intended_balcon_amount',
    'zip_count_4w',
    'velocity_6h',
    'velocity_24h',
    'velocity_4w',
    'bank_branch_count_8w',
    'date_of_birth_distinct_emails_4w',
    'credit_risk_score',
    'bank_months_count',
    'proposed_credit_limit',
    'session_length_in_minutes',
    'month'
]

# Initialize lists to store results
results = []

# Perform Shapiro-Wilk test for each column
for col in columns:
    stat, p_val = shapiro(pdf[col])
    normality = 'Normal' if p_val >= 0.05 else 'Not Normal'
    results.append({
        'Column': col,
        'Statistic': stat,
        'p-value': p_val,
        'Normality': normality
    })

# Create DataFrame from results
results_df = pd.DataFrame(results)

# Display the DataFrame
results_df

  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,Column,Statistic,p-value,Normality
0,income,0.873639,8.415578e-119,Not Normal
1,name_email_similarity,0.946103,5.4319050000000004e-96,Not Normal
2,prev_address_months_count,0.458727,1.850312e-163,Not Normal
3,current_address_months_count,0.83717,4.570386e-126,Not Normal
4,customer_age,0.918719,1.110357e-106,Not Normal
5,days_since_request,0.183924,2.286619e-177,Not Normal
6,intended_balcon_amount,0.561585,1.525452e-156,Not Normal
7,zip_count_4w,0.871719,3.156347e-119,Not Normal
8,velocity_6h,0.973516,7.36169e-79,Not Normal
9,velocity_24h,0.985806,3.087297e-65,Not Normal


Based on the statistical test results, no column is normally distributed, as we saw on the EDA.

### Numerical columns on the original DF vs Fraudulent Accounts

We will test the following numerical columns:
- income
- name_email_similarity
- prev_address_months_count
- current_address_months_count
- customer_age
- days_since_request
- intended_balcon_amount
- zip_count_4w
- velocity_6h
- velocity_24h
- velocity_4w
- bank_branch_count_8w
- date_of_birth_distinct_emails_4w
- credit_risk_score
- bank_months_count
- proposed_credit_limit
- session_length_in_minutes
- device_distinct_emails
- month

In [117]:
from scipy.stats import mannwhitneyu

mw_results = []

# Perform Mann-Whitney U test for each column against fraud_bool
for col in columns:
    try:
        group0 = pdf[pdf['fraud_bool'] == 0][col]
        group1 = pdf[pdf['fraud_bool'] == 1][col]
        
        stat, p_val = mannwhitneyu(group0, group1, alternative='two-sided')
        
        significance = 'Significant' if p_val < 0.05 else 'Not significant'
        
        mw_results.append({
            'Column': col,
            'Statistic': stat,
            'p-value': p_val,
            'Significance': significance
        })
    except Exception as e:
        print(f"Error processing {col}: {str(e)}")
        mw_results.append({
            'Column': col,
            'Statistic': None,
            'p-value': None,
            'Significance': f"Error: {str(e)}"
        })

mw_results_df = pd.DataFrame(mw_results)

mw_results_df

Unnamed: 0,Column,Statistic,p-value,Significance
0,income,38822015.5,2.899121e-72,Significant
1,name_email_similarity,68177938.0,1.976419e-36,Significant
2,prev_address_months_count,67166738.5,1.80671e-47,Significant
3,current_address_months_count,41592560.0,2.6245209999999998e-50,Significant
4,customer_age,38140981.5,3.350707e-81,Significant
5,days_since_request,61688374.5,3.920352e-09,Significant
6,intended_balcon_amount,62216619.0,1.240164e-10,Significant
7,zip_count_4w,53403862.0,0.007183427,Significant
8,velocity_6h,61041411.0,1.81048e-07,Significant
9,velocity_24h,59069092.0,0.001492382,Significant


The following columns have a significant difference between the fraudulent and non fraudulent accounts:
- income
- name_email_similarity
- prev_address_months_count
- current_address_months_count
- customer_age
- velocity_6h
- velocity_24h
- velocity_4w
- bank_branch_count_8w
- date_of_birth_distinct_emails_4w
- credit_risk_score
- proposed_credit_limit
- device_distinct_emails_8w
- days_since_request
- intended_balcon_amount
- zip_count_4w
- bank_months_count
- month

While these don't have a significant difference:
- session_length_in_minutes


### Categorical Columns on the processed dataset

We will analyze the relationship between the following categorical columns:  
- payment_type  
- employment_status  
- housing_status  
- source  
- device_os  

The following columns are listed as "binary" and will be treated as categorical:  
- email_is_free  
- phone_home_valid  
- phone_mobile_valid  
- has_other_cards  
- foreign_request  
- keep_alive_session  
- fraud_bool

We will also consider the column device_distinct_emails_8w as categorical, since it makes more sense this way. It can be 0, 1 or 2.

In [118]:
df = spark.read.parquet("../data/processed/Base.parquet", header=True, inferSchema=True)
df.show()

+----------+------------------+---------------------+-------------------------+----------------------------+------------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------------------+--------------------+-------------+----------------+------------------+-----------------+---------------+---------------------+---------------+-------------------------+------------------+-------------------------+-----+----------------------------------+-------------------------------------+---------------------+---------------------------+-------------------------------+---------------------+--------------------+---------------------+--------------------+-----------------------------+-----------------------------------------+--------------------------+--------------------------+------------------------------+----------------------------------+--------------+---------------------+-------

In [119]:
sampled_df = df.sample(0.1, seed=42)
pdf = sampled_df.toPandas()
pdf

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,payment_type_original,payment_type,employment_status_original,employment_status,housing_status_original,housing_status,source_original,source,device_os_original,device_os
0,0,0.8,0.741409,0.000000,-0.072727,-0.5,1.288345,6.662037,0.372493,0.681771,...,AA,1.0,CA,0.0,BC,0.0,INTERNET,0.0,other,0.0
1,0,0.8,0.536021,0.000000,-0.118182,0.5,-0.762987,-0.005106,0.792741,-0.908613,...,AB,0.0,CF,2.0,BB,1.0,INTERNET,0.0,other,0.0
2,0,0.1,0.759314,0.000000,0.672727,1.0,-0.348000,2.634932,0.703916,0.544406,...,AA,1.0,CE,5.0,BB,1.0,INTERNET,0.0,windows,2.0
3,0,0.8,0.852761,0.000000,-0.236364,0.5,-0.039329,3.353227,0.587393,0.539598,...,AD,3.0,CA,0.0,BC,0.0,INTERNET,0.0,linux,1.0
4,0,0.8,0.880921,14.583333,-0.372727,-0.5,0.312774,8.526668,0.884432,0.728977,...,AA,1.0,CA,0.0,BC,0.0,INTERNET,0.0,windows,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100265,0,0.9,0.211946,0.000000,-0.163636,-0.5,0.531524,6.439899,0.592168,-0.418018,...,AA,1.0,CA,0.0,BC,0.0,INTERNET,0.0,linux,1.0
100266,0,0.9,0.230953,1.000000,-0.363636,0.5,-0.041625,8.997744,-0.911175,-0.517152,...,AA,1.0,CA,0.0,BC,0.0,INTERNET,0.0,linux,1.0
100267,0,0.9,0.208870,0.000000,-0.127273,-0.5,-0.428438,0.040850,-0.215855,-0.682321,...,AB,0.0,CA,0.0,BC,0.0,INTERNET,0.0,other,0.0
100268,0,0.8,0.473987,0.000000,1.363636,0.5,-0.012934,6.066591,0.380134,-0.005191,...,AA,1.0,CA,0.0,BC,0.0,INTERNET,0.0,other,0.0


In [120]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v(contingency_table: pd.DataFrame, correction=False):
    """
    Calculate Cramer's V statistic for categorical-categorical association.
    
    Parameters:
    -----------
    contingency_table : pd.DataFrame or np.ndarray
        Contingency table (cross-tabulation) of two categorical variables.
    correction : bool, optional
        Apply bias correction for small sample sizes (default: False).
    
    Returns:
    --------
    float
        Cramer's V statistic (0 to 1, where 0 = no association).
    """
    chi2, _, _, _ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()  # Total sample size
    k = min(contingency_table.shape) - 1  # min(rows-1, cols-1)
    
    if correction:
        # Bias correction (Bergsma, 2013)
        cramers_v = np.sqrt(max(0, chi2 / n - k) / k)
    else:
        cramers_v = np.sqrt(chi2 / (n * k))
    
    return cramers_v

columns = [
    'payment_type',
    'employment_status',
    'housing_status',
    'source',
    'device_os',
    'email_is_free',
    'phone_home_valid',
    'phone_mobile_valid',
    'has_other_cards',
    'foreign_request',
    'keep_alive_session',
    'device_distinct_emails_8w'
]

results = []

# Calculate results for each variable
for col in columns:
    contingency_table = pd.crosstab(pdf[col], pdf['fraud_bool'])
    chi2, p_val, dof, _ = chi2_contingency(contingency_table)
    v = cramers_v(contingency_table)
    
    results.append({
        'Column': col,
        'Significant': p_val < 0.05,
        'p_value': p_val,
        'Cramers_V': v,
        'DOF': dof
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Format the DataFrame for better display
results_df['Significance'] = results_df['Significant'].map({True: 'Significant', False: 'Not significant'})
results_df['p_value'] = results_df['p_value'].round(4)
results_df['Cramers_V'] = results_df['Cramers_V'].round(3)

# Select and order columns for final output
final_df = results_df[['Column', 'Significance', 'p_value', 'DOF', 'Cramers_V']]

final_df

Unnamed: 0,Column,Significance,p_value,DOF,Cramers_V
0,payment_type,Significant,0.0,4,0.039
1,employment_status,Significant,0.0,6,0.039
2,housing_status,Significant,0.0,6,0.116
3,source,Not significant,0.6909,1,0.001
4,device_os,Significant,0.0,2,0.077
5,email_is_free,Significant,0.0,1,0.026
6,phone_home_valid,Significant,0.0,1,0.032
7,phone_mobile_valid,Significant,0.0,1,0.014
8,has_other_cards,Significant,0.0,1,0.033
9,foreign_request,Significant,0.0,1,0.014


The source column is the only non-significant column.

The following columns have a significant relationship with a higher effect sizes than the other columns:
- **housing_status** (Cramer's V=0.123, dof=6) - Small to medium effect size  
- **device_os** (Cramer's V=0.086, dof=2) - Small effect size

for the Cramer's V interpretation we used the following table as reference:
| Degrees of freedom | Small | Medium | Large |
|--------------------|-------|--------|-------|
| 1                  | 0.10  | 0.30   | 0.50  |
| 2                  | 0.07  | 0.21   | 0.35  |
| 3                  | 0.06  | 0.17   | 0.29  |
| 4                  | 0.05  | 0.15   | 0.25  |
| 5                  | 0.04  | 0.13   | 0.22  |

### Numeric values on the processed dataset

In [121]:
from scipy.stats import mannwhitneyu

mw_results = []

columns = [
    'income',
    'name_email_similarity',
    'prev_address_months_count',
    'current_address_months_count',
    'customer_age',
    'days_since_request',
    'intended_balcon_amount',
    'zip_count_4w',
    'velocity_6h',
    'velocity_24h',
    'velocity_4w',
    'bank_branch_count_8w',
    'date_of_birth_distinct_emails_4w',
    'credit_risk_score',
    'bank_months_count',
    'proposed_credit_limit',
    'session_length_in_minutes',
    'month'
]

# Perform Mann-Whitney U test for each column against fraud_bool
for col in columns:
    try:
        group0 = pdf[pdf['fraud_bool'] == 0][col]
        group1 = pdf[pdf['fraud_bool'] == 1][col]
        
        stat, p_val = mannwhitneyu(group0, group1, alternative='two-sided')
        significance = 'Significant' if p_val < 0.05 else 'Not significant'
        
        mw_results.append({
            'Column': col,
            'Statistic': stat,
            'p-value': p_val,
            'Significance': significance
        })
    except Exception as e:
        print(f"Error processing {col}: {str(e)}")
        mw_results.append({
            'Column': col,
            'Statistic': None,
            'p-value': None,
            'Significance': f"Error: {str(e)}"
        })

mw_results_df = pd.DataFrame(mw_results)

mw_results_df

Unnamed: 0,Column,Statistic,p-value,Significance
0,income,38456206.0,2.2834599999999997e-57,Significant
1,name_email_similarity,63221713.0,1.701743e-25,Significant
2,prev_address_months_count,64843388.0,5.566173e-52,Significant
3,current_address_months_count,38932957.5,9.770237e-53,Significant
4,customer_age,37522401.0,3.357602e-67,Significant
5,days_since_request,57618702.0,6.706934e-06,Significant
6,intended_balcon_amount,58332544.5,1.451015e-07,Significant
7,zip_count_4w,52596955.5,0.4152251,Not significant
8,velocity_6h,57853059.5,2.024695e-06,Significant
9,velocity_24h,56796880.0,0.0002806261,Significant


The results obtained on the Mann-Whitney applied to the raw DF were almost completely reproduced in the test applied to the processed DF. The only not significant column is the session length in minutes.

### Conclusion

The only non-significant columns are the **source** and **session_length_in_minutes**. This is expected, as the dataset is meant to be used to assess biases, and therefore some groups have higher rates of fraud account creation requests than others.