# Exploratory Data Analysis (EDA)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the 2023 training split; the trailing expression displays its row count.
df = pd.read_parquet("../data/train_2023.parquet")
len(df)

In [None]:
# Print the number of missing values in each column.
print(df.isnull().sum())

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Show the column labels.
df.columns

In [None]:
# Print a concise summary of the frame (per-column dtype and non-null count).
df.info()

In [None]:
# Descriptive statistics; by default pandas summarises the numeric columns.
df.describe()

In [None]:
# Count the rows flagged as churn due to fraud. The boolean mask is summed
# directly instead of materialising a filtered copy of the whole frame just to
# take its length; `.eq(True)` keeps the semantics of the former `== True`.
int(df['churn_due_to_fraud'].eq(True).sum())

In [None]:
# Bar chart with one bar per distinct 'churn_due_to_fraud' value.
plt.figure(figsize=(6, 4))
ax = sns.countplot(data=df, x='churn_due_to_fraud')

# Write each bar's count just above it, centred horizontally on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, bar_height + 0.5, int(bar_height), ha='center')

plt.title('Churn Due to Fraud Distribution')
plt.show()

In [10]:
# Pairwise correlations between the float64/int64 columns of df
# (columns of any other dtype are left out by the dtype filter).
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = df[numeric_cols].corr()

In [None]:
# Correlate the float64/int64 columns with each other, then draw the result
# as an annotated heatmap (two decimals per cell).
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

print("\nCorrelation Matrix:")
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Tally fraud-related churn across the yearly training files (2008-2023).
years = range(2008, 2024)
file_path_pattern = "../data/train_{year}.parquet"

total_churn_due_to_fraud = 0
total_records = 0

for year in years:
    file_path = file_path_pattern.format(year=year)
    try:
        # Only the target column is needed for the tally, so the other
        # columns are not loaded. A dedicated name is used so the
        # notebook-level `df` is not overwritten on every iteration.
        year_df = pd.read_parquet(file_path, columns=['churn_due_to_fraud'])
        year_total_records = len(year_df)
        churn_count = year_df['churn_due_to_fraud'].sum()
    except Exception as e:
        # Best effort: a missing or unreadable year is reported and skipped.
        print(f"Error processing file for year {year}: {e}")
        continue

    # Both totals are updated together, only once the year was fully read,
    # so a failure can never leave them out of step with each other.
    total_records += year_total_records
    total_churn_due_to_fraud += churn_count
    print(f"Year {year}: {churn_count} records (Total records: {year_total_records})")

print(f"Total records across all datasets: {total_records}")
print(f"Total records where 'churn_due_to_fraud' is True: {total_churn_due_to_fraud}")

# Feature Engineering

In [14]:
# Work on a copy of df without the identifier-like columns, and parse both
# date columns into datetimes.
df_features = df.drop(columns=['Id', 'name', 'address'])
for date_col in ('date_of_birth', 'date'):
    df_features[date_col] = pd.to_datetime(df_features[date_col])

### Feature 1: Age

In [None]:
# Approximate age in whole years at the record date: days elapsed since the
# date of birth, floor-divided by 365 (leap days are not accounted for).
df_features['age'] = (df_features['date'] - df_features['date_of_birth']).dt.days // 365

### Calculate transaction frequencies (how many transactions were made)

In [None]:
# Per channel, add the incoming and outgoing columns into one combined column.
for combined, incoming, outgoing in (
    ('atm_transfer_frequency', 'atm_transfer_in', 'atm_transfer_out'),
    ('bank_transfer_frequency', 'bank_transfer_in', 'bank_transfer_out'),
    ('crypto_transfer_frequency', 'crypto_in', 'crypto_out'),
):
    df_features[combined] = df_features[incoming] + df_features[outgoing]

### Calculate ratio of incoming to outgoing transfers

In [None]:
# Per channel, divide incoming by (outgoing + 1); the +1 keeps the
# denominator non-zero when the outgoing value is 0.
for ratio_name, incoming, outgoing in (
    ('atm_in_out_ratio', 'atm_transfer_in', 'atm_transfer_out'),
    ('bank_in_out_ratio', 'bank_transfer_in', 'bank_transfer_out'),
    ('crypto_in_out_ratio', 'crypto_in', 'crypto_out'),
):
    df_features[ratio_name] = df_features[incoming] / (df_features[outgoing] + 1)

### Full preprocessing pipeline: date, touchpoint, CSAT, and transaction features, plus one-hot encoding of the categorical columns (country, job, from_competitor)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Read the 2023 training split and the test split.
train_data = pd.read_parquet("../data/train_2023.parquet")
test_data = pd.read_parquet('../data/test.parquet')

# Report how many values are missing in each training column.
print(train_data.isnull().sum())


def _count_touchpoints(raw):
    """Return the number of comma-separated entries in a string, else 0."""
    if not isinstance(raw, str):
        return 0
    return len(raw.split(','))


# Apply the same date-derived and touchpoint features to both splits.
for frame in (train_data, test_data):
    # Parse the date columns so the datetime accessors below are available.
    frame['date'] = pd.to_datetime(frame['date'])
    frame['date_of_birth'] = pd.to_datetime(frame['date_of_birth'])

    # Approximate age in whole years (elapsed days floor-divided by 365).
    frame['age'] = (frame['date'] - frame['date_of_birth']).dt.days // 365

    # Calendar components of the record date.
    record_date = frame['date'].dt
    frame['day_of_week'] = record_date.dayofweek
    frame['month'] = record_date.month
    frame['year'] = record_date.year

    # Number of comma-separated touchpoints on the row (0 for non-strings).
    frame['touchpoints_count'] = frame['touchpoints'].apply(_count_touchpoints)

# Calculate the average CSAT score (assuming CSAT scores are stored as a dictionary-like string)
# We need to convert the csat_scores column from string to actual dictionary type for calculation
import ast

def get_avg_csat_score(csat_str):
    """Return the mean of the values in a CSAT-score mapping, or NaN.

    Args:
        csat_str: Either a string holding a Python dict literal (parsed
            safely with ``ast.literal_eval``) or an already-parsed dict.

    Returns:
        The mean of the dict's values, or ``np.nan`` when the input is
        empty, cannot be parsed, is not a dict, or holds values that
        cannot be averaged.
    """
    if isinstance(csat_str, str):
        try:
            csat_dict = ast.literal_eval(csat_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # These are the errors literal_eval raises for malformed input;
            # anything else (e.g. KeyboardInterrupt) is left to propagate.
            return np.nan
    else:
        # Already-parsed values are used as they are.
        csat_dict = csat_str

    if not isinstance(csat_dict, dict) or not csat_dict:
        return np.nan

    try:
        return np.mean(list(csat_dict.values()))
    except (TypeError, ValueError):
        # Values that cannot be averaged (strings, None, ragged sequences).
        return np.nan

# Average CSAT score per row for both splits; get_avg_csat_score yields NaN
# for values it cannot turn into a mean.
train_data['avg_csat_score'] = train_data['csat_scores'].apply(get_avg_csat_score)
test_data['avg_csat_score'] = test_data['csat_scores'].apply(get_avg_csat_score)

# Transaction features: create summary statistics for transactions
def create_transaction_features(df):
    """Add combined incoming-plus-outgoing transaction columns to ``df``.

    The frame is modified in place and also returned, so the call can be
    used either for its side effect or in an assignment.

    Args:
        df: DataFrame holding the incoming/outgoing columns listed below.

    Returns:
        The same DataFrame object, with the five combined columns added.
    """
    # New column -> the (incoming, outgoing) pair that is summed into it.
    combined_columns = {
        'atm_transfer_in_total': ('atm_transfer_in', 'atm_transfer_out'),
        'bank_transfer_total': ('bank_transfer_in', 'bank_transfer_out'),
        'crypto_transfer_total': ('crypto_in', 'crypto_out'),
        'bank_transfer_volume': ('bank_transfer_in_volume', 'bank_transfer_out_volume'),
        'crypto_transfer_volume': ('crypto_in_volume', 'crypto_out_volume'),
    }
    for combined, (incoming, outgoing) in combined_columns.items():
        df[combined] = df[incoming] + df[outgoing]
    return df

train_data = create_transaction_features(train_data)
test_data = create_transaction_features(test_data)


def _rolling_mean_per_customer(frame, column, window=7):
    """Return the rolling mean of ``column`` within each customer's history.

    Rows are put in ``date`` order and grouped by ``customer_id`` before the
    window is applied, so a window never spans two customers and always
    follows chronological order. The first ``window - 1`` rows of each
    customer are NaN.

    The result keeps the frame's index labels, so assigning it back as a
    column aligns every value with its own row.
    NOTE(review): this alignment assumes the frame's index is unique (true
    for a default RangeIndex) - confirm for these files.
    """
    ordered = frame.sort_values('date')
    return ordered.groupby('customer_id')[column].transform(
        lambda values: values.rolling(window).mean()
    )


# Rolling averages over each customer's last 7 records for transaction volume.
# A plain `.rolling(7)` over the raw row order would average across unrelated
# customers and out-of-order dates.
train_data['rolling_7_days_bank_transfer_volume'] = _rolling_mean_per_customer(train_data, 'bank_transfer_volume')
test_data['rolling_7_days_bank_transfer_volume'] = _rolling_mean_per_customer(test_data, 'bank_transfer_volume')

train_data['rolling_7_days_crypto_transfer_volume'] = _rolling_mean_per_customer(train_data, 'crypto_transfer_volume')
test_data['rolling_7_days_crypto_transfer_volume'] = _rolling_mean_per_customer(test_data, 'crypto_transfer_volume')

# Feature encoding for categorical variables (one-hot encoding).
# Both splits are encoded against the categories seen in the training data.
# Encoding each split independently would give train and test different dummy
# columns whenever their category sets differ, and `drop_first` could drop a
# different baseline level in each split.
categorical_cols = ['country', 'job', 'from_competitor']
for col in categorical_cols:
    # Same category inference get_dummies applies to a plain column, so the
    # training columns come out exactly as before.
    train_categories = pd.Categorical(train_data[col]).categories
    train_data[col] = pd.Categorical(train_data[col], categories=train_categories)
    # Test values unseen in training become missing, i.e. all-zero dummies.
    test_data[col] = pd.Categorical(test_data[col], categories=train_categories)

train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

# Drop identifier, free-text and raw columns that the engineered features
# above have replaced.
drop_cols = ['Id', 'name', 'address', 'date_of_birth', 'date', 'csat_scores', 'customer_id']
train_data = train_data.drop(columns=drop_cols)
test_data = test_data.drop(columns=drop_cols)

In [None]:
# Print every remaining training column next to its positional index.
for position, column_name in enumerate(train_data.columns):
    print(f"{position} {column_name}")

In [None]:
# Display the fully engineered training frame.
train_data