# Exploratory Data Analysis (EDA)

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
import pandas as pd

# Collect one DataFrame per yearly training file (2008 through 2023)
dataframes = []

for year in range(2008, 2024):
    file_path = f"../data/train_{year}.parquet"
    try:
        df = pd.read_parquet(file_path)
    except Exception as e:
        # An unreadable or missing year is reported and skipped, not fatal
        print(f"Failed to read file {file_path}: {e}")
    else:
        dataframes.append(df)

# Stack the yearly frames into one frame with a fresh 0..n-1 index.
# combined_df is only defined when at least one file was read.
if not dataframes:
    print("No files were successfully read.")
else:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"Total rows in the combined DataFrame: {len(combined_df)}")

In [None]:
# Number of distinct customer_id values in the combined frame
combined_df['customer_id'].nunique()

In [None]:
# List the column names available in the combined frame
combined_df.columns

In [None]:
# Label every row with a per-customer 'churn' flag.
# NOTE(review): assumes combined_df['date'] is a datetime column; the
# comparison against a Timestamp below would fail on plain strings — confirm.

# Step 1: Most recent date anywhere in the dataset (the reference point)
max_date = combined_df['date'].max()

# Step 2: Latest event date for each customer
latest_event = combined_df.groupby('customer_id')['date'].max().reset_index()

# Step 3: A customer is considered churned when their last event is more
# than 12 months older than max_date
latest_event['churn'] = latest_event['date'] < (max_date - pd.DateOffset(months=12))

# Step 4: Merge the flag back onto every row.
# Any 'churn' column left over from a previous run of this cell is dropped
# first; otherwise the merge would produce 'churn_x'/'churn_y' and later
# lookups of combined_df['churn'] would raise KeyError.
combined_df = pd.merge(
    combined_df.drop(columns='churn', errors='ignore'),
    latest_event[['customer_id', 'churn']],
    on='customer_id',
    how='left',
)

# Preview the frame with the new 'churn' column
combined_df.head()

In [None]:
# Distinct customers on each side of the churn flag
churned_rows = combined_df['churn'] == True
active_rows = combined_df['churn'] == False

churn_true_count = combined_df.loc[churned_rows, 'customer_id'].nunique()
churn_false_count = combined_df.loc[active_rows, 'customer_id'].nunique()

print(f"Number of distinct customers with churn = True: {churn_true_count}")
print(f"Number of distinct customers with churn = False: {churn_false_count}")

In [None]:
# Show every row whose customer carries churn == True
combined_df[combined_df['churn'] == True]

# Example churned customer: customer_id = 1

In [None]:
# Show all rows recorded for customer_id 2
combined_df[combined_df['customer_id'] == 2]

In [None]:
# Earliest event date for customer 1.
# Series.min() is vectorized, skips missing dates, and returns a missing
# value instead of raising when the selection is empty (builtin min() raises).
combined_df.loc[combined_df['customer_id'] == 1, 'date'].min()

In [None]:
# Latest event date for customer 1.
# Series.max() is vectorized, skips missing dates, and returns a missing
# value instead of raising when the selection is empty (builtin max() raises).
combined_df.loc[combined_df['customer_id'] == 1, 'date'].max()

In [None]:
import matplotlib.pyplot as plt

# Customer whose in/out totals are plotted. The plot title is built from this
# value so the filter and the title can no longer disagree (the original
# filtered on customer 2 while the title said customer 1).
plot_customer_id = 2

# .copy() gives an independent frame, so adding columns below does not write
# into a slice of combined_df (avoids pandas' SettingWithCopyWarning).
customer_data = combined_df[combined_df['customer_id'] == plot_customer_id].copy()

# Total incoming and outgoing values across the three channels.
# NOTE(review): other cells in this notebook treat these columns as
# transaction counts (separate *_volume columns exist) — confirm that
# 'Amount' is the right axis label.
customer_data['total_in'] = customer_data['atm_transfer_in'] + customer_data['bank_transfer_in'] + customer_data['crypto_in']
customer_data['total_out'] = customer_data['atm_transfer_out'] + customer_data['bank_transfer_out'] + customer_data['crypto_out']

# Sum the totals per date
daily_data = customer_data.groupby('date')[['total_in', 'total_out']].sum()

# Plot both series against the date index
plt.figure(figsize=(10, 6))
plt.plot(daily_data.index, daily_data['total_in'], label='Total In', color='green')
plt.plot(daily_data.index, daily_data['total_out'], label='Total Out', color='red')

# Add labels and title
plt.title(f'Total Amount In and Out for Customer ID = {plot_customer_id} Over Time')
plt.xlabel('Date')
plt.ylabel('Amount')
plt.legend()

# Rotate date labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.tight_layout()
plt.show()



In [None]:
# Print dtypes, non-null counts and memory usage for the combined frame
combined_df.info()

In [None]:
# Bar chart of row counts per country, most frequent country first
country_order = combined_df['country'].value_counts().index

plt.figure(figsize=(12, 6))
sns.countplot(data=combined_df, x='country', order=country_order)

plt.title('Count of Entries by Country', fontsize=16)
plt.xlabel('Country', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Slanted, right-aligned tick labels keep long country names readable
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the rows flagged churn_due_to_fraud == True
fraud_rows = combined_df['churn_due_to_fraud'] == True
churn_true_df = combined_df[fraud_rows]

# Countries ordered by how many flagged rows they have
fraud_country_order = churn_true_df['country'].value_counts().index

# Red bar chart of flagged rows per country
plt.figure(figsize=(12, 6))
sns.countplot(
    data=churn_true_df,
    x='country',
    order=fraud_country_order,
    color='red',
)

# Title and axis labels
plt.title('Count of Entries by Country (Churn Due to Fraud Only)', fontsize=16)
plt.xlabel('Country', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)

# Fit everything inside the figure, then render
plt.tight_layout()
plt.show()

In [None]:
# Show all rows recorded for customer_id 3367
combined_df[combined_df["customer_id"] == 3367]

In [None]:
# Most recent date in the combined frame.
# Series.max() is vectorized and skips missing dates, unlike builtin max(),
# which iterates element by element in Python.
combined_df["date"].max()

In [None]:
# Missing-value count per column of `df`.
# NOTE(review): `df` is not the combined frame. If the cells ran top to
# bottom it is the last frame assigned inside the yearly loading loop, i.e.
# a single year's file — confirm that combined_df was not intended here.
print(df.isnull().sum())

In [None]:
# Preview the first rows of `df`
df.head()

In [None]:
# List the column names of `df`
df.columns

In [None]:
# Print dtypes, non-null counts and memory usage for `df`
df.info()

In [None]:
# Summary statistics for the columns of `df`
df.describe()

In [None]:
# Number of rows in `df` where churn_due_to_fraud is True
int((df['churn_due_to_fraud'] == True).sum())

In [None]:
# Bar chart of the churn_due_to_fraud classes, each bar labelled with its count
plt.figure(figsize=(6, 4))
ax = sns.countplot(x='churn_due_to_fraud', data=df)

for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2  # horizontal centre of the bar
    label_y = bar_height + 0.5                   # slightly above the bar top
    ax.text(label_x, label_y, int(bar_height), ha='center')

plt.title('Churn Due to Fraud Distribution')
plt.show()

In [60]:
# Pairwise correlations between the numeric columns of `df`.
# include='number' selects every numeric dtype; the previous
# ['float64', 'int64'] list silently skipped columns stored with other
# widths (e.g. int32 or float32).
numeric_cols = df.select_dtypes(include='number').columns
correlation_matrix = df[numeric_cols].corr()

In [None]:
# Correlation matrix for numeric columns, drawn as an annotated heatmap.
# include='number' selects every numeric dtype; the previous
# ['float64', 'int64'] list silently skipped columns stored with other
# widths (e.g. int32 or float32).
numeric_cols = df.select_dtypes(include='number').columns
correlation_matrix = df[numeric_cols].corr()

print("\nCorrelation Matrix:")
plt.figure(figsize=(10, 8))
# annot/fmt print each coefficient in its cell with two decimals
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Tally total rows and churn_due_to_fraud rows across the yearly files
file_path_pattern = "../data/train_{year}.parquet"
years = range(2008, 2024)

total_churn_due_to_fraud = 0
total_records = 0

for year in years:
    file_path = file_path_pattern.format(year=year)
    try:
        # NOTE: this rebinds `df`, so after the loop it holds the last file read
        df = pd.read_parquet(file_path)

        # Row count for this year is added before the column lookup below
        year_total_records = df.shape[0]
        total_records = total_records + year_total_records

        # Summing the flag column counts its True values
        churn_count = df['churn_due_to_fraud'].sum()
        total_churn_due_to_fraud = total_churn_due_to_fraud + churn_count

        print(f"Year {year}: {churn_count} records (Total records: {year_total_records})")
    except Exception as e:
        # A failing year is reported and skipped
        print(f"Error processing file for year {year}: {e}")

print(f"Total records across all datasets: {total_records}")
print(f"Total records where 'churn_due_to_fraud' is True: {total_churn_due_to_fraud}")

# Feature Engineering

In [63]:
# Start the feature frame from `df` without the identifier/free-text columns,
# then parse both date columns into datetimes
df_features = df.drop(columns=['Id', 'name', 'address'])
for date_col in ('date_of_birth', 'date'):
    df_features[date_col] = pd.to_datetime(df_features[date_col])

### Feature 1: Age

In [64]:
# Age in completed years on the row's date.
# Calendar-based instead of days // 365: dividing by 365 ignores leap days
# and reports the next age a few days before the actual birthday.
_event_date = df_features['date']
_birth_date = df_features['date_of_birth']
_birthday_pending = (_event_date.dt.month < _birth_date.dt.month) | (
    (_event_date.dt.month == _birth_date.dt.month)
    & (_event_date.dt.day < _birth_date.dt.day)
)
df_features['age'] = (
    _event_date.dt.year - _birth_date.dt.year - _birthday_pending.astype(int)
)

### Calculate transaction frequencies (how many transactions were made)

In [65]:
# Per channel: incoming column plus outgoing column
frequency_sources = (
    ('atm_transfer_frequency', 'atm_transfer_in', 'atm_transfer_out'),
    ('bank_transfer_frequency', 'bank_transfer_in', 'bank_transfer_out'),
    ('crypto_transfer_frequency', 'crypto_in', 'crypto_out'),
)
for target_col, incoming_col, outgoing_col in frequency_sources:
    df_features[target_col] = df_features[incoming_col] + df_features[outgoing_col]

### Calculate ratio of incoming to outgoing transfers

In [66]:
# Per channel: incoming divided by (outgoing + 1); the +1 keeps the
# denominator non-zero when there is no outgoing activity
ratio_sources = (
    ('atm_in_out_ratio', 'atm_transfer_in', 'atm_transfer_out'),
    ('bank_in_out_ratio', 'bank_transfer_in', 'bank_transfer_out'),
    ('crypto_in_out_ratio', 'crypto_in', 'crypto_out'),
)
for target_col, incoming_col, outgoing_col in ratio_sources:
    df_features[target_col] = df_features[incoming_col] / (df_features[outgoing_col] + 1)

### For categorical features (country, touchpoints, csat_scores, job)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Load the data
train_data = pd.read_parquet("../data/train_2023.parquet")
test_data = pd.read_parquet('../data/test.parquet')

# Check for missing values
print(train_data.isnull().sum())


def _count_touchpoints(raw):
    """Return the number of non-empty comma-separated entries in *raw*.

    Non-string values count as 0. Empty or whitespace-only entries are
    ignored, so an empty string yields 0 rather than 1.
    NOTE(review): if the parquet column holds lists/arrays rather than
    strings, every row would get 0 — confirm the column's type.
    """
    if not isinstance(raw, str):
        return 0
    return sum(1 for token in raw.split(',') if token.strip())


def _add_date_and_touchpoint_features(frame):
    """Add age, calendar and touchpoint-count columns to *frame* in place.

    Returns the same frame so the call can be used in an assignment.
    """
    # Parse both date columns into datetimes
    frame['date'] = pd.to_datetime(frame['date'])
    frame['date_of_birth'] = pd.to_datetime(frame['date_of_birth'])

    # Age in completed years on the row's date. Calendar-based instead of
    # days // 365, which ignores leap days and reports the next age a few
    # days before the actual birthday.
    event_date = frame['date']
    birth_date = frame['date_of_birth']
    birthday_pending = (event_date.dt.month < birth_date.dt.month) | (
        (event_date.dt.month == birth_date.dt.month)
        & (event_date.dt.day < birth_date.dt.day)
    )
    frame['age'] = (
        event_date.dt.year - birth_date.dt.year - birthday_pending.astype(int)
    )

    # Day of the week, month, and year features
    frame['day_of_week'] = event_date.dt.dayofweek
    frame['month'] = event_date.dt.month
    frame['year'] = event_date.dt.year

    # Number of touchpoints listed on the row
    frame['touchpoints_count'] = frame['touchpoints'].apply(_count_touchpoints)
    return frame


# The same steps run on both frames so train and test stay in sync
train_data = _add_date_and_touchpoint_features(train_data)
test_data = _add_date_and_touchpoint_features(test_data)

# Calculate the average CSAT score. The csat_scores values are handled both
# as real dicts and as the string form of a dict (parsed with ast).
import ast


def get_avg_csat_score(csat_str):
    """Return the mean of the values of a CSAT-score mapping, or NaN.

    Parameters
    ----------
    csat_str : str or dict
        Either a dict of scores or the string literal of one, e.g.
        "{'app': 4, 'phone': 2}".

    Returns
    -------
    float
        Mean of the dict's values. NaN when the input is empty, cannot be
        parsed, is not a dict, or holds values that cannot be averaged.
    """
    if isinstance(csat_str, dict):
        # Already a mapping: literal_eval would reject it, so use it as-is
        csat_dict = csat_str
    else:
        try:
            csat_dict = ast.literal_eval(csat_str)  # convert string to dictionary
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a parseable literal (includes None / NaN inputs)
            return np.nan

    try:
        return np.mean(list(csat_dict.values())) if csat_dict else np.nan
    except (AttributeError, TypeError, ValueError):
        # Parsed value was not a dict, or its values are not numeric
        return np.nan

# Add the per-row average CSAT score to both frames
for frame in (train_data, test_data):
    frame['avg_csat_score'] = frame['csat_scores'].apply(get_avg_csat_score)

# Transaction features: per-channel sums of the incoming and outgoing columns
def create_transaction_features(df):
    """Add five in+out sum columns to *df* in place and return *df*.

    Each new column is the element-wise sum of one incoming column and its
    outgoing counterpart. Note that 'atm_transfer_in_total' holds in + out
    despite its name.
    """
    summed_pairs = (
        ('atm_transfer_in_total', 'atm_transfer_in', 'atm_transfer_out'),
        ('bank_transfer_total', 'bank_transfer_in', 'bank_transfer_out'),
        ('crypto_transfer_total', 'crypto_in', 'crypto_out'),
        ('bank_transfer_volume', 'bank_transfer_in_volume', 'bank_transfer_out_volume'),
        ('crypto_transfer_volume', 'crypto_in_volume', 'crypto_out_volume'),
    )
    for target, incoming, outgoing in summed_pairs:
        df[target] = df[incoming] + df[outgoing]
    return df

train_data = create_transaction_features(train_data)
test_data = create_transaction_features(test_data)


def _rolling_mean_per_customer(frame, column, window=7):
    """Return the trailing mean of *column* over each customer's last *window* rows.

    Rows are ordered by date within each customer before the window is
    applied, so a window never mixes customers. The result keeps the frame's
    index, so assigning it back preserves the original row order.
    min_periods=1 averages whatever history exists instead of leaving every
    customer's first rows as NaN.
    NOTE(review): a 7-row window equals "7 days" only if there is one row
    per customer per day, and index alignment assumes unique index labels —
    confirm both against the data.
    """
    ordered = frame.sort_values(['customer_id', 'date'])
    return ordered.groupby('customer_id')[column].transform(
        lambda values: values.rolling(window, min_periods=1).mean()
    )


# Rolling averages over each customer's last 7 rows for transaction volume.
# A plain .rolling(7) on the whole frame averaged over raw row order and so
# blended unrelated customers together.
for frame in (train_data, test_data):
    frame['rolling_7_days_bank_transfer_volume'] = _rolling_mean_per_customer(
        frame, 'bank_transfer_volume'
    )
    frame['rolling_7_days_crypto_transfer_volume'] = _rolling_mean_per_customer(
        frame, 'crypto_transfer_volume'
    )

# Feature encoding for categorical variables (one-hot encoding).
# Both frames are given the category levels observed in train first, so
# get_dummies emits the same columns and drops the same baseline level for
# each. Encoding them independently let the two diverge whenever a level was
# missing from one of them. Test values unseen in train become all-zero rows.
categorical_cols = ['country', 'job', 'from_competitor']
for col in categorical_cols:
    shared_levels = pd.Categorical(train_data[col]).categories
    train_data[col] = pd.Categorical(train_data[col], categories=shared_levels)
    test_data[col] = pd.Categorical(test_data[col], categories=shared_levels)

train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

# Drop non-essential columns (identifiers, free text, and raw columns that
# have already been turned into features above)
drop_cols = ['Id', 'name', 'address', 'date_of_birth', 'date', 'csat_scores', 'customer_id']
train_data = train_data.drop(columns=drop_cols)
test_data = test_data.drop(columns=drop_cols)

In [None]:
# Print each training column with its positional index
for position, column_name in enumerate(train_data.columns):
    print(f"{position} {column_name}")

In [None]:
# Display the engineered training frame
train_data