# About

Questions:
- Are there any data abnormalities?
- What is a suitable inactivity window for churn?
- What are the baseline rates? (For benchmarking models later)

# Preparation

## Libraries

In [1]:
import pandas as pd

In [2]:
from dotenv import load_dotenv
import os

In [3]:
import maika_eda_pandas as mk

## Environment

In [4]:
# Load key/value pairs from a .env file into the process environment (python-dotenv),
# so the os.getenv lookups in the next cell can see them.
# The call's return value is the cell output; the recorded run shows True.
load_dotenv()

True

In [5]:
# Seed-file locations are read from the environment (presumably populated by the
# load_dotenv() call in the previous cell).
# A variable that is not set resolves to None here rather than raising.
SEED_CUSTOMERS = os.environ.get("SEED_CUSTOMERS")
SEED_TRANSACTIONS = os.environ.get("SEED_TRANSACTIONS")

## Data

In [6]:
# Build the customers CSV path one directory above the working directory,
# then read it into a DataFrame.
customers_csv_path = f"../{SEED_CUSTOMERS}"
customers_df = pd.read_csv(customers_csv_path)

In [7]:
# Build the transactions CSV path one directory above the working directory,
# then read it into a DataFrame.
transactions_csv_path = f"../{SEED_TRANSACTIONS}"
transactions_df = pd.read_csv(transactions_csv_path)

In [8]:
# Quick sanity check of the customers table: per the recorded output this prints
# the column count, column names, row count and a five-row preview.
mk.read_data_info(customers_df)

Number of columns: 3
Column names: ['customer_id', 'signup_date', 'true_lifetime_days']
Number of rows: 3,000
Data Preview: 

  customer_id signup_date  true_lifetime_days
0      C00000  2025-08-22                 204
1      C00001  2025-03-07                 365
2      C00002  2025-08-18                  48
3      C00003  2025-09-22                  84
4      C00004  2025-05-28                 113


In [9]:
# Quick sanity check of the transactions table: per the recorded output this prints
# the column count, column names, row count and a five-row preview.
mk.read_data_info(transactions_df)

Number of columns: 3
Column names: ['customer_id', 'transaction_date', 'amount']
Number of rows: 46,704
Data Preview: 

  customer_id transaction_date  amount
0      C00000       2025-09-10  195.78
1      C00000       2025-09-12   50.87
2      C00000       2025-10-01  133.25
3      C00000       2025-10-16   37.44
4      C00000       2025-10-18  101.95


# EDA

## customers_df

In [10]:
# Per-column profile of customers_df: dtype, non-null/missing counts, unique counts,
# and zero/negative counts (columns as shown in the recorded output).
# NOTE(review): the recorded run emits "Styler.applymap has been deprecated. Use Styler.map
# instead." -- presumably raised inside maika_eda_pandas; confirm and fix it there.
mk.data_overview_table(customers_df)


Styler.applymap has been deprecated. Use Styler.map instead.



Unnamed: 0,Data Type,Non Null Count,Non Null %,Missing Count,Missing %,Unique Count,Unique %,Zero Count,Negative Count,Zero %,Negative %,Row Count
customer_id,object,3000,100.00%,0,0.00%,3000,100.00%,0,0,0.00%,0.00%,3000
signup_date,object,3000,100.00%,0,0.00%,335,11.17%,0,0,0.00%,0.00%,3000
true_lifetime_days,int64,3000,100.00%,0,0.00%,330,11.00%,0,0,0.00%,0.00%,3000


## transactions_df

### Overview

In [11]:
# Per-column profile of transactions_df: dtype, non-null/missing counts, unique counts,
# and zero/negative counts (columns as shown in the recorded output).
# NOTE(review): the recorded run emits "Styler.applymap has been deprecated. Use Styler.map
# instead." -- presumably raised inside maika_eda_pandas; confirm and fix it there.
mk.data_overview_table(transactions_df)


Styler.applymap has been deprecated. Use Styler.map instead.



Unnamed: 0,Data Type,Non Null Count,Non Null %,Missing Count,Missing %,Unique Count,Unique %,Zero Count,Negative Count,Zero %,Negative %,Row Count
customer_id,object,46704,100.00%,0,0.00%,2892,6.19%,0,0,0.00%,0.00%,46704
transaction_date,object,46704,100.00%,0,0.00%,363,0.78%,0,0,0.00%,0.00%,46704
amount,float64,46704,100.00%,0,0.00%,15059,32.24%,0,0,0.00%,0.00%,46704


### Transform

In [19]:
# Parse the date strings first, then keep only the calendar-date component.
# `.dt.date` yields Python `datetime.date` objects, so the column remains object-dtype
# (not datetime64) after this cell; it is overwritten in place on transactions_df.
# NOTE(review): the recorded execution count jumps from In [11] to In [19] at this cell;
# confirm the notebook still passes Restart Kernel -> Run All.
parsed_transaction_dates = pd.to_datetime(transactions_df['transaction_date'])
transactions_df['transaction_date'] = parsed_transaction_dates.dt.date

### 1D

### transaction_date

In [20]:
# Frequency of transactions per calendar date: the helper returns a styled table and a figure,
# which the second helper is then given together.
# NOTE(review): presumably the second call renders the figure stacked with the table -- confirm
# against maika_eda_pandas; no output for this cell is recorded in the notebook.
styled_df, fig = mk.frequency_table_and_bar(transactions_df, 'transaction_date')
mk.stack_plotly_figure_with_styled_dataframe(fig, styled_df)

### amount

In [25]:
# Histogram of the transaction `amount` column.
# NOTE(review): `fig` is assigned but never displayed in this cell, and it overwrites the `fig`
# from the transaction_date cell; presumably the helper renders the plot itself -- confirm.
fig = mk.create_histogram_plotly(transactions_df, "amount")

In [26]:
# Distribution statistics for the transaction `amount` column.
# NOTE(review): the second argument is None -- presumably an optional grouping/filter value; confirm
# against maika_eda_pandas. The result is bound to the generic name `df` and not displayed here.
df = mk.distribution_statistics_table(transactions_df, None, 'amount')