<a href="https://colab.research.google.com/github/nihemelandu/churn_clv_prediction/blob/main/02a_Initial_Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gcsfs --quiet

In [2]:
# Authenticate this Colab session with the user's Google account.
# NOTE(review): presumably required so the gs:// paths read later in this
# notebook can be accessed through gcsfs — confirm.
from google.colab import auth
auth.authenticate_user()

In [3]:
import numpy as np
import pandas as pd

In [4]:
import warnings

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Notebook configuration - tweak these values here, not in the cells below
# ---------------------------------------------------------------------------

# Sampling
RANDOM_SEED = 42            # seed passed to NumPy before sampling
SAMPLE_RATE = 0.005         # per-row keep threshold compared against a uniform draw
MIN_SAMPLE_SIZE = 10_000    # combined sample smaller than this triggers a warning

# Validation thresholds
MAX_REASONABLE_PRICE = 10_000        # prices above this are counted as "high"
EXPECTED_DATE_START = '2019-10-01'   # earliest event_time expected in the sample
EXPECTED_DATE_END = '2019-12-01'     # latest event_time expected in the sample

print("✓ Libraries imported and configuration set")

✓ Libraries imported and configuration set


What data is used and where did this data come from?

Data Loading & Sampling

1.  Load data with appropriate sampling strategy
2.  Set random seed for reproducibility
3.  Document sample size and sampling rationale
4.  Verify that the sample loaded successfully

In [5]:
# Data Loading with Reproducible Sampling
#
# Reads each monthly CSV, keeping the header row plus a random subset of the
# data rows, then stacks the per-file samples into `sample_df`.

print("=== LOADING DATA WITH REPRODUCIBLE SAMPLING ===")

# Seed NumPy's global generator so the draws below start from a fixed state
np.random.seed(RANDOM_SEED)


def _skip_row(row_index):
    """Tell pandas whether to skip a row while reading.

    Row 0 is always kept; any other row is skipped whenever a fresh uniform
    draw is greater than SAMPLE_RATE. No draw is made for row 0.
    """
    return row_index > 0 and np.random.random() > SAMPLE_RATE


# Source files and the per-file samples collected from them
files = ['gs://churn_clv_data_bucket/2019-Oct.csv', 'gs://churn_clv_data_bucket/2019-Nov.csv']
samples = []

for path in files:
    print(f"\nLoading {path}...")
    try:
        monthly_sample = pd.read_csv(path, skiprows=_skip_row)
        print(f"✓ Loaded {len(monthly_sample):,} rows from {path}")
        samples.append(monthly_sample)
    except FileNotFoundError:
        print(f"✗ {path} not found")
    except Exception as e:
        # Best effort: report the failure and move on to the next file
        print(f"✗ Error loading {path}: {e}")

# Nothing loaded at all is fatal; a partial load is allowed to continue
if not samples:
    raise ValueError("No data could be loaded!")

sample_df = pd.concat(samples, ignore_index=True)
print(f"\n✓ Combined sample: {len(sample_df):,} rows, {sample_df.shape[1]} columns")

if len(sample_df) < MIN_SAMPLE_SIZE:
    print(f"⚠️  Warning: Sample size below minimum {MIN_SAMPLE_SIZE:,}")

=== LOADING DATA WITH REPRODUCIBLE SAMPLING ===

Loading gs://churn_clv_data_bucket/2019-Oct.csv...
✓ Loaded 212,521 rows from gs://churn_clv_data_bucket/2019-Oct.csv

Loading gs://churn_clv_data_bucket/2019-Nov.csv...
✓ Loaded 337,154 rows from gs://churn_clv_data_bucket/2019-Nov.csv

✓ Combined sample: 549,675 rows, 9 columns


Schema Validation

*   Data type consistency checks
*   Column name validation
*   Expected vs actual data types comparison
*   Data type optimization opportunities

In [6]:
# Schema Validation
#
# Compares the columns actually present in `sample_df` against the expected
# event schema and prints the pandas dtypes.

print("=== SCHEMA VALIDATION ===")

# Column names the event files are expected to carry
expected_columns = [
    'event_time', 'event_type', 'product_id', 'category_id', 'category_code',
    'brand', 'price', 'user_id', 'user_session',
]

actual_columns = sample_df.columns.tolist()
print(f"Actual columns: {actual_columns}")

# Differences in both directions, each kept in its source order
missing_cols = [name for name in expected_columns if name not in actual_columns]
extra_cols = [name for name in actual_columns if name not in expected_columns]

print(f"✗ Missing columns: {missing_cols}" if missing_cols else "✓ All expected columns present")

if extra_cols:
    print(f"⚠️  Extra columns: {extra_cols}")

# Dtypes as inferred by pandas at load time
print("\nData types:")
print(sample_df.dtypes)

=== SCHEMA VALIDATION ===
Actual columns: ['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']
✓ All expected columns present

Data types:
event_time        object
event_type        object
product_id         int64
category_id        int64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object


In [7]:
# Data Type Validation
#
# Checks that the numeric columns convert cleanly, parses `event_time` into a
# UTC datetime column (`event_time_parsed`, added to `sample_df`) and compares
# the observed date range with the configured expectation.

print("=== DATA TYPE VALIDATION ===")

# Numeric columns: conversion must succeed without coercion
numeric_cols = ['product_id', 'category_id', 'price', 'user_id']
for col in numeric_cols:
    if col not in sample_df.columns:
        continue
    try:
        pd.to_numeric(sample_df[col], errors='raise')
    except ValueError:
        print(f"✗ {col} has non-numeric values")
    else:
        print(f"✓ {col} is properly numeric")

# Timestamp column: parse, store, then range-check
if 'event_time' in sample_df.columns:
    try:
        parsed_times = pd.to_datetime(sample_df['event_time'], utc=True)
        sample_df['event_time_parsed'] = parsed_times

        # Observed bounds of the parsed timestamps
        min_date = parsed_times.min()
        max_date = parsed_times.max()

        print("✓ event_time parsed successfully")
        print(f"Date range: {min_date.date()} to {max_date.date()}")

        # Configured bounds, made timezone-aware so they compare with the parsed values
        expected_start = pd.to_datetime(EXPECTED_DATE_START, utc=True)
        expected_end = pd.to_datetime(EXPECTED_DATE_END, utc=True)

        out_of_range = min_date < expected_start or max_date > expected_end
        print(
            f"⚠️  Dates outside expected range {EXPECTED_DATE_START} to {EXPECTED_DATE_END}"
            if out_of_range
            else "✓ Dates within expected range"
        )

    except Exception as e:
        # Any parsing or comparison failure is reported rather than raised
        print(f"✗ event_time validation failed: {e}")

=== DATA TYPE VALIDATION ===
✓ product_id is properly numeric
✓ category_id is properly numeric
✓ price is properly numeric
✓ user_id is properly numeric
✓ event_time parsed successfully
Date range: 2019-10-01 to 2019-11-30
✓ Dates within expected range


In [8]:
# Data Quality Analysis
#
# Tabulates null counts and percentages per column of `sample_df` and shows
# only the columns that actually have nulls.

print("=== DATA QUALITY ANALYSIS ===")
print("Missing values by column:")

# Null count per column, and the same as a share of all rows
missing_summary = sample_df.isnull().sum()
missing_pct = (missing_summary / len(sample_df)) * 100

missing_df = pd.DataFrame({
    'Missing_Count': missing_summary,
    'Missing_Percentage': missing_pct.round(2)
})
missing_df = missing_df.sort_values(by='Missing_Count', ascending=False)

# Keep only the columns with at least one null
missing_issues = missing_df.loc[missing_df['Missing_Count'].gt(0)]
if missing_issues.empty:
    print("✓ No missing values found")
else:
    display(missing_issues)

=== DATA QUALITY ANALYSIS ===
Missing values by column:


Unnamed: 0,Missing_Count,Missing_Percentage
category_code,177156,32.23
brand,76458,13.91


In [10]:
# Uniqueness Analysis
#
# One summary row per column of `sample_df`: distinct-value count, that count
# as a percentage of all rows, and the column dtype.

print("=== UNIQUENESS ANALYSIS ===")


def _uniqueness_row(col):
    """Build the uniqueness summary record for a single column of sample_df."""
    unique_count = sample_df[col].nunique()
    unique_pct = (unique_count / len(sample_df)) * 100
    return {
        'Column': col,
        'Unique_Values': unique_count,
        'Uniqueness_Pct': round(unique_pct, 2),
        'Data_Type': str(sample_df[col].dtype)
    }


uniqueness_stats = [_uniqueness_row(col) for col in sample_df.columns]

uniqueness_df = pd.DataFrame(uniqueness_stats)
display(uniqueness_df)

=== UNIQUENESS ANALYSIS ===


Unnamed: 0,Column,Unique_Values,Uniqueness_Pct,Data_Type
0,event_time,504069,91.7,object
1,event_type,3,0.0,object
2,product_id,74253,13.51,int64
3,category_id,650,0.12,int64
4,category_code,128,0.02,object
5,brand,3039,0.55,object
6,price,35669,6.49,float64
7,user_id,426829,77.65,int64
8,user_session,530289,96.47,object
9,event_time_parsed,504069,91.7,"datetime64[ns, UTC]"


In [11]:
# Business Logic Validation
#
# Checks that only known event types occur and summarises suspicious prices
# (zero, negative, above MAX_REASONABLE_PRICE).

print("=== BUSINESS LOGIC VALIDATION ===")

# --- Event types -----------------------------------------------------------
if 'event_type' in sample_df.columns:
    print("Event types found:")
    event_counts = sample_df['event_type'].value_counts()
    display(event_counts)

    expected_events = ['view', 'cart', 'purchase', 'remove_from_cart']
    actual_events = set(sample_df['event_type'].unique())

    # Anything observed that is not on the expected list
    unexpected = actual_events - set(expected_events)

    print(f"⚠️  Unexpected event types: {unexpected}" if unexpected else "✓ All event types are expected")

# --- Prices ----------------------------------------------------------------
if 'price' in sample_df.columns:
    prices = sample_df['price']

    print("\nPrice validation:")
    print(prices.describe())

    # Row counts for each kind of suspicious price
    zero_prices = prices.eq(0).sum()
    negative_prices = prices.lt(0).sum()
    high_prices = prices.gt(MAX_REASONABLE_PRICE).sum()

    print("\nPrice issues:")
    print(f"Zero prices: {zero_prices:,}")
    print(f"Negative prices: {negative_prices:,}")
    print(f"High prices (>${MAX_REASONABLE_PRICE:,}): {high_prices:,}")

=== BUSINESS LOGIC VALIDATION ===
Event types found:


Unnamed: 0_level_0,count
event_type,Unnamed: 1_level_1
view,522020
cart,19381
purchase,8274


✓ All event types are expected

Price validation:
count    549675.000000
mean        292.024049
std         358.140139
min           0.000000
25%          67.960000
50%         164.980000
75%         360.110000
max        2574.070000
Name: price, dtype: float64

Price issues:
Zero prices: 1,224
Negative prices: 0
High prices (>$10,000): 0


In [12]:
# Memory Usage Analysis
#
# Reports the in-memory size of `sample_df` (total and per column) and
# extrapolates linearly to a full-dataset row count.

print("=== MEMORY USAGE ANALYSIS ===")

BYTES_PER_MB = 1024 * 1024

# Row count used for the extrapolation below.
# NOTE(review): this is hard-coded; the two sampled files at SAMPLE_RATE imply
# roughly len(sample_df) / SAMPLE_RATE rows instead — confirm which population
# this estimate is meant to describe.
FULL_DATASET_ROWS = 285_000_000

# Deep usage also counts the Python string objects held in object columns
memory_usage = sample_df.memory_usage(deep=True)
total_memory_mb = memory_usage.sum() / BYTES_PER_MB

print(f"Sample size: {len(sample_df):,} rows")
print(f"Sample memory: {total_memory_mb:.1f} MB")

print("\nMemory by column:")
for column_name, n_bytes in memory_usage.items():
    print(f"{column_name}: {n_bytes / BYTES_PER_MB:.1f} MB")

# Linear extrapolation from the sample to the full row count
scale_factor = FULL_DATASET_ROWS / len(sample_df)
estimated_gb = (total_memory_mb * scale_factor) / 1024
print(f"\nEstimated full dataset: {estimated_gb:.1f} GB")

=== MEMORY USAGE ANALYSIS ===
Sample size: 549,675 rows
Sample memory: 189.0 MB

Memory by column:
Index: 0.0 MB
event_time: 37.7 MB
event_type: 27.8 MB
product_id: 4.2 MB
category_id: 4.2 MB
category_code: 30.8 MB
brand: 27.1 MB
price: 4.2 MB
user_id: 4.2 MB
user_session: 44.6 MB
event_time_parsed: 4.2 MB

Estimated full dataset: 95.7 GB


In [13]:
# Capture Data Quality Issues for 02b
#
# Gathers the findings computed on `sample_df` (sample size, missing values,
# duplicates, price outliers, categorical summaries, dtypes) into one
# JSON-serializable dict and writes it to 'data_quality_issues.json'.
# If serialization or the write fails, the dict is pickled to
# 'data_quality_issues.pkl' instead.
import json

print("=== CAPTURING DATA QUALITY ISSUES ===")

row_count = len(sample_df)

# Structure all findings for 02b
data_quality_issues = {
    'sample_info': {
        'total_rows': row_count,
        # Counts every column currently on sample_df, including derived ones
        'total_columns': sample_df.shape[1],
        'memory_mb': float(sample_df.memory_usage(deep=True).sum() / (1024*1024))
    },
    'missing_values': {},
    'duplicates': {},
    'outliers': {},
    'categorical_issues': {},
    'data_types': {}
}

# Capture missing values (only columns that have at least one null)
missing_summary = sample_df.isnull().sum()
for col, n_missing in missing_summary[missing_summary > 0].items():
    data_quality_issues['missing_values'][col] = {
        'count': int(n_missing),
        'percentage': float((n_missing / row_count) * 100)
    }

# Capture duplicates: fully identical rows, plus rows that repeat the
# (user, product, event type, timestamp) combination
data_quality_issues['duplicates']['exact'] = int(sample_df.duplicated().sum())
business_key = ['user_id', 'product_id', 'event_type', 'event_time']
if all(col in sample_df.columns for col in business_key):
    business_dups = sample_df.duplicated(subset=business_key).sum()
    data_quality_issues['duplicates']['business_logic'] = int(business_dups)

# Capture price outliers
if 'price' in sample_df.columns:
    prices = sample_df['price']
    price_stats = prices.describe()
    data_quality_issues['outliers']['price'] = {
        'negative': int((prices < 0).sum()),
        'zero': int((prices == 0).sum()),
        # Use the configured threshold so the saved count always matches the
        # "High prices" figure reported by the business-logic validation
        'very_high': int((prices > MAX_REASONABLE_PRICE).sum()),
        # Plain floats so the describe() output is JSON-serializable
        'stats': {
            stat: float(price_stats[stat])
            for stat in ('count', 'mean', 'std', 'min', 'max', '25%', '50%', '75%')
        }
    }

# Capture categorical issues (with JSON-safe conversion)
for col in ['event_type', 'brand', 'category_code']:
    if col not in sample_df.columns:
        continue

    # Five most frequent values. value_counts() excludes nulls by default, so
    # the 'NaN' key is only a safeguard.
    top_values_dict = {
        (str(key) if pd.notna(key) else 'NaN'): int(count)
        for key, count in sample_df[col].value_counts().head().items()
    }

    # First five non-null values as they appear in the sample
    sample_values = [str(val) for val in sample_df[col].dropna().head(5)]

    data_quality_issues['categorical_issues'][col] = {
        'unique_count': int(sample_df[col].nunique()),
        'top_values': top_values_dict,
        'sample_values': sample_values
    }

# Capture data types (convert pandas dtypes to strings)
for col, dtype in sample_df.dtypes.items():
    data_quality_issues['data_types'][col] = str(dtype)

# Save for 02b
try:
    # Serialize before opening the file so a serialization error cannot leave
    # a truncated JSON file behind
    payload = json.dumps(data_quality_issues, indent=2)
    with open('data_quality_issues.json', 'w') as f:
        f.write(payload)
    print("✓ Data quality issues saved to 'data_quality_issues.json'")
except Exception as e:
    print(f"✗ JSON save failed: {e}")
    # Fallback to pickle
    import pickle
    with open('data_quality_issues.pkl', 'wb') as f:
        pickle.dump(data_quality_issues, f)
    print("✓ Data quality issues saved to 'data_quality_issues.pkl' (pickle format)")

# Display summary
print(f"\n=== ISSUES SUMMARY ===")
print(f"Missing value columns: {list(data_quality_issues['missing_values'].keys())}")
print(f"Duplicate counts: {data_quality_issues['duplicates']}")
if 'price' in data_quality_issues['outliers']:
    price_issues = data_quality_issues['outliers']['price']
    print(f"Price issues - Negative: {price_issues['negative']}, Zero: {price_issues['zero']}, High: {price_issues['very_high']}")

print(f"\n✅ Ready for Notebook 02b - Data Cleaning")

=== CAPTURING DATA QUALITY ISSUES ===
✓ Data quality issues saved to 'data_quality_issues.json'

=== ISSUES SUMMARY ===
Missing value columns: ['category_code', 'brand']
Duplicate counts: {'exact': 4, 'business_logic': 5}
Price issues - Negative: 0, Zero: 1224, High: 0

✅ Ready for Notebook 02b - Data Cleaning


In [14]:
# Summary and Next Steps
#
# Prints a checklist of the completed stages, then headline figures taken
# from `sample_df`.

print("=== EXPLORATION SUMMARY ===")
for completed_step in (
    "Reproducible sampling completed",
    "Schema validation performed",
    "Data quality issues identified",
    "Memory usage analyzed",
):
    print(f"✓ {completed_step}")

# Headline numbers; the date bounds come from the parsed timestamp column
parsed_times = sample_df['event_time_parsed']
first_day = parsed_times.min().date()
last_day = parsed_times.max().date()

print("\nKey findings:")
print(f"- Sample size: {len(sample_df):,} rows")
print(f"- Date range: {first_day} to {last_day}")
print(f"- Unique users: {sample_df['user_id'].nunique():,}")
print(f"- Event types: {sample_df['event_type'].nunique()}")

print("\nReady for Notebook 02b - Data Cleaning")

=== EXPLORATION SUMMARY ===
✓ Reproducible sampling completed
✓ Schema validation performed
✓ Data quality issues identified
✓ Memory usage analyzed

Key findings:
- Sample size: 549,675 rows
- Date range: 2019-10-01 to 2019-11-30
- Unique users: 426,829
- Event types: 3

Ready for Notebook 02b - Data Cleaning


Your data tells you what it wants to be:
→ Comparing categories? Bar charts work great
→ Showing change over time? Line charts are your friend
→ Looking for relationships? Use scatter plots
→ Breaking down a whole? Pie charts (sparingly) or stacked bars
→ Showing distribution? Histograms tell the real story

I now spend 5 minutes before every analysis asking myself:
— Am I comparing?
— Am I tracking change?
— Am I looking for patterns?
— Am I showing composition?