In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Confirm the imports worked and record the library versions this run used.
print(
    "Libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

Libraries imported successfully!
Pandas version: 2.2.3
NumPy version: 2.1.3


In [15]:
# Read the two pre-cleaned CSV exports; both paths are relative to the
# notebook's working directory.
transactions_df = pd.read_csv('Merged_Customer_Transaction_Data_Cleaned.csv')
products_df = pd.read_csv('Product_Offering_Data_Cleaned.csv')

# Report the (rows, columns) of each frame as a quick sanity check on the load.
print(
    "Datasets loaded successfully! Amazing!",
    f"\nTransaction Data shape: {transactions_df.shape}",
    f"Product Data shape: {products_df.shape}",
    sep="\n",
)

Datasets loaded successfully! Amazing!

Transaction Data shape: (5000, 8)
Product Data shape: (10, 5)


In [3]:
# Banner, shape and column names of the transaction frame, followed by a
# rich-rendered preview of its first ten rows.
print(
    "=" * 60,
    "TRANSACTION DATA - OVERVIEW",
    "=" * 60,
    f"\nShape: {transactions_df.shape}",
    f"Columns: {transactions_df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
display(transactions_df.head(10))

TRANSACTION DATA - OVERVIEW

Shape: (5000, 8)
Columns: ['Transaction_ID', 'Customer_ID', 'Transaction_Date', 'Transaction_Amount', 'Transaction_Type', 'Satisfaction_Score', 'Feedback_Comments', 'Likelihood_to_Recommend']

First few rows:


Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Date,Transaction_Amount,Transaction_Type,Satisfaction_Score,Feedback_Comments,Likelihood_to_Recommend
0,1,393,2023-01-01 00:00:00,3472.0,Purchase,1.0,Excellent,3
1,2,826,2023-01-01 01:00:00,2460.0,Bill Payment,6.0,Needs Improvement,3
2,3,916,2023-01-01 02:00:00,10.0,Purchase,6.0,Needs Improvement,3
3,4,109,2023-01-01 03:00:00,72.0,Investment,3.0,Good Service,9
4,5,889,2023-01-01 04:00:00,1793.0,Investment,8.0,Very Satisfied,3
5,6,348,2023-01-01 05:00:00,3824.0,Loan Payment,6.0,Unsatisfactory,8
6,7,50,2023-01-01 06:00:00,235.0,Loan Payment,6.0,Unsatisfactory,4
7,8,916,2023-01-01 07:00:00,1052.0,Loan Payment,6.0,Needs Improvement,3
8,9,105,2023-01-01 08:00:00,854.0,Purchase,8.0,Good Service,4
9,10,420,2023-01-01 09:00:00,2690.0,Investment,5.0,Good Service,8


In [4]:
# Banner, shape and column names of the product frame; the whole frame is
# displayed afterwards.
print(
    "\n" + "=" * 60,
    "PRODUCT DATA - OVERVIEW",
    "=" * 60,
    f"\nShape: {products_df.shape}",
    f"Columns: {products_df.columns.tolist()}",
    "\nAll rows:",
    sep="\n",
)
display(products_df)


PRODUCT DATA - OVERVIEW

Shape: (10, 5)
Columns: ['Product_ID', 'Product_Name', 'Product_Type', 'Risk_Level', 'Target_Income_Group']

All rows:


Unnamed: 0,Product_ID,Product_Name,Product_Type,Risk_Level,Target_Income_Group
0,1,Platinum Credit Card,Credit Card,Medium,Medium
1,2,Gold Savings Account,Savings Account,Low,Low
2,3,High-Yield Investment Account,Investment,High,High
3,4,Mortgage Loan,Loan,Medium,High
4,5,Auto Loan,Loan,Medium,Medium
5,6,Personal Loan,Loan,Medium,Low
6,7,Youth Savings Account,Savings Account,Low,Low
7,8,Retirement Investment Fund,Investment,High,High
8,9,Business Loan,Loan,Medium,Medium
9,10,Travel Credit Card,Credit Card,Medium,Medium


In [None]:
# Work on a copy so the frame loaded from disk stays untouched.
df = transactions_df.copy()

print(
    "=" * 60,
    "STEP 1: SPLITTING TRANSACTION_DATE INTO DATE AND TIME",
    "=" * 60,
    sep="\n",
)

# Parse the timestamp column, then derive the calendar date and the clock
# time from it as two separate columns.
parsed_timestamps = pd.to_datetime(df['Transaction_Date'])
df['Transaction_Date'] = parsed_timestamps
df['Transaction_Date_Only'] = parsed_timestamps.dt.date
df['Transaction_Time'] = parsed_timestamps.dt.time

# "Before" is read from the untouched source frame, "after" from the copy.
print(
    "\nBefore split:",
    f"  Transaction_Date dtype: {transactions_df['Transaction_Date'].dtype}",
    "\nAfter split:",
    f"  Transaction_Date_Only dtype: {df['Transaction_Date_Only'].dtype}",
    f"  Transaction_Time dtype: {df['Transaction_Time'].dtype}",
    sep="\n",
)

# Overwrite the full timestamp with its date part and discard the helper column.
df['Transaction_Date'] = df['Transaction_Date_Only']
df = df.drop(columns='Transaction_Date_Only')

# Put the new time column right after the date; all other columns keep
# their original relative order.
column_order = [
    'Transaction_ID',
    'Customer_ID',
    'Transaction_Date',
    'Transaction_Time',
    'Transaction_Amount',
    'Transaction_Type',
    'Satisfaction_Score',
    'Feedback_Comments',
    'Likelihood_to_Recommend',
]
df = df[column_order]

print("\nSample data after split:")
display(df.head(10))

STEP 1: SPLITTING TRANSACTION_DATE INTO DATE AND TIME

Before split:
  Transaction_Date dtype: object

After split:
  Transaction_Date_Only dtype: object
  Transaction_Time dtype: object

Sample data after split:


Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Date,Transaction_Time,Transaction_Amount,Transaction_Type,Satisfaction_Score,Feedback_Comments,Likelihood_to_Recommend
0,1,393,2023-01-01,00:00:00,3472.0,Purchase,1.0,Excellent,3
1,2,826,2023-01-01,01:00:00,2460.0,Bill Payment,6.0,Needs Improvement,3
2,3,916,2023-01-01,02:00:00,10.0,Purchase,6.0,Needs Improvement,3
3,4,109,2023-01-01,03:00:00,72.0,Investment,3.0,Good Service,9
4,5,889,2023-01-01,04:00:00,1793.0,Investment,8.0,Very Satisfied,3
5,6,348,2023-01-01,05:00:00,3824.0,Loan Payment,6.0,Unsatisfactory,8
6,7,50,2023-01-01,06:00:00,235.0,Loan Payment,6.0,Unsatisfactory,4
7,8,916,2023-01-01,07:00:00,1052.0,Loan Payment,6.0,Needs Improvement,3
8,9,105,2023-01-01,08:00:00,854.0,Purchase,8.0,Good Service,4
9,10,420,2023-01-01,09:00:00,2690.0,Investment,5.0,Good Service,8


In [6]:
print(
    "\n" + "=" * 60,
    "STEP 2: EXTRACTING TEMPORAL FEATURES",
    "=" * 60,
    sep="\n",
)

# Step 1 left the date and time as separate columns, so rebuild a full
# timestamp from their string forms to get access to the .dt accessor again.
date_text = df['Transaction_Date'].astype(str)
time_text = df['Transaction_Time'].astype(str)
temp_datetime = pd.to_datetime(date_text + ' ' + time_text)

# Calendar / clock components of each transaction.
df['Day_of_Week'] = temp_datetime.dt.day_name()          # 'Monday', 'Tuesday', ...
df['Hour_of_Day'] = temp_datetime.dt.hour                # 0-23
df['Month'] = temp_datetime.dt.month                     # 1-12
df['Week_of_Year'] = temp_datetime.dt.isocalendar().week # ISO week number

# 1 when the transaction hour is 9..16 inclusive (i.e. 9 AM up to, not including, 5 PM).
in_business_hours = df['Hour_of_Day'].ge(9) & df['Hour_of_Day'].lt(17)
df['Is_Business_Hours'] = in_business_hours.astype(int)

# 1 when the transaction fell on a Saturday or Sunday.
on_weekend = df['Day_of_Week'].isin(['Saturday', 'Sunday'])
df['Is_Weekend'] = on_weekend.astype(int)

print(
    "\nTemporal features created:",
    "  - Day_of_Week",
    "  - Hour_of_Day (0-23)",
    "  - Month (1-12)",
    "  - Week_of_Year",
    "  - Is_Business_Hours (1=9AM-5PM, 0=other)",
    "  - Is_Weekend (1=Sat/Sun, 0=weekday)",
    sep="\n",
)

print("\nSample:")
preview_columns = [
    'Transaction_ID', 'Transaction_Date', 'Transaction_Time', 'Day_of_Week',
    'Hour_of_Day', 'Is_Business_Hours', 'Is_Weekend',
]
display(df[preview_columns].head(10))


STEP 2: EXTRACTING TEMPORAL FEATURES

Temporal features created:
  - Day_of_Week
  - Hour_of_Day (0-23)
  - Month (1-12)
  - Week_of_Year
  - Is_Business_Hours (1=9AM-5PM, 0=other)
  - Is_Weekend (1=Sat/Sun, 0=weekday)

Sample:


Unnamed: 0,Transaction_ID,Transaction_Date,Transaction_Time,Day_of_Week,Hour_of_Day,Is_Business_Hours,Is_Weekend
0,1,2023-01-01,00:00:00,Sunday,0,0,1
1,2,2023-01-01,01:00:00,Sunday,1,0,1
2,3,2023-01-01,02:00:00,Sunday,2,0,1
3,4,2023-01-01,03:00:00,Sunday,3,0,1
4,5,2023-01-01,04:00:00,Sunday,4,0,1
5,6,2023-01-01,05:00:00,Sunday,5,0,1
6,7,2023-01-01,06:00:00,Sunday,6,0,1
7,8,2023-01-01,07:00:00,Sunday,7,0,1
8,9,2023-01-01,08:00:00,Sunday,8,0,1
9,10,2023-01-01,09:00:00,Sunday,9,1,1


In [7]:
print("\n" + "=" * 60)
print("STEP 3: CREATING PRODUCT RISK MAPPING")
print("=" * 60)

# Hand-maintained lookup from Transaction_Type to the product characteristics
# it is taken to imply. It is written out here rather than derived from
# products_df, so it must be updated manually if the product table changes.
transaction_to_product_mapping = {
    'Purchase': {'typical_product_type': 'Credit Card', 'risk_level': 'Medium', 'income_group': 'Medium'},
    'Bill Payment': {'typical_product_type': 'Credit Card', 'risk_level': 'Medium', 'income_group': 'Medium'},
    'Investment': {'typical_product_type': 'Investment', 'risk_level': 'High', 'income_group': 'High'},
    'Loan Payment': {'typical_product_type': 'Loan', 'risk_level': 'Medium', 'income_group': 'Medium'}
}

# Flatten the nested mapping into one {transaction type -> label} dict per
# attribute so Series.map can do a plain dict lookup instead of calling a
# Python lambda for every row.
risk_level_by_type = {
    transaction_type: attrs['risk_level']
    for transaction_type, attrs in transaction_to_product_mapping.items()
}
income_group_by_type = {
    transaction_type: attrs['income_group']
    for transaction_type, attrs in transaction_to_product_mapping.items()
}

# A dict-based map would silently yield NaN for an unknown (or missing)
# transaction type. The previous lambda lookup raised KeyError in that case;
# keep that contract, but report every offending value in one message.
unmapped_types = set(df['Transaction_Type'].unique()) - set(transaction_to_product_mapping)
if unmapped_types:
    raise KeyError(
        f"No product mapping for transaction type(s): {sorted(unmapped_types, key=str)}"
    )

# Attach the implied risk level and income group to every transaction.
df['Implied_Risk_Level'] = df['Transaction_Type'].map(risk_level_by_type)
df['Implied_Income_Group'] = df['Transaction_Type'].map(income_group_by_type)

# Print the summary straight from the mapping so it can never drift from the
# values actually applied above.
print("\nProduct Risk Mapping created:")
for transaction_type, attrs in transaction_to_product_mapping.items():
    print(f"  {transaction_type} → Risk: {attrs['risk_level']}, Income: {attrs['income_group']}")

print("\nSample:")
display(df[['Customer_ID', 'Transaction_Type', 'Implied_Risk_Level', 'Implied_Income_Group']].head(10))


STEP 3: CREATING PRODUCT RISK MAPPING

Product Risk Mapping created:
  Purchase → Risk: Medium, Income: Medium
  Bill Payment → Risk: Medium, Income: Medium
  Investment → Risk: High, Income: High
  Loan Payment → Risk: Medium, Income: Medium

Sample:


Unnamed: 0,Customer_ID,Transaction_Type,Implied_Risk_Level,Implied_Income_Group
0,393,Purchase,Medium,Medium
1,826,Bill Payment,Medium,Medium
2,916,Purchase,Medium,Medium
3,109,Investment,High,High
4,889,Investment,High,High
5,348,Loan Payment,Medium,Medium
6,50,Loan Payment,Medium,Medium
7,916,Loan Payment,Medium,Medium
8,105,Purchase,Medium,Medium
9,420,Investment,High,High


In [8]:
print("\n" + "=" * 60)
print("STEP 4: AGGREGATING TO CUSTOMER LEVEL")
print("=" * 60)

# Everything below collapses the transaction-level frame to one row per Customer_ID.

# Spending features: sum / mean / max / min / std / count of Transaction_Amount.
# The std is NaN for customers with a single transaction.
spending_features = df.groupby('Customer_ID').agg({
    'Transaction_Amount': ['sum', 'mean', 'max', 'min', 'std', 'count'],
}).round(2)
# Flatten the two-level column index produced by the list-valued agg spec.
spending_features.columns = ['Total_Spending', 'Avg_Transaction_Amount', 'Max_Transaction',
                             'Min_Transaction', 'Std_Transaction_Amount', 'Transaction_Count']

# Satisfaction features: summary statistics of the two survey-style columns.
satisfaction_features = df.groupby('Customer_ID').agg({
    'Satisfaction_Score': ['mean', 'std', 'min', 'max'],
    'Likelihood_to_Recommend': ['mean', 'min', 'max']
}).round(2)
satisfaction_features.columns = ['Avg_Satisfaction_Score', 'Std_Satisfaction',
                                 'Min_Satisfaction', 'Max_Satisfaction',
                                 'Avg_Likelihood_to_Recommend', 'Min_Likelihood', 'Max_Likelihood']

# Transaction type distribution: one Count_<type> column per transaction type,
# with 0 where a customer never used that type.
transaction_type_dist = df.groupby(['Customer_ID', 'Transaction_Type']).size().unstack(fill_value=0)
transaction_type_dist.columns = [f'Count_{col}' for col in transaction_type_dist.columns]

# Share of each transaction type per customer, as a percentage (0-100).
# The per-customer total must be taken ONCE, from the count columns only.
# Recomputing transaction_type_dist.sum(axis=1) inside the loop would also add
# the Pct_ columns created by earlier iterations to the denominator, which
# understates every percentage after the first.
count_columns = list(transaction_type_dist.columns)
transactions_per_customer = transaction_type_dist[count_columns].sum(axis=1)
for col in count_columns:
    transaction_type_dist[col.replace('Count_', 'Pct_')] = (
        transaction_type_dist[col] / transactions_per_customer * 100
    ).round(2)

print("\nCustomer-level aggregations:")
print("  Spending features: Total, Avg, Max, Min, Std, Count")
print("  Satisfaction features: Avg, Std, Min, Max, Likelihood metrics")
print("  Transaction type distribution: Counts and percentages")

print("\nSpending features sample:")
display(spending_features.head())


STEP 4: AGGREGATING TO CUSTOMER LEVEL

Customer-level aggregations:
  Spending features: Total, Avg, Max, Min, Std, Count
  Satisfaction features: Avg, Std, Min, Max, Likelihood metrics
  Transaction type distribution: Counts and percentages

Spending features sample:


Unnamed: 0_level_0,Total_Spending,Avg_Transaction_Amount,Max_Transaction,Min_Transaction,Std_Transaction_Amount,Transaction_Count
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,16836.0,2806.0,4993.0,156.0,2062.31,6
2,4907.0,2453.5,2850.0,2057.0,560.74,2
3,1538.0,1538.0,1538.0,1538.0,,1
4,8295.0,4147.5,4736.0,3559.0,832.26,2
5,14798.0,2959.6,4878.0,1508.0,1386.58,5


In [9]:
print("\n" + "=" * 60)
print("STEP 5: TEMPORAL BEHAVIOR FEATURES")
print("=" * 60)

# Time-based behaviour per customer. The mean of a 0/1 flag is the FRACTION of
# transactions with that flag set, so the two Pct_* columns below are on a
# 0-1 scale, unlike the Pct_<type> columns from Step 4, which are on 0-100.
temporal_features = df.groupby('Customer_ID').agg({
    'Is_Business_Hours': 'mean',     # fraction of transactions during business hours
    'Is_Weekend': 'mean',            # fraction of transactions on weekends
    'Hour_of_Day': ['mean', 'std'],  # typical transaction hour and its spread
}).round(2)
temporal_features.columns = ['Pct_Business_Hours', 'Pct_Weekend_Transactions',
                             'Avg_Hour_of_Day', 'Std_Hour_of_Day']

# Transaction_Date holds plain date objects after Step 1; convert via str to
# get real timestamps for date arithmetic.
temp_dates = pd.to_datetime(df['Transaction_Date'].astype(str))

# First/last transaction date per customer. reset_index(drop=True) discards the
# Customer_ID index; this frame is not used by any other visible cell and is
# kept only so that existing references to the name keep working.
date_features = df.groupby('Customer_ID').agg({
    'Transaction_Date': ['min', 'max']  # First and last transaction
}).reset_index(drop=True)

# Reference "today" for recency: the latest transaction date in the data set.
max_date = temp_dates.max()
first_trans = df.groupby('Customer_ID')['Transaction_Date'].min()
last_trans = df.groupby('Customer_ID')['Transaction_Date'].max()

# Convert each boundary once and reuse it for both day counts.
first_seen = pd.to_datetime(first_trans)
last_seen = pd.to_datetime(last_trans)

# Tenure is the span between a customer's first and last transaction, so it is
# 0 for single-transaction customers.
# NOTE(review): if "days since first transaction" (max_date - first_seen) was
# the intended definition, change the line below; the data is left as-is here.
tenure_days = (last_seen - first_seen).dt.days
# Recency is the number of days between the customer's last transaction and max_date.
recency_days = (max_date - last_seen).dt.days

# Both series are indexed by Customer_ID, so they align directly.
tenure_features = pd.DataFrame({
    'Tenure_Days': tenure_days,
    'Days_Since_Last_Transaction': recency_days
})
tenure_features.index.name = 'Customer_ID'

print("\nTemporal behavior features:")
print("  - Pct_Business_Hours: fraction (0-1) of transactions during 9AM-5PM")
print("  - Pct_Weekend_Transactions: fraction (0-1) of transactions on weekends")
print("  - Avg_Hour_of_Day: Average hour customer transacts (0-23)")
print("  - Tenure_Days: Days between first and last transaction")
print("  - Days_Since_Last_Transaction: Recency metric")

print("\nSample:")
display(temporal_features.head())


STEP 5: TEMPORAL BEHAVIOR FEATURES

Temporal behavior features:
  - Pct_Business_Hours: % of transactions during 9AM-5PM
  - Pct_Weekend_Transactions: % of transactions on weekends
  - Avg_Hour_of_Day: Average hour customer transacts (0-23)
  - Tenure_Days: Days since first transaction
  - Days_Since_Last_Transaction: Recency metric

Sample:


Unnamed: 0_level_0,Pct_Business_Hours,Pct_Weekend_Transactions,Avg_Hour_of_Day,Std_Hour_of_Day
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.17,0.17,6.83,6.85
2,0.0,0.5,13.5,13.44
3,0.0,0.0,23.0,
4,0.5,0.5,8.0,9.9
5,0.6,0.4,12.8,8.38


In [10]:
print(
    "\n" + "=" * 60,
    "STEP 6: RISK AND ENGAGEMENT FEATURES",
    "=" * 60,
    sep="\n",
)

# Risk exposure: per customer, how many transactions carry Implied_Risk_Level == 'High'.
risk_features = (
    df.groupby('Customer_ID')
      .agg({'Implied_Risk_Level': lambda levels: (levels == 'High').sum()})
      .rename(columns={'Implied_Risk_Level': 'High_Risk_Transaction_Count'})
)

# Engagement breadth: number of distinct transaction types each customer has used.
engagement_breadth = (
    df.groupby('Customer_ID')['Transaction_Type']
      .nunique()
      .to_frame('Engagement_Breadth')
)

# Ordinal score for the canned feedback comments (5 = best, 1 = worst).
# Any comment not listed here maps to NaN.
feedback_sentiment = {
    'Excellent': 5,
    'Very Satisfied': 4,
    'Good Service': 3,
    'Needs Improvement': 2,
    'Unsatisfactory': 1
}
df['Feedback_Sentiment'] = df['Feedback_Comments'].map(feedback_sentiment)

# Per-customer mean and standard deviation of the sentiment score.
feedback_features = (
    df.groupby('Customer_ID')['Feedback_Sentiment']
      .agg(['mean', 'std'])
      .round(2)
)
feedback_features.columns = ['Avg_Feedback_Sentiment', 'Std_Feedback_Sentiment']

print(
    "\nRisk and Engagement features:",
    "  - High_Risk_Transaction_Count: # of high-risk (Investment) transactions",
    "  - Engagement_Breadth: # of different transaction types used",
    "  - Avg_Feedback_Sentiment: Average sentiment (1=Poor, 5=Excellent)",
    "  - Std_Feedback_Sentiment: Consistency of sentiment",
    sep="\n",
)

print("\nSample:")
display(risk_features.head())
print("\nEngagement:")
display(engagement_breadth.head())


STEP 6: RISK AND ENGAGEMENT FEATURES

Risk and Engagement features:
  - High_Risk_Transaction_Count: # of high-risk (Investment) transactions
  - Engagement_Breadth: # of different transaction types used
  - Avg_Feedback_Sentiment: Average sentiment (1=Poor, 5=Excellent)
  - Std_Feedback_Sentiment: Consistency of sentiment

Sample:


Unnamed: 0_level_0,High_Risk_Transaction_Count
Customer_ID,Unnamed: 1_level_1
1,1
2,0
3,0
4,0
5,1



Engagement:


Unnamed: 0_level_0,Engagement_Breadth
Customer_ID,Unnamed: 1_level_1
1,4
2,2
3,1
4,2
5,3


In [11]:
print(
    "\n" + "=" * 60,
    "STEP 7: COMBINING ALL FEATURES INTO ONE DATAFRAME",
    "=" * 60,
    sep="\n",
)

# Every per-customer frame is indexed by Customer_ID, so a column-wise concat
# lines them up into a single wide table.
feature_frames = [
    spending_features,
    satisfaction_features,
    transaction_type_dist,
    temporal_features,
    tenure_features,
    risk_features,
    engagement_breadth,
    feedback_features,
]
customer_features = pd.concat(feature_frames, axis=1)

# Move Customer_ID out of the index into a regular column; the rename only
# matters if the index came back unnamed (reset_index then calls it 'index').
customer_features = (
    customer_features
    .reset_index()
    .rename(columns={'index': 'Customer_ID'})
)

# Replace NaN with 0 in every float64/int64 column. This covers all numeric
# gaps, including the std-type features that are NaN for customers with a
# single transaction, not only transaction-type columns.
for col in customer_features.select_dtypes(include=['float64', 'int64']).columns:
    customer_features[col] = customer_features[col].fillna(0)

print(f"\nFinal engineered dataset shape: {customer_features.shape}")
print(f"\nColumns ({len(customer_features.columns)}):")
for position, column_name in enumerate(customer_features.columns, start=1):
    print(f"  {position}. {column_name}")

print("\nFirst few customers:")
display(customer_features.head())


STEP 7: COMBINING ALL FEATURES INTO ONE DATAFRAME

Final engineered dataset shape: (993, 32)

Columns (32):
  1. Customer_ID
  2. Total_Spending
  3. Avg_Transaction_Amount
  4. Max_Transaction
  5. Min_Transaction
  6. Std_Transaction_Amount
  7. Transaction_Count
  8. Avg_Satisfaction_Score
  9. Std_Satisfaction
  10. Min_Satisfaction
  11. Max_Satisfaction
  12. Avg_Likelihood_to_Recommend
  13. Min_Likelihood
  14. Max_Likelihood
  15. Count_Bill Payment
  16. Count_Investment
  17. Count_Loan Payment
  18. Count_Purchase
  19. Pct_Bill Payment
  20. Pct_Investment
  21. Pct_Loan Payment
  22. Pct_Purchase
  23. Pct_Business_Hours
  24. Pct_Weekend_Transactions
  25. Avg_Hour_of_Day
  26. Std_Hour_of_Day
  27. Tenure_Days
  28. Days_Since_Last_Transaction
  29. High_Risk_Transaction_Count
  30. Engagement_Breadth
  31. Avg_Feedback_Sentiment
  32. Std_Feedback_Sentiment

First few customers:


Unnamed: 0,Customer_ID,Total_Spending,Avg_Transaction_Amount,Max_Transaction,Min_Transaction,Std_Transaction_Amount,Transaction_Count,Avg_Satisfaction_Score,Std_Satisfaction,Min_Satisfaction,...,Pct_Business_Hours,Pct_Weekend_Transactions,Avg_Hour_of_Day,Std_Hour_of_Day,Tenure_Days,Days_Since_Last_Transaction,High_Risk_Transaction_Count,Engagement_Breadth,Avg_Feedback_Sentiment,Std_Feedback_Sentiment
0,1,16836.0,2806.0,4993.0,156.0,2062.31,6,10.0,0.0,10.0,...,0.17,0.17,6.83,6.85,181,26,1,4,4.0,0.0
1,2,4907.0,2453.5,2850.0,2057.0,560.74,2,3.0,0.0,3.0,...,0.0,0.5,13.5,13.44,104,68,0,2,4.0,0.0
2,3,1538.0,1538.0,1538.0,1538.0,0.0,1,10.0,0.0,10.0,...,0.0,0.0,23.0,0.0,0,151,0,1,4.0,0.0
3,4,8295.0,4147.5,4736.0,3559.0,832.26,2,7.0,0.0,7.0,...,0.5,0.5,8.0,9.9,159,28,0,2,2.0,0.0
4,5,14798.0,2959.6,4878.0,1508.0,1386.58,5,8.0,0.0,8.0,...,0.6,0.4,12.8,8.38,158,1,1,3,1.0,0.0


In [12]:
print(
    "\n" + "=" * 60,
    "STEP 8: DATA QUALITY CHECK",
    "=" * 60,
    sep="\n",
)

print(f"\nDataset shape: {customer_features.shape}")

# Per-column null counts; list only the columns that actually have gaps,
# or the literal "None" when the table is complete.
print("\nMissing values:")
missing = customer_features.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("None")

print("\nData types:")
print(customer_features.dtypes)

# Summary statistics of the numeric columns, rounded for readability.
print("\nBasic statistics:")
display(customer_features.describe().round(2))


STEP 8: DATA QUALITY CHECK

Dataset shape: (993, 32)

Missing values:
None

Data types:
Customer_ID                      int64
Total_Spending                 float64
Avg_Transaction_Amount         float64
Max_Transaction                float64
Min_Transaction                float64
Std_Transaction_Amount         float64
Transaction_Count                int64
Avg_Satisfaction_Score         float64
Std_Satisfaction               float64
Min_Satisfaction               float64
Max_Satisfaction               float64
Avg_Likelihood_to_Recommend    float64
Min_Likelihood                   int64
Max_Likelihood                   int64
Count_Bill Payment               int64
Count_Investment                 int64
Count_Loan Payment               int64
Count_Purchase                   int64
Pct_Bill Payment               float64
Pct_Investment                 float64
Pct_Loan Payment               float64
Pct_Purchase                   float64
Pct_Business_Hours             float64
Pct_Weekend_Tr

Unnamed: 0,Customer_ID,Total_Spending,Avg_Transaction_Amount,Max_Transaction,Min_Transaction,Std_Transaction_Amount,Transaction_Count,Avg_Satisfaction_Score,Std_Satisfaction,Min_Satisfaction,...,Pct_Business_Hours,Pct_Weekend_Transactions,Avg_Hour_of_Day,Std_Hour_of_Day,Tenure_Days,Days_Since_Last_Transaction,High_Risk_Transaction_Count,Engagement_Breadth,Avg_Feedback_Sentiment,Std_Feedback_Sentiment
count,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,...,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0
mean,500.87,12567.47,2497.35,4048.82,1006.78,1295.06,5.04,5.53,0.0,5.53,...,0.33,0.28,11.52,6.25,128.96,39.37,1.25,2.89,2.8,0.0
std,288.63,6423.29,758.13,975.7,933.28,520.05,2.19,2.78,0.0,2.78,...,0.24,0.23,3.63,2.5,51.4,36.4,1.07,0.88,1.27,0.0
min,1.0,286.0,286.0,286.0,10.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,251.0,7777.0,2015.0,3592.0,324.0,1004.56,3.0,3.0,0.0,3.0,...,0.17,0.11,9.25,4.89,98.0,12.0,0.0,2.0,2.0,0.0
50%,502.0,11939.0,2457.88,4347.0,745.0,1319.17,5.0,6.0,0.0,6.0,...,0.33,0.25,11.33,6.56,140.0,29.0,1.0,3.0,3.0,0.0
75%,750.0,16531.0,2930.75,4734.0,1369.0,1634.15,6.0,8.0,0.0,8.0,...,0.5,0.43,13.62,7.94,169.0,56.0,2.0,4.0,4.0,0.0
max,1000.0,37281.0,4993.0,7315.0,4993.0,3335.42,13.0,10.0,0.0,10.0,...,1.0,1.0,23.0,14.14,205.0,198.0,5.0,4.0,5.0,0.0


In [13]:
print("\n" + "=" * 60)
print("STEP 9: EXPORTING ENGINEERED DATASET")
print("=" * 60)

# Write the customer-level table to CSV in the working directory.
# index=False: Customer_ID is already a regular column, so the positional
# row index carries no information.
output_filename = 'Customer_Features_Engineered.csv'
customer_features.to_csv(output_filename, index=False)

print(f"\nEngineered dataset exported to: {output_filename}")
print(f"Total rows: {len(customer_features)}")
# Customer_ID is an identifier, not a feature, hence the -1.
print(f"Total features: {len(customer_features.columns) - 1}")
print("\nFile ready for clustering analysis!")


STEP 9: EXPORTING ENGINEERED DATASET

Enineered dataset exported to: Customer_Features_Engineered.csv
Total rows: 993
Total features: 31

File ready for clustering analysis!


In [14]:
print("\n" + "=" * 60)
print("FEATURE ENGINEERING SUMMARY")
print("=" * 60)

# Transaction-level input that the features were built from.
print("\n📊 ORIGINAL DATA:")
print(f"  - Transaction records: {len(df):,}")
print(f"  - Unique customers: {df['Customer_ID'].nunique()}")
print(f"  - Date range: {df['Transaction_Date'].min()} to {df['Transaction_Date'].max()}")

# Customer-level output; Customer_ID is an identifier, not a feature, hence the -1.
print("\n✨ ENGINEERED DATASET:")
print(f"  - Customer records: {len(customer_features)}")
print(f"  - Total features: {len(customer_features.columns) - 1}")

# (category label, frames that contribute its columns, description).
# Feature counts are taken from the frames themselves so the summary stays
# correct if a step adds or removes a column.
feature_groups = [
    ("Spending", [spending_features],
     "Total, Average, Max, Min, Std, Count of transactions"),
    ("Satisfaction", [satisfaction_features],
     "Satisfaction scores & likelihood to recommend metrics"),
    ("Transaction Type Distribution", [transaction_type_dist],
     "Counts & percentages for Purchase, Bill Payment, Investment, Loan Payment"),
    ("Temporal Behavior", [temporal_features, tenure_features],
     "Business hours %, weekend %, avg hour, tenure, recency"),
    ("Risk & Engagement", [risk_features, engagement_breadth, feedback_features],
     "High-risk transaction count, engagement breadth, feedback sentiment"),
]

print("\n📋 FEATURE CATEGORIES:")
for number, (label, frames, description) in enumerate(feature_groups, start=1):
    feature_count = sum(len(frame.columns) for frame in frames)
    # Every category after the first is separated from the previous one by a blank line.
    leading_blank = "" if number == 1 else "\n"
    print(f"{leading_blank}  {number}. {label} ({feature_count} features):")
    print(f"     - {description}")

print(f"\n💾 OUTPUT FILE: {output_filename}")
print("\n" + "=" * 60)
print("FEATURE ENGINEERING COMPLETE!")
print("=" * 60)


FEATURE ENGINEERING SUMMARY

📊 ORIGINAL DATA:
  - Transaction records: 5,000
  - Unique customers: 993
  - Date range: 2023-01-01 to 2023-07-28

✨ ENGINEERED DATASET:
  - Customer records: 993
  - Total features: 31

📋 FEATURE CATEGORIES:
  1. Spending (6 features):
     - Total, Average, Max, Min, Std, Count of transactions

  2. Satisfaction (7 features):
     - Satisfaction scores & likelihood to recommend metrics

  3. Transaction Type Distribution (8 features):
     - Counts & percentages for Purchase, Bill Payment, Investment, Loan Payment

  4. Temporal Behavior (6 features):
     - Business hours %, weekend %, avg hour, tenure, recency

  5. Risk & Engagement (4 features):
     - High-risk transaction count, engagement breadth, feedback sentiment

💾 OUTPUT FILE: Customer_Features_Engineered.csv

FEATURE ENGINEERING COMPLETE!
