In [1]:
import gc
import os
from datetime import datetime, timedelta
from pathlib import Path

import holidays
import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [3]:
# Folder holding the cleaned CSV inputs. It defaults to the location the
# notebook was originally written against; set the SC4000_DATA_DIR environment
# variable to run on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "SC4000_DATA_DIR",
    r"C:\Users\Aarushi\Desktop\SC4000\SC4000\data\cleaned_data",
))

# Transaction-level inputs (combined in the next cell) and the train/test frames.
hist_trans_df = pd.read_csv(DATA_DIR / "cleaned_historical_transactions.csv")
# merch_df = pd.read_csv(DATA_DIR / "cleaned_merchants.csv")
new_trans_df = pd.read_csv(DATA_DIR / "cleaned_new_merchant_transactions.csv")
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

In [4]:
# Combine transactions
# ignore_index=True gives the stacked frame a fresh, unique RangeIndex. Without
# it each input keeps its own row labels, so labels can repeat in the result and
# any later index-aligned assignment or .loc lookup becomes ambiguous.
all_transactions_df = pd.concat([hist_trans_df, new_trans_df], ignore_index=True)

# The two source frames are no longer needed; drop the names and collect so the
# memory can be reclaimed before the feature steps run.
del hist_trans_df, new_trans_df
gc.collect()

482

In [5]:
# Left-join the combined transactions onto the training rows by card_id, so
# training cards without any transaction are kept.
# NOTE(review): this join happens before any feature column has been added to
# all_transactions_df, and a card with several transactions presumably yields
# one output row per transaction - confirm that this is the intended shape for
# train_df, or move the join after the card-level features are built.
train_df = train_df.merge(all_transactions_df, on='card_id', how='left')

KeyboardInterrupt: 

In [6]:
# Brazilian holiday calendar used for the is_holiday flag. No years are passed
# to the constructor here.
br_holidays = holidays.BR()

# Scaler instance; it is not used by any of the feature cells shown below.
scaler = StandardScaler()

In [7]:
def add_basic_time_features(df, holiday_calendar=None):
    """
    Add basic time-based features derived from the purchase_date column.

    Columns written to ``df`` (the frame is modified in place and returned):
    - purchase_datetime: purchase_date parsed with pd.to_datetime
    - hour, day, day_of_week, month, year
    - is_weekend (day_of_week is 5 or 6), is_holiday
    - is_start_of_month (day <= 5), is_end_of_month (day >= 25)

    Parameters
    ----------
    df : DataFrame with a 'purchase_date' column.
    holiday_calendar : optional container supporting ``date in calendar``.
        When omitted, the notebook-level ``br_holidays`` object is used.

    Returns
    -------
    The same DataFrame object, with the columns above added.

    Rows whose purchase_date is missing get 0 for every flag.
    """
    if holiday_calendar is None:
        holiday_calendar = br_holidays

    purchase_dt = pd.to_datetime(df['purchase_date'])
    df['purchase_datetime'] = purchase_dt
    df['hour'] = purchase_dt.dt.hour
    df['day'] = purchase_dt.dt.day
    df['day_of_week'] = purchase_dt.dt.dayofweek
    df['month'] = purchase_dt.dt.month
    df['year'] = purchase_dt.dt.year
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Test each distinct calendar day against the calendar once instead of
    # calling a Python lambda for every row, then flag rows by membership.
    # Missing dates are dropped before the lookup and therefore end up as 0.
    purchase_day = purchase_dt.dt.normalize()
    holiday_days = [
        day for day in purchase_day.dropna().drop_duplicates()
        if day.date() in holiday_calendar
    ]
    df['is_holiday'] = purchase_day.isin(holiday_days).astype(int)

    # Part of month features
    df['is_start_of_month'] = (df['day'] <= 5).astype(int)
    df['is_end_of_month'] = (df['day'] >= 25).astype(int)

    return df

# Derive the calendar columns on the combined transaction frame
all_transactions_df = all_transactions_df.pipe(add_basic_time_features)

In [8]:
# Add time difference between transactions
def add_time_difference_features(df, card_id_col='card_id'):
    """
    Compute the gap between consecutive purchases of the same card.

    Returns a copy of ``df`` ordered by card and purchase_datetime with two
    extra columns: time_since_last (timedelta to the card's previous purchase)
    and hours_since_last (the same gap in hours). The first purchase of each
    card has no predecessor, so both values are missing there.
    """
    ordered = df.sort_values(by=[card_id_col, 'purchase_datetime'])

    # Row-to-row difference, restarted for every card
    gaps = ordered.groupby(card_id_col)['purchase_datetime'].diff()

    ordered['time_since_last'] = gaps
    # 3600 seconds per hour
    ordered['hours_since_last'] = gaps.dt.total_seconds() / 3600

    return ordered

# Sort by card and time, then attach the gap to each card's previous purchase
all_transactions_df = all_transactions_df.pipe(add_time_difference_features)

In [None]:
# Add spending pattern features directly to each transaction
def add_spending_pattern_features(df, card_id_col='card_id', large_zscore=2, small_zscore=-1):
    """
    Add relative spending indicators to each transaction.

    Parameters
    ----------
    df : DataFrame with ``card_id_col`` and 'purchase_amount' columns.
    card_id_col : name of the card identifier column.
    large_zscore : a transaction is flagged large when its z-score is above this.
    small_zscore : a transaction is flagged small when its z-score is below this.

    Returns
    -------
    A new DataFrame (the input is not modified) with these columns added:
    card_mean_amount, card_std_amount, amount_zscore, is_large_purchase,
    is_small_purchase.
    """
    derived_cols = [
        'card_mean_amount', 'card_std_amount', 'amount_zscore',
        'is_large_purchase', 'is_small_purchase',
    ]
    # Remove results of an earlier run so the merge below cannot produce
    # suffixed duplicates (card_mean_amount_x / _y) when the cell is re-executed.
    df = df.drop(columns=derived_cols, errors='ignore')

    # Calculate card-level statistics
    card_stats = df.groupby(card_id_col)['purchase_amount'].agg(['mean', 'std']).reset_index()
    card_stats.columns = [card_id_col, 'card_mean_amount', 'card_std_amount']

    # Merge back to get card-level stats for each transaction
    df = pd.merge(df, card_stats, on=card_id_col, how='left')

    # The standard deviation is 0 for a card whose amounts are all equal and
    # missing for a card with a single transaction. Use 1 as the divisor in both
    # cases so the z-score is 0 rather than a division by zero or NaN.
    safe_std = df['card_std_amount'].replace(0, 1).fillna(1)
    df['amount_zscore'] = (df['purchase_amount'] - df['card_mean_amount']) / safe_std

    # Flag unusually large / unusually small purchases relative to the card
    df['is_large_purchase'] = (df['amount_zscore'] > large_zscore).astype(int)
    df['is_small_purchase'] = (df['amount_zscore'] < small_zscore).astype(int)

    return df

# Attach per-card spending statistics and the large/small purchase flags
all_transactions_df = all_transactions_df.pipe(add_spending_pattern_features)

In [8]:
# Add seasonal features
def add_seasonal_features(df, card_id_col='card_id'):
    """
    Flag transactions that fall in the summer or winter months.

    Reads the 'month' column and writes is_summer / is_winter (0 or 1) onto
    ``df`` itself, which is also returned. ``card_id_col`` is accepted for
    signature parity with the other feature helpers and is not used.
    """
    # Seasons as defined for Brazil: summer is Dec-Feb, winter is Jun-Aug
    season_months = {
        'is_summer': [12, 1, 2],
        'is_winter': [6, 7, 8],
    }

    for flag_name, months in season_months.items():
        df[flag_name] = df['month'].isin(months).astype(int)

    return df

# Mark summer and winter transactions
all_transactions_df = all_transactions_df.pipe(add_seasonal_features)

In [None]:
# Add merchant diversity features
def add_merchant_diversity_features(df, card_id_col='card_id', merchant_col='merchant_category_id'):
    """
    Add features related to merchant diversity.

    Parameters
    ----------
    df : DataFrame with ``card_id_col`` and ``merchant_col`` columns.
    card_id_col : name of the card identifier column.
    merchant_col : column whose distinct values are counted per card. The
        default keeps the original behaviour of counting merchant categories.

    Returns
    -------
    A new DataFrame (the input is not modified) with these columns added:
    - merchant_visited_before: 0 for the first row of a (card, merchant) pair,
      1 for every later row of that pair. "First" and "later" follow the row
      order of ``df``, so the frame should already be in chronological order
      per card for the flag to mean "visited before".
    - total_unique_merchants: number of distinct ``merchant_col`` values per card.
    - repeat_merchant_ratio: per-card mean of merchant_visited_before.
    """
    derived_cols = ['merchant_visited_before', 'total_unique_merchants', 'repeat_merchant_ratio']
    # Remove results of an earlier run so the merge below cannot produce
    # suffixed duplicates when the cell is re-executed.
    df = df.drop(columns=derived_cols, errors='ignore')

    df['merchant_visited_before'] = df.duplicated([card_id_col, merchant_col]).astype(int)

    # Both card-level metrics come from one groupby and are joined back in a
    # single merge, rather than merging the full frame twice.
    card_stats = df.groupby(card_id_col).agg(
        total_unique_merchants=(merchant_col, 'nunique'),
        repeat_merchant_ratio=('merchant_visited_before', 'mean'),
    ).reset_index()

    return pd.merge(df, card_stats, on=card_id_col, how='left')

# Attach the repeat-merchant flag and the per-card diversity metrics
all_transactions_df = all_transactions_df.pipe(add_merchant_diversity_features)

In [None]:
def add_recency_features(df, card_id_col='card_id'):
    """
    Describe how close each transaction is to its card's most recent one.

    Returns a new DataFrame with these columns added:
    - days_since_last_transaction: whole days between this purchase and the
      latest purchase of the same card
    - is_recent_transaction: 1 when that distance is at most 30 days
    - transaction_rank: percentile rank of purchase_datetime within the card
    - is_last_quarter_transaction: 1 when transaction_rank is at least 0.75
    """
    # Latest purchase timestamp per card, as a two-column lookup table
    latest_per_card = (
        df.groupby(card_id_col)['purchase_datetime']
        .max()
        .rename('last_purchase_date')
        .reset_index()
    )
    merged = df.merge(latest_per_card, on=card_id_col, how='left')

    distance_to_latest = merged['last_purchase_date'] - merged['purchase_datetime']
    merged['days_since_last_transaction'] = distance_to_latest.dt.days
    merged['is_recent_transaction'] = (merged['days_since_last_transaction'] <= 30).astype(int)

    pct_rank = merged.groupby(card_id_col)['purchase_datetime'].rank(pct=True)
    merged['transaction_rank'] = pct_rank
    merged['is_last_quarter_transaction'] = (pct_rank >= 0.75).astype(int)

    # The helper column was only needed for the subtraction above
    return merged.drop(columns=['last_purchase_date'])

# Attach the recency columns relative to each card's latest purchase
all_transactions_df = all_transactions_df.pipe(add_recency_features)

In [None]:
# Add installment features
def add_installment_features(df):
    """
    Add features related to installment payments.

    When ``df`` has an 'installments' column, two columns are written onto
    ``df`` itself:
    - is_installment: 1 when installments is greater than 1
    - installment_amount: purchase_amount divided by installments; rows whose
      installment count is missing or not positive keep the full
      purchase_amount instead

    When the column is absent the frame is returned unchanged.

    Returns
    -------
    The same DataFrame object that was passed in.
    """
    if 'installments' not in df.columns:
        return df

    installments = df['installments']
    df['is_installment'] = (installments > 1).astype(int)

    # Divide only by positive counts. A count of 0 would produce inf (which
    # fillna does not catch) and a negative count would flip the sign of the
    # amount; both are masked to NaN here and fall back to the full amount.
    per_installment = df['purchase_amount'] / installments.where(installments > 0)
    df['installment_amount'] = per_installment.fillna(df['purchase_amount'])

    return df

# Attach the installment flag and the per-installment amount
all_transactions_df = all_transactions_df.pipe(add_installment_features)

In [None]:
# Add ratio/proportion features
def add_ratio_features(df, card_id_col='card_id'):
    """
    Add per-card proportions of several 0/1 transaction flags.

    Each output column is the per-card mean of one flag column:
    - weekend_trans_ratio      <- is_weekend
    - holiday_trans_ratio      <- is_holiday
    - installment_trans_ratio  <- is_installment
    - large_purchase_ratio     <- is_large_purchase
    - repeat_merchant_ratio    <- merchant_visited_before

    Parameters
    ----------
    df : DataFrame containing ``card_id_col`` and the five flag columns.
    card_id_col : name of the card identifier column.

    Returns
    -------
    A new DataFrame with the five ratio columns added. A ratio column that is
    already present (repeat_merchant_ratio is also produced by
    add_merchant_diversity_features) is replaced rather than duplicated.

    Raises
    ------
    KeyError : when one of the flag columns is missing.
    """
    # Output name -> flag column. Binding the names explicitly avoids relying
    # on the column order of the aggregation result.
    ratio_sources = {
        'weekend_trans_ratio': 'is_weekend',
        'holiday_trans_ratio': 'is_holiday',
        'installment_trans_ratio': 'is_installment',
        'large_purchase_ratio': 'is_large_purchase',
        'repeat_merchant_ratio': 'merchant_visited_before',
    }

    missing = [col for col in ratio_sources.values() if col not in df.columns]
    if missing:
        raise KeyError(
            f"add_ratio_features needs columns {missing}; "
            "run the feature steps that create them first"
        )

    # Drop ratio columns that already exist; otherwise the merge would emit
    # suffixed pairs such as repeat_merchant_ratio_x / repeat_merchant_ratio_y.
    df = df.drop(columns=list(ratio_sources), errors='ignore')

    card_proportions = df.groupby(card_id_col).agg(
        **{ratio: (flag, 'mean') for ratio, flag in ratio_sources.items()}
    ).reset_index()

    # Merge proportions back to main dataframe
    return pd.merge(df, card_proportions, on=card_id_col, how='left')

# Attach the per-card flag proportions
all_transactions_df = all_transactions_df.pipe(add_ratio_features)

In [None]:
# Add aggregated features
def add_aggregated_features(df, card_id_col='card_id'):
    """
    Add card-level aggregated features to each transaction.

    Aggregates computed per card (column names are '<source>_<statistic>'):
    - purchase_amount: count, sum, mean, std, min, max
    - month: nunique
    - merchant_category_id: nunique
    - hours_since_last: mean, std
    - installments: mean, max
    plus two derived columns:
    - purchase_amount_range = purchase_amount_max - purchase_amount_min
    - purchase_amount_cv = purchase_amount_std / purchase_amount_mean
      (missing when the mean is 0)

    Parameters
    ----------
    df : DataFrame containing ``card_id_col`` and the source columns above.
    card_id_col : name of the card identifier column.

    Returns
    -------
    A new DataFrame with the aggregate columns added; aggregate columns left
    over from an earlier run are replaced.
    """
    # Aggregations by card_id
    aggs = {
        'purchase_amount': ['count', 'sum', 'mean', 'std', 'min', 'max'],
        'month': 'nunique',  # Number of distinct months with transactions
        'merchant_category_id': 'nunique',  # Number of distinct merchant categories
        'hours_since_last': ['mean', 'std'],  # Transaction frequency stats
        'installments': ['mean', 'max']  # Installment behavior
    }

    card_aggs = df.groupby(card_id_col).agg(aggs)

    # Flatten the two-level (source, statistic) header into single names
    card_aggs.columns = ['_'.join(col).strip() for col in card_aggs.columns.values]
    card_aggs = card_aggs.reset_index()

    card_aggs['purchase_amount_range'] = card_aggs['purchase_amount_max'] - card_aggs['purchase_amount_min']
    # A mean of exactly 0 would turn the ratio into +/-inf; mask it so the
    # coefficient of variation is reported as missing instead.
    nonzero_mean = card_aggs['purchase_amount_mean'].replace(0, np.nan)
    card_aggs['purchase_amount_cv'] = card_aggs['purchase_amount_std'] / nonzero_mean

    # Drop aggregate columns already present so re-running the cell does not
    # create suffixed duplicates (purchase_amount_sum_x / _y).
    stale = [col for col in card_aggs.columns if col != card_id_col and col in df.columns]
    df = df.drop(columns=stale)

    # Merge aggregations back to main dataframe
    return pd.merge(df, card_aggs, on=card_id_col, how='left')

# Attach the per-card aggregate statistics
all_transactions_df = all_transactions_df.pipe(add_aggregated_features)

In [None]:
# Final processing
# Remove the raw timedelta column before export; hours_since_last holds the
# same gap expressed in hours. errors='ignore' keeps the cell runnable when the
# column has already been removed.
all_transactions_df.drop(columns=['time_since_last'], inplace=True, errors='ignore')

# No scaling happens here. Whether to scale depends on the modeling approach,
# and it may be preferable to scale only when the final training dataset is
# built rather than scaling this whole frame.

# Write the transaction-level frame with all engineered columns to disk
all_transactions_df.to_csv('all_transactions_with_features.csv', index=False)

# Short run summary: frame shape, column count and the first row
print("Feature engineering completed successfully.")
print(f"Shape of final dataframe: {all_transactions_df.shape}")
print(f"Total features: {all_transactions_df.shape[1]}")
print(f"Sample of new features:\n{all_transactions_df.iloc[0]}")