In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta, date

# Function to generate synthetic data
def generate_data(num_days, num_deals, num_portfolios):
    start_date = date.today()
    date_range = [start_date + timedelta(days=x) for x in range(num_days)]

    # Generate data
    data = []
    for current_date in date_range:
        for deal_id in range(1, num_deals + 1):
            for portfolio in range(1, num_portfolios + 1):
                row = {
                    'date': current_date,
                    'deal_id': f'deal_{deal_id}',
                    'Portfolio': f'Portfolio_{portfolio}',
                    'feature_1': np.random.randn(),
                    'feature_2': np.random.randn(),
                    'feature_3': np.random.randn(),
                    'binary_feature_1': np.random.choice([0, 1], p=[0.7, 0.3]),
                    'binary_feature_2': np.random.choice([0, 1], p=[0.7, 0.3])
                }
                data.append(row)

    return pd.DataFrame(data)

# Define which columns are binary and which are floating point
binary_features = ['binary_feature_1', 'binary_feature_2']
float_features = ['feature_1', 'feature_2', 'feature_3']

# Function to aggregate data at the portfolio level by date
def aggregate_data(df, binary_features, float_features):
    agg_dict = {feature: ['min', 'max', 'mean', 'var', lambda x: x.quantile(0.25), lambda x: x.quantile(0.75), 'sum'] for feature in float_features}
    agg_dict.update({feature: ['sum'] for feature in binary_features})  # Summing binary features counts the number of 1s

    # Rename lambda functions for clarity
    for feature in float_features:
        agg_dict[feature][4].__name__ = '25%'
        agg_dict[feature][5].__name__ = '75%'

    # Group by date and Portfolio and apply aggregation
    aggregated_df = df.groupby(['date', 'Portfolio']).agg(agg_dict)
    aggregated_df.columns = ['_'.join(col).strip() for col in aggregated_df.columns.values]  # Flatten MultiIndex columns

    return aggregated_df

# Generate synthetic dataset
df = generate_data(10, 5, 3)  # 10 days, 5 deals per day, 3 portfolios
print(df.head())
# Aggregate the data
aggregated_df = aggregate_data(df, binary_features, float_features)
print('___________________________________________________________________________')
# Display the aggregated DataFrame
print(aggregated_df.head())


         date deal_id    Portfolio  feature_1  feature_2  feature_3  \
0  2024-08-28  deal_1  Portfolio_1  -0.459183  -0.106103  -0.468682   
1  2024-08-28  deal_1  Portfolio_2   0.778182  -1.778243  -0.392431   
2  2024-08-28  deal_1  Portfolio_3   1.374551   0.215027   0.080313   
3  2024-08-28  deal_2  Portfolio_1  -0.147279   1.201052   1.075295   
4  2024-08-28  deal_2  Portfolio_2  -0.804429  -0.031754   0.993130   

   binary_feature_1  binary_feature_2  
0                 0                 0  
1                 0                 0  
2                 0                 0  
3                 1                 1  
4                 0                 0  
___________________________________________________________________________
                        feature_1_min  feature_1_max  feature_1_mean  \
date       Portfolio                                                   
2024-08-28 Portfolio_1      -0.753279       0.896198       -0.085225   
           Portfolio_2      -0.804429    

In [3]:
import pandas as pd
import numpy as np

def aggregate_features(df):
    # Define the columns for which you want to calculate specific aggregations
    numerical_cols = ['feature1', 'feature2', 'feature3']  # replace with your actual feature columns
    binary_cols = ['binary_feature1', 'binary_feature2']  # replace with your actual binary feature columns
    
    # Define aggregation functions for numerical columns
    aggregations = {
        'min': 'min',
        'max': 'max',
        'mean': 'mean',
        'var': 'var',
        'q25': lambda x: np.percentile(x, 25),
        'q75': lambda x: np.percentile(x, 75),
        'iqr': lambda x: np.percentile(x, 75) - np.percentile(x, 25),
        'sum': 'sum'
    }

    # Prepare a dict to hold column-specific aggregations
    aggregations_dict = {}
    for col in numerical_cols:
        aggregations_dict[col] = aggregations
    
    for col in binary_cols:
        aggregations_dict[col] = {'sum': 'sum', 'mean': 'mean'}

    # Apply aggregations
    aggregated = df.groupby(['date', 'Portfolio']).agg(aggregations_dict)
    
    # Flatten the MultiIndex columns
    aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values]

    return aggregated

# Example usage
# Assume X_train and X_test are already loaded DataFrames
aggregated_train = aggregate_features(X_train)
aggregated_test = aggregate_features(X_test)

# Optionally, save or display the aggregated data
print(aggregated_train.head())
print(aggregated_test.head())


SpecificationError: nested renamer is not supported