In [2]:
import sys
# Make the sibling '../script' directory importable. The path is relative to
# the notebook's working directory; presumably this is where the local
# target_engineering and feature_engineering modules live (verify if the
# notebook is moved).
sys.path.append('../script')

from target_engineering import generate_proxy_target
from feature_engineering import build_feature_pipeline, CustomerAggregator, TimeFeaturesExtractor
import pandas as pd

In [3]:


# Build the modelling dataset: run the feature pipeline on the raw data,
# attach the proxy risk label per CustomerId, and save the merged result.

# Load original data
df = pd.read_csv("../data/data.csv")

# Step 1: Generate the processed features
pipeline = build_feature_pipeline()
X_array = pipeline.fit_transform(df)

# Step 2: Convert to DataFrame
# Densify only when the pipeline actually returned a sparse matrix; a dense
# ndarray has no .toarray() and would otherwise raise AttributeError.
if hasattr(X_array, "toarray"):
    X_array = X_array.toarray()
X_df = pd.DataFrame(X_array)

# Step 3: Generate proxy target
target_df = generate_proxy_target(df)

# Step 4: Merge using CustomerId
# CustomerId is not in X_df, so it is attached positionally from the raw frame.
# That is only meaningful when the pipeline emits exactly one output row per
# input row; fail loudly instead of letting index alignment NaN-fill/truncate.
# NOTE(review): this also assumes the pipeline preserves row order — confirm.
if len(X_df) != len(df):
    raise ValueError(
        f"Feature pipeline returned {len(X_df)} rows for {len(df)} input rows; "
        "cannot attach CustomerId by position."
    )
customer_ids = df["CustomerId"].reset_index(drop=True)
X_df["CustomerId"] = customer_ids

# Merge processed features with target.
# validate="many_to_one" raises MergeError if target_df ever contains duplicate
# CustomerId values, which would otherwise silently multiply rows.
# NOTE(review): any target_df columns other than CustomerId / is_high_risk
# would end up in final_X below — confirm generate_proxy_target returns only
# those two, otherwise label-derived columns leak into the features.
merged_df = pd.merge(
    X_df,
    target_df,
    on="CustomerId",
    how="inner",
    validate="many_to_one",
)

# Step 5: Drop CustomerId (optional, for training)
final_X = merged_df.drop(columns=["CustomerId", "is_high_risk"])
final_y = merged_df["is_high_risk"]

# Step 6: Save for modeling (the saved file keeps CustomerId and the label)
merged_df.to_csv("../data/final_model_dataset.csv", index=False)

print("✅ Merged features + target shape:", final_X.shape)
print("🔖 Target value counts:\n", final_y.value_counts())


✅ Merged features + target shape: (95662, 49)
🔖 Target value counts:
 is_high_risk
0    84636
1    11026
Name: count, dtype: int64
