In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

class EduAutoFE:
    
    def __init__(self, max_results=10):
        self.max_results = max_results
        self.results = []
    
    def _validate_input(self, X, y):
        # Basic checks
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        
        if not isinstance(y, pd.Series):
            raise TypeError("y must be a pandas Series")
        
        # Check for non-numeric columns
        cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
        if cat_cols:
            raise ValueError(f"Found non-numeric columns: {cat_cols}. Encode them first.")
        
        # Check for missing values
        if X.isnull().any().any():
            raise ValueError("X contains missing values. Fill them first.")
        
        return X, y
        
    def _detect_task_type(self, y):
        n_unique = y.nunique()

        # If there are only two different values, it is a Binary Classification problem
        if n_unique == 2:
            print(f"Detected: Binary Classification")
            return "classification"
        else:
            print(f"Detected: Regression")
            return "regression"
    
    def _evaluate_feature(self, X_base, new_feature, y):
        scaler = StandardScaler()
        
        if self.task_type == "regression":
            model = LinearRegression()
            scoring = "r2"
        else:
            model = LogisticRegression(max_iter=1000, random_state=42)
            scoring = "accuracy"
        
        X_with_new = pd.concat([X_base, new_feature], axis=1)
        X_scaled = scaler.fit_transform(X_with_new)
        scores = cross_val_score(model, X_scaled, y, cv=5, scoring=scoring)
        
        return scores.mean(), scores.std()
    
    def _generate_candidates(self, X):
        candidates = []
        
        # Single column transformations
        transforms = {
            "log": lambda x: np.log(x + 1),
            "sqrt": lambda x: np.sqrt(np.abs(x)),
            "square": lambda x: x ** 2,
            "cube": lambda x: x ** 3,
            "inverse": lambda x: 1 / (x + 1e-5),
        }

        # Generate single column transformations
        for col in X.columns:
            for name, func in transforms.items():
                try:
                    transformed = func(X[col])
                    
                    if pd.isna(transformed).any() or np.isinf(transformed).any():
                        continue
                    
                    feature_name = f"{name}({col})"
                    feature_series = pd.Series(transformed, index=X.index, name=feature_name)
                    
                    candidates.append((feature_name, feature_series))
                except:
                    continue
        
        # Pairwise operations
        operations = {
            "multiply": lambda a, b: a * b,
            "divide": lambda a, b: a / (b + 1e-5),
        }
        
        continuous_cols = [col for col in X.columns if X[col].nunique() > 2]

        # Generate pairwise transformations
        for i, col1 in enumerate(continuous_cols):
            for col2 in continuous_cols[i+1:]:
                for op_name, func in operations.items():
                    try:
                        transformed = func(X[col1], X[col2])
                        
                        if pd.isna(transformed).any() or np.isinf(transformed).any():
                            continue
                        
                        feature_name = f"{col1} {op_name} {col2}"
                        feature_series = pd.Series(transformed, index=X.index, name=feature_name)
                        
                        candidates.append((feature_name, feature_series))
                    except:
                        continue
        
        return candidates
    
    def fit(self, X, y):
        # Validate input
        X, y = self._validate_input(X, y)

        # Set task type
        self.task_type = self._detect_task_type(y)


        # Calculate baseline
        print("Calculating baseline...")
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        if self.task_type == "regression":
            model = LinearRegression()
            scores = cross_val_score(model, X_scaled, y, cv=5, scoring="r2")
            metric = "R2"
        else:
            model = LogisticRegression(max_iter=1000, random_state=42)
            scores = cross_val_score(model, X_scaled, y, cv=5, scoring="accuracy")
            metric = "Accuracy"
        
        baseline_score = scores.mean()
        baseline_std = scores.std()
        
        print(f"Baseline {metric}: {baseline_score:.4f} +/- {baseline_std:.4f}")
        print("Generating candidates...")

        
        # Generate and test candidates
        candidates = self._generate_candidates(X)
        
        print(f"Testing {len(candidates)} candidates...")
        
        for feature_name, feature_series in candidates:
            try:
                new_feature_df = pd.DataFrame({feature_name: feature_series})
                score, std = self._evaluate_feature(X, new_feature_df, y)
                improvement = score - baseline_score
                
                if improvement > 0:
                    self.results.append({
                        "feature": feature_name,
                        "score": score,
                        "std": std,
                        "improvement": improvement
                    })
            except:
                continue

        # Check if there were no features that improved the performance
        if len(self.results) == 0:
            print("No improvements found.")
            return pd.DataFrame()
        
        self.results.sort(key=lambda x: x["improvement"], reverse=True)
        results_df = pd.DataFrame(self.results[:self.max_results])

        # Print the results
        print(f"\nTop {len(results_df)} improvements:")
        for i, row in results_df.iterrows():
            print(f"  {i+1}. {row["feature"]}")
            print(f"     {metric}: {row["score"]:.4f} +/- {row["std"]:.4f} (+{row["improvement"]:.4f})")
        
        return results_df

In [5]:
# Load data
df = pd.read_csv("train.csv")
X = df.drop(columns=["Survived"])
y = df["Survived"]

# Preprocessing
X = X.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"])
X = pd.get_dummies(X, drop_first=True, dtype=int)
X = X.fillna(X.median())

# Test
autofe = EduAutoFE(max_results=10)
results = autofe.fit(X, y)

Detected: Binary Classification
Calculating baseline...
Baseline Accuracy: 0.7857 +/- 0.0184
Generating candidates...
Testing 60 candidates...

Top 10 improvements:
  1. log(Age)
     Accuracy: 0.8081 +/- 0.0237 (+0.0224)
  2. sqrt(Age)
     Accuracy: 0.8036 +/- 0.0210 (+0.0179)
  3. inverse(Age)
     Accuracy: 0.8002 +/- 0.0246 (+0.0146)
  4. Pclass divide Age
     Accuracy: 0.7980 +/- 0.0253 (+0.0123)
  5. square(Age)
     Accuracy: 0.7969 +/- 0.0211 (+0.0112)
  6. SibSp divide Parch
     Accuracy: 0.7957 +/- 0.0180 (+0.0101)
  7. Pclass multiply SibSp
     Accuracy: 0.7946 +/- 0.0218 (+0.0090)
  8. inverse(SibSp)
     Accuracy: 0.7935 +/- 0.0171 (+0.0078)
  9. cube(Age)
     Accuracy: 0.7924 +/- 0.0205 (+0.0067)
  10. Pclass multiply Parch
     Accuracy: 0.7924 +/- 0.0199 (+0.0067)
