In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

class AutoFE:
    """Search for single-column transformations that improve a baseline model.

    ``fit`` cross-validates a standardised logistic regression on the input
    frame to obtain a baseline accuracy, then re-scores the same model once
    per candidate feature (``log``, ``sqrt`` and ``square`` of each numeric
    column) appended to the original columns. Candidates whose mean accuracy
    beats the baseline are stored in ``self.results``, best first.
    """

    def __init__(self, max_results=10, cv=5):
        """Configure the search.

        Args:
            max_results: Maximum number of improving features that ``fit``
                prints and returns.
            cv: Value forwarded as ``cv`` to ``cross_val_score``
                (5 folds by default).
        """
        self.max_results = max_results
        self.cv = cv
        self.results = []

    def _cv_scores(self, X, y):
        """Return the per-fold accuracy of the scaled logistic regression."""
        # Imported here because the file's top-level imports do not include
        # sklearn.pipeline.
        from sklearn.pipeline import make_pipeline

        # The scaler is part of the pipeline so that it is re-fitted on the
        # training rows of every fold; fitting it once on the full data would
        # leak validation-fold statistics into training.
        model = make_pipeline(
            StandardScaler(),
            LogisticRegression(max_iter=1000, random_state=42),
        )
        return cross_val_score(model, X, y, cv=self.cv, scoring='accuracy')

    def _evaluate_feature(self, X_base, new_feature, y):
        """Score the model on ``X_base`` with ``new_feature`` appended.

        Args:
            X_base: DataFrame with the original features.
            new_feature: DataFrame (or Series) holding the candidate column,
                aligned on the index of ``X_base``.
            y: Target values.

        Returns:
            Tuple ``(mean, std)`` of the cross-validated accuracy.
        """
        X_with_new = pd.concat([X_base, new_feature], axis=1)
        scores = self._cv_scores(X_with_new, y)
        return scores.mean(), scores.std()

    def _generate_candidates(self, X):
        """Build the transformed columns to try.

        Every numeric column of ``X`` is passed through each transformation;
        results containing NaN or infinite values are discarded.

        Args:
            X: DataFrame with the original features.

        Returns:
            List of ``(feature_name, feature_series)`` tuples, where the name
            has the form ``"<transform>(<column>)"`` and the series shares the
            index of ``X``.
        """
        # Single column transformations only for now
        transforms = {
            'log': lambda x: np.log(x + 1),
            'sqrt': lambda x: np.sqrt(np.abs(x)),
            'square': lambda x: x ** 2,
        }

        candidates = []
        for col in X.columns:
            column = X[col]
            # Non-numeric columns cannot be transformed; skip them up front
            # instead of relying on an exception being raised and swallowed.
            if not pd.api.types.is_numeric_dtype(column):
                continue

            for name, func in transforms.items():
                try:
                    # Invalid inputs (e.g. log of a negative number) are
                    # expected here and filtered out below, so the NumPy
                    # floating-point warnings are silenced locally.
                    with np.errstate(divide='ignore', invalid='ignore'):
                        transformed = func(column)

                    if pd.isna(transformed).any() or np.isinf(transformed).any():
                        continue
                except (TypeError, ValueError, ArithmeticError):
                    # Best effort: a transform that does not apply to this
                    # column is simply not offered as a candidate.
                    continue

                feature_name = f'{name}({col})'
                feature_series = pd.Series(
                    transformed, index=X.index, name=feature_name
                )
                candidates.append((feature_name, feature_series))

        return candidates

    def fit(self, X, y):
        """Run the search and report the best transformations.

        Prints the baseline accuracy and the top improvements, and stores all
        improving candidates in ``self.results`` (replacing the results of
        any previous call).

        Args:
            X: DataFrame of numeric features without missing values.
            y: Target values for classification.

        Returns:
            DataFrame with columns ``feature``, ``score`` and ``improvement``
            for at most ``max_results`` candidates, best first. The frame is
            empty (no columns) when nothing beats the baseline.
        """
        print("Calculating baseline...")

        # Start from a clean slate so repeated fit() calls do not accumulate
        # duplicate entries.
        self.results = []

        baseline_score = self._cv_scores(X, y).mean()

        print(f"Baseline Accuracy: {baseline_score:.4f}")
        print("Testing transformations...")

        for feature_name, feature_series in self._generate_candidates(X):
            try:
                score, _ = self._evaluate_feature(
                    X, feature_series.to_frame(), y
                )
            except Exception as err:
                # Keep the search going, but say which candidate was dropped
                # and why; KeyboardInterrupt/SystemExit still propagate.
                print(f"  Skipping {feature_name}: {err}")
                continue

            improvement = score - baseline_score
            # A NaN score (failed folds) compares False and is dropped too.
            if improvement > 0:
                self.results.append({
                    'feature': feature_name,
                    'score': score,
                    'improvement': improvement
                })

        self.results.sort(key=lambda x: x['improvement'], reverse=True)
        top = self.results[:self.max_results]
        results_df = pd.DataFrame(top)

        if top:
            print(f"\nTop {len(top)} improvements:")
            for row in top:
                print(f"  {row['feature']}: {row['score']:.4f} (+{row['improvement']:.4f})")
        else:
            print("\nNo transformation improved on the baseline.")

        return results_df

In [None]:
# Read the Titanic training set; the target column is "Survived"
df = pd.read_csv("/kaggle/input/titanic/train.csv")
y = df["Survived"]

# Remove the target together with the identifier / free-text columns,
# then one-hot encode whatever categorical columns are left
unused_columns = ["Survived", "PassengerId", "Name", "Cabin", "Ticket"]
encoded = pd.get_dummies(
    df.drop(columns=unused_columns),
    drop_first=True,
    dtype=int,
)

# Fill remaining gaps with each column's median
X = encoded.fillna(encoded.median())

# Run the transformation search and keep the five best candidates
autofe = AutoFE(max_results=5)
results = autofe.fit(X, y)