In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

class EduAutoFE:
    
    def __init__(self, max_results=10):
        self.max_results = max_results
        self.results = []
    
    def _validate_input(self, X, y):
        
        # X needs to be a dataframe
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "Error: X must be a pandas DataFrame\n"
                "Fix: X = pd.DataFrame(X)"
            )
        
        # Y needs to be a series
        if not isinstance(y, pd.Series):
            raise TypeError(
                "Error: y must be a pandas Series\n"
                "Fix: y = df['target_column']"
            )
        
        # Check for too many columns
        if len(X.columns) > 50:
            raise ValueError(
                f"Error: Too many columns ({len(X.columns)})\n"
                f"This usually means you encoded high-cardinality columns.\n"
                f"Drop them before encoding."
            )
        
        # Check for ID columns
        id_patterns = ["id", "index", "key"]
        for col in X.columns:
            col_lower = col.lower()
            if any(pattern in col_lower for pattern in id_patterns):
                if X[col].dtype in [np.int64, np.float64]:
                    uniqueness = X[col].nunique() / len(X)
                    if uniqueness > 0.95:
                        raise ValueError(f"Error: {col} looks like an ID column. Drop it.")
        
        # Check for text columns
        cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
        if cat_cols:
            raise ValueError(
                f"Error: Found text columns: {cat_cols}\n"
                f"Fix: X = pd.get_dummies(X, drop_first=True, dtype=int)"
            )
        
        # Check for missing values
        if X.isnull().any().any():
            cols_with_missing = X.columns[X.isnull().any()].tolist()
            raise ValueError(
                f"Error: Missing values in: {cols_with_missing}\n"
                f"Fix: X = X.fillna(X.median())"
            )

        # Check for missing values
        if y.isnull().any():
            raise ValueError(
                f"Error: Missing values in y\n"
                f"Fix: y = y.fillna(y.median())"
            )
        
        return X, y
        
    def _detect_task_type(self, y):
        n_unique = y.nunique()

        # If there are only two different values, it is a Binary Classification problem
        if n_unique == 2:
            print(f"Detected: Binary Classification ({n_unique} classes)")
            print(f"   Model: Logistic Regression\n")
            return "classification"
        else:
            print(f"Detected: Regression ({n_unique} unique values)")
            print(f"   Model: Linear Regression\n")
            return "regression"
    
    def _evaluate_feature(self, X_base, new_feature, y):
        scaler = StandardScaler()
        
        if self.task_type == "regression":
            model = LinearRegression()
            scoring = "r2"
        else:
            model = LogisticRegression(max_iter=1000, random_state=42)
            scoring = "accuracy"
        
        X_with_new = pd.concat([X_base, new_feature], axis=1)
        X_scaled = scaler.fit_transform(X_with_new)
        scores = cross_val_score(model, X_scaled, y, cv=5, scoring=scoring)
        
        return scores.mean(), scores.std()
    
    def _generate_candidates(self, X):
        candidates = []
        
        # Single column transformations with descriptions
        transforms = {
            "log": (lambda x: np.log(x + 1), "Log of {col} - reduces skewness"),
            "sqrt": (lambda x: np.sqrt(np.abs(x)), "Square root of {col} - mild skew reduction"),
            "square": (lambda x: x ** 2, "Square of {col} - captures quadratic relationships"),
            "cube": (lambda x: x ** 3, "Cube of {col} - strong nonlinear relationships"),
            "inverse": (lambda x: 1 / (x + 1e-5), "Inverse of {col} - inverse relationships"),
        }

        # Generate single column transformations
        for col in X.columns:
            for name, (func, desc_template) in transforms.items():
                try:
                    transformed = func(X[col])
                    
                    if pd.isna(transformed).any() or np.isinf(transformed).any():
                        continue
                    
                    feature_name = f"{name}({col})"
                    feature_series = pd.Series(transformed, index=X.index, name=feature_name)
                    description = desc_template.format(col=col)
                    
                    candidates.append((feature_name, feature_series, description))
                except:
                    continue
        
        # Pairwise operations with descriptions
        operations = {
            "multiply": (lambda a, b: a * b, "Product of {col1} and {col2} - interaction effect"),
            "divide": (lambda a, b: a / (b + 1e-5), "Ratio of {col1} to {col2}"),
        }
        
        continuous_cols = [col for col in X.columns if X[col].nunique() > 2]

        # Generate pairwise transformations
        for i, col1 in enumerate(continuous_cols):
            for col2 in continuous_cols[i+1:]:
                for op_name, (func, desc_template) in operations.items():
                    try:
                        transformed = func(X[col1], X[col2])
                        
                        if pd.isna(transformed).any() or np.isinf(transformed).any():
                            continue
                        
                        feature_name = f"{col1} {op_name} {col2}"
                        feature_series = pd.Series(transformed, index=X.index, name=feature_name)
                        description = desc_template.format(col1=col1, col2=col2)
                        
                        candidates.append((feature_name, feature_series, description))
                    except:
                        continue
        
        return candidates
    
    def _print_results(self, df, baseline_score, baseline_std, metric):
        print("\n" + "="*60)
        print(f"{"RESULTS":^60}")
        print("="*60)
        print(f"\nBaseline {metric}: {baseline_score:.4f} +/- {baseline_std:.4f}")
        
        if len(df) > 0:
            best = df.iloc[0]["improvement"]
            print(f"Best improvement: +{best:.4f}")
        
        print(f"\nTop {len(df)} Transformations:")
        print("-"*60)
        
        for i, row in df.iterrows():
            print(f"\n{i+1}. {row["feature"]}")
            print(f"   {metric}: {row["score"]:.4f} +/- {row["std"]:.4f} (+{row["improvement"]:.4f})")
            print(f"   {row["description"]}")
        
        print("\n" + "="*60)

    def fit(self, X, y):
        # Validate input
        X, y = self._validate_input(X, y)

        # Set task type
        self.task_type = self._detect_task_type(y)

        # Calculate baseline
        print("Calculating baseline performance...")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Detect task type and choose model and metric accordingly
        if self.task_type == "regression":
            model = LinearRegression()
            scores = cross_val_score(model, X_scaled, y, cv=5, scoring="r2")
            metric = "R2"
        else:
            model = LogisticRegression(max_iter=1000, random_state=42)
            scores = cross_val_score(model, X_scaled, y, cv=5, scoring="accuracy")
            metric = "Accuracy"
        
        baseline_score = scores.mean()
        baseline_std = scores.std()

        # Print the baseline score
        print(f"Baseline {metric}: {baseline_score:.4f} +/- {baseline_std:.4f}")
        print("Generating candidates...")

        # Generate and test candidates
        candidates = self._generate_candidates(X)
        
        print(f"Testing {len(candidates)} candidates...")
        
        for feature_name, feature_series, description in candidates:
            try:
                new_feature_df = pd.DataFrame({feature_name: feature_series})
                score, std = self._evaluate_feature(X, new_feature_df, y)
                improvement = score - baseline_score
                
                if improvement > 0:
                    self.results.append({
                        "feature": feature_name,
                        "score": score,
                        "std": std,
                        "improvement": improvement,
                        "description": description
                    })
            except:
                continue

        # Check if there were no features that improved the performance
        if len(self.results) == 0:
            print("No improvements found over baseline.")
            return pd.DataFrame()

        # Sort results by improvement
        self.results.sort(key=lambda x: x["improvement"], reverse=True)
        results_df = pd.DataFrame(self.results[:self.max_results])

        # Print the results
        self._print_results(results_df, baseline_score, baseline_std, metric)
        
        return results_df

In [7]:
# Load data
df = pd.read_csv("train.csv")
X = df.drop(columns=["Survived"])
y = df["Survived"]

X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
# Preprocessing
X = X.drop(columns=["PassengerId"])
X = X.drop(columns=["Name"])
X = X.drop(columns=["Cabin"])
X = X.drop(columns=["Ticket"])

X = pd.get_dummies(X, drop_first=True, dtype=int)
X = X.fillna(X.median())
y = y.fillna(y.median())

# Run
model = EduAutoFE(max_results=10)
results = model.fit(X, y)

Detected: Binary Classification (2 classes)
   Model: Logistic Regression

Calculating baseline performance...
Baseline Accuracy: 0.7857 +/- 0.0184
Generating candidates...
Testing 60 candidates...

                          RESULTS                           

Baseline Accuracy: 0.7857 +/- 0.0184
Best improvement: +0.0224

Top 10 Transformations:
------------------------------------------------------------

1. log(Age)
   Accuracy: 0.8081 +/- 0.0237 (+0.0224)
   Log of Age - reduces skewness

2. sqrt(Age)
   Accuracy: 0.8036 +/- 0.0210 (+0.0179)
   Square root of Age - mild skew reduction

3. inverse(Age)
   Accuracy: 0.8002 +/- 0.0246 (+0.0146)
   Inverse of Age - inverse relationships

4. Pclass divide Age
   Accuracy: 0.7980 +/- 0.0253 (+0.0123)
   Ratio of Pclass to Age

5. square(Age)
   Accuracy: 0.7969 +/- 0.0211 (+0.0112)
   Square of Age - captures quadratic relationships

6. SibSp divide Parch
   Accuracy: 0.7957 +/- 0.0180 (+0.0101)
   Ratio of SibSp to Parch

7. Pclass mult