In [72]:
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

class EduAutoFE:
    """Educational automatic feature-engineering helper.

    Generates simple candidate features (single-column transforms and
    pairwise products/ratios), adds each one to the original columns in turn,
    and reports the candidates whose 5-fold cross-validated score beats the
    baseline of the untransformed data.

    Parameters
    ----------
    max_results : int, default 10
        Maximum number of improving candidates returned/printed by ``fit``.
    max_minutes : float or None, default None
        Time budget for the evaluation loop, in minutes. A falsy value
        (``None`` or ``0``) means every candidate is evaluated.

    Attributes
    ----------
    results : list of dict
        Improving candidates found by the most recent ``fit`` call.
    task_type : str
        ``"classification"`` or ``"regression"``; set by ``fit``.
    """

    def __init__(self, max_results=10, max_minutes=None):
        self.max_results = max_results
        self.max_minutes = max_minutes
        self.results = []

    def _validate_input(self, X, y):
        """Check that X and y are usable and return them unchanged.

        Raises
        ------
        TypeError
            If X is not a DataFrame or y is not a Series.
        ValueError
            If X is empty, X and y differ in length, X has more than 100
            columns, contains an ID-like column, contains non-numeric
            columns, or either X or y contains missing values.
        """
        # X needs to be a dataframe
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "Error: X must be a pandas DataFrame\n\n"
                "Fix: X = pd.DataFrame(X)"
            )

        # Y needs to be a series
        if not isinstance(y, pd.Series):
            raise TypeError(
                "Error: y must be a pandas Series\n\n"
                'Fix: y = pd.Series(y) or y = df["target_column"]'
            )

        # Fail early with a readable message instead of a ZeroDivisionError in
        # the ID check below or an error raised later during scaling.
        if X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError(
                "Error: X is empty\n\n"
                "Fix: pass a DataFrame with at least one row and one column"
            )

        # X and y must describe the same rows
        if len(X) != len(y):
            raise ValueError(
                f"Error: X has {len(X)} rows but y has {len(y)} values\n\n"
                f"Fix: make sure X and y come from the same rows of your data"
            )

        # Check for too many columns
        if len(X.columns) > 100:
            raise ValueError(
                f"Error: Too many columns ({len(X.columns)})\n\n"
                f"This happens when your dataset has too many features\n"
                f"or one-hot encoding created too many columns.\n\n"
                f'Drop with: X = X.drop(columns=["col1", "col2", ...])'
            )

        # Check for ID columns
        id_patterns = ["id", "index", "key", "number", "code"]
        for col in X.columns:
            # Column labels are not guaranteed to be strings (e.g. integer
            # labels from pd.DataFrame(array)), so normalise before matching.
            col_lower = str(col).lower()
            if any(pattern in col_lower for pattern in id_patterns):
                if X[col].dtype in [np.int64, np.float64]:
                    uniqueness = X[col].nunique() / len(X)
                    if uniqueness > 0.95:
                        raise ValueError(
                            f"Error: '{col}' looks like an ID column\n\n"
                            f"Drop with: X = X.drop(columns=['{col}'])"
                        )

        # Check for text columns
        cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
        if cat_cols:
            raise ValueError(
                f"Error: Found text columns: {cat_cols}\n\n"
                f"One-hot encode with: X = pd.get_dummies(X, drop_first=True, dtype=int)\n"
                f'Drop with: X = X.drop(columns=["text_col"])\n'
                f'Label encode with: X["col"] = LabelEncoder().fit_transform(X["col"])'
            )

        # Check for missing values in X
        if X.isnull().any().any():
            cols_with_missing = X.columns[X.isnull().any()].tolist()
            raise ValueError(
                f"Error: Missing values in: {cols_with_missing}\n\n"
                f"Fill with median: X = X.fillna(X.median())\n"
                f"Fill with mean: X = X.fillna(X.mean())\n"
                f'Drop with: X = X.drop(columns=["col_with_missing"])'
            )

        # Check for missing values in y
        if y.isnull().any():
            raise ValueError(
                f"Error: {y.isnull().sum()} missing values in y\n\n"
                f"Fill with: y = y.fillna(y.median())"
            )

        return X, y

    def _detect_task_type(self, y):
        """Return "classification" when y has exactly two unique values, else "regression".

        Prints a short explanation of the detected task and chosen model.
        """
        n_unique = y.nunique()

        if n_unique == 2:
            # Binary classification
            print(f"Detected: Binary Classification ({n_unique} classes)")
            print(f"   Model: Logistic Regression")
            print(f"   Why: Good for learning how features affect binary outcomes\n")
            return "classification"
        else:
            # Regression
            print(f"Detected: Regression ({n_unique} unique values)")
            print(f"   Model: Linear Regression")
            print(f"   Why: Good for learning how features affect continuous outcomes\n")
            return "regression"

    def _get_model_and_scoring(self):
        """Return ``(estimator, sklearn scoring name, display name)`` for ``self.task_type``."""
        if self.task_type == "regression":
            return LinearRegression(), "r2", "R2"
        else:
            return LogisticRegression(max_iter=1000, random_state=42), "accuracy", "Accuracy"

    def _evaluate_feature(self, X_base, new_feature, y):
        """Score ``X_base`` plus ``new_feature`` with 5-fold cross-validation.

        Returns
        -------
        tuple of float
            Mean and standard deviation of the fold scores.
        """
        scaler = StandardScaler()
        model, scoring, _ = self._get_model_and_scoring()

        # Concatenate new feature with existing ones
        X_with_new = pd.concat([X_base, new_feature], axis=1)

        # Scale and run cross validation
        # NOTE(review): the scaler is fitted on all rows before the folds are
        # split, so scaling statistics are shared between train and test
        # folds. The baseline in fit() is computed the same way, so the
        # comparison stays like-for-like; a Pipeline would remove the sharing
        # but would also change the reported scores.
        X_scaled = scaler.fit_transform(X_with_new)
        scores = cross_val_score(model, X_scaled, y, cv=5, scoring=scoring)

        return scores.mean(), scores.std()

    def _generate_candidates(self, X):
        """Build every candidate feature for X.

        Single-column transforms (log, sqrt, square, cube) are applied to all
        columns; pairwise multiply/divide is applied to each unordered pair of
        columns that have more than two unique values. Candidates containing
        NaN or infinity, or whose computation raises, are skipped.

        Returns
        -------
        tuple
            ``(candidates, n_continuous)`` where ``candidates`` is a list of
            ``(feature_name, feature_series, description)`` tuples and
            ``n_continuous`` is the number of columns used for pairwise ops.
        """
        candidates = []

        # Single column transformations with descriptions
        transforms = {
            "log": (lambda x: np.log(x + 1), "Log transformation of {col}. Compresses the long tail in the high part of heavy-tailed distributions and expands the low part, making data more normally distributed. \nCommon applications: data that spreads over several orders of magnitude, such as prices, populations, incomes, number of reviews, word frequencies, and sales figures."),
            "sqrt": (lambda x: np.sqrt(np.abs(x)), "Square root of {col}. Used for Poisson-distributed data where variance equals the mean. Stabilizes variance so it is no longer dependent on the mean. Also used for compressing the long tail and strengthening the signal. \nCommon applications: count data, event frequencies, patient measurements with extreme values (like weight or blood pressure), and visit counts."),
            "square": (lambda x: x ** 2, "Square of {col}. Polynomial transformation used to capture non-linear patterns in data, which is especially valuable for linear models that have difficulty finding these relationships on their own. Adds higher-order components to create new, more complex features. \nCommon applications: variables with quadratic relationships, such as age effects, distance calculations, and diminishing returns patterns."),
            "cube": (lambda x: x ** 3, "Cube of {col}. Polynomial transformation used to capture non-linear patterns in data, which is especially valuable for linear models that have difficulty finding these relationships on their own. Adds higher-order components to create new, more complex features. \nCommon applications: variables with strong non-linear relationships, such as accelerating growth patterns, S-shaped curves, and compound effects."),
        }

        # Generate single column transformations
        for col in X.columns:
            for name, (func, desc_template) in transforms.items():
                try:
                    transformed = func(X[col])

                    # Skip if we get NaN or infinity
                    if pd.isna(transformed).any() or np.isinf(transformed).any():
                        continue

                    feature_name = f"{name}({col})"
                    feature_series = pd.Series(transformed, index=X.index, name=feature_name)
                    description = desc_template.format(col=col)

                    candidates.append((feature_name, feature_series, description))
                except Exception:
                    # Best effort: a transform that cannot be computed for this
                    # column is skipped. `Exception` (not a bare except) so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    continue

        # Pairwise operations with descriptions
        operations = {
            "multiply": (lambda a, b: a * b, "Multiplication of {col1} and {col2}. Captures interaction effects and combined impact. \nCommon applications: creating area features (length × width), calculating population (area × density), computing total cost (price × quantity), and modeling combined effects."),
            "divide": (lambda a, b: a / (b + 1e-5), "Division of {col1} by {col2}. Used to create per-unit measures by dividing one variable by another. \nCommon applications: calculating BMI (weight ÷ height²), price efficiency (price ÷ area), density metrics (population ÷ area), and normalized rates."),
        }

        # Only do pairwise on continuous columns, not binary ones
        continuous_cols = [col for col in X.columns if X[col].nunique() > 2]

        # Generate pairwise transformations
        for i, col1 in enumerate(continuous_cols):
            for col2 in continuous_cols[i+1:]:
                for op_name, (func, desc_template) in operations.items():
                    try:
                        transformed = func(X[col1], X[col2])

                        # Skip invalid values
                        if pd.isna(transformed).any() or np.isinf(transformed).any():
                            continue

                        feature_name = f"{col1} {op_name} {col2}"
                        feature_series = pd.Series(transformed, index=X.index, name=feature_name)
                        description = desc_template.format(col1=col1, col2=col2)

                        candidates.append((feature_name, feature_series, description))
                    except Exception:
                        # Same best-effort policy as for single-column transforms.
                        continue

        return candidates, len(continuous_cols)

    def _print_results(self, df, baseline_score, baseline_std):
        """Print the ranked results table plus a short interpretation guide.

        ``df`` is expected to hold the columns "feature", "score", "std",
        "improvement" and "description", already sorted best-first.
        """
        metric = "Accuracy" if self.task_type == "classification" else "R2"

        # Single quotes inside the f-strings below: reusing the outer quote
        # character inside a replacement field only parses on Python 3.12+.
        print("\n" + "="*60)
        print(f"{'RESULTS':^60}")
        print("="*60)
        print(f"\nBaseline {metric}: {baseline_score:.4f} +/- {baseline_std:.4f}")

        if len(df) > 0:
            best = df.iloc[0]["improvement"]
            print(f"Best improvement: +{best:.4f}")

        print(f"\nTop {len(df)} Transformations:")
        print("-"*60)

        # Number rows by position so the ranking does not depend on df's index.
        for rank, (_, row) in enumerate(df.iterrows(), start=1):
            print(f"\n{rank}. {row['feature']}")
            print(f"{metric}: {row['score']:.4f} +/- {row['std']:.4f} (+{row['improvement']:.4f})")
            print(f"{row['description']}")

        print("\n" + "="*60)

        print("\nWhat does this mean?")
        if self.task_type == "classification":
            print("Accuracy shows what % of predictions are correct.")
            print("Higher accuracy = better model performance.")
            print("+/- std shows consistency across different data splits.")
        else:
            print("R2 shows what % of variation in the target the model explains.")
            print("R2 = 0.8 means the model explains 80% of the pattern.")
            print("+/- std shows consistency across different data splits.")

        print("\nNext steps:")
        print("1. Try applying these transformations to your data")
        print("2. Combine multiple good transformations together")
        print("3. Test with other models (Random Forest, XGBoost, etc.)")
        print("4. Remember: these transformations help most with linear models!")
        print("\n" + "="*60)

    def fit(self, X, y):
        """Search for single added features that improve on the baseline score.

        Parameters
        ----------
        X : pandas.DataFrame
            Numeric feature matrix without missing values.
        y : pandas.Series
            Target; exactly two unique values selects classification,
            anything else regression.

        Returns
        -------
        pandas.DataFrame
            Up to ``max_results`` improving candidates sorted by improvement
            (columns: feature, score, std, improvement, description), or an
            empty DataFrame when no candidate beats the baseline.

        Raises
        ------
        TypeError, ValueError
            Propagated from input validation.
        """
        # Validate input
        X, y = self._validate_input(X, y)

        # Start from a clean slate so a second fit() on the same instance does
        # not mix in results from an earlier dataset or run.
        self.results = []

        # Set task type
        self.task_type = self._detect_task_type(y)

        # Calculate baseline
        print("Calculating baseline performance...")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        model, scoring, _ = self._get_model_and_scoring()
        scores = cross_val_score(model, X_scaled, y, cv=5, scoring=scoring)
        baseline_score = scores.mean()
        baseline_std = scores.std()

        # Generate all candidates
        print(f"\nGenerating candidate transformations...")
        candidates, _ = self._generate_candidates(X)

        # Randomize order so time limited runs test different candidates each time
        random.shuffle(candidates)

        # Print the amount of candidates created
        print(f"   Generated {len(candidates)} candidates")

        # Print which mode is used
        if self.max_minutes:
            print(f"\nEvaluating candidates (max {self.max_minutes} minutes)...")
        else:
            print(f"\nEvaluating candidates (exhaustive search)...")
        print(f"   This may take a moment...\n")

        # Track time
        start_time = time.time()
        candidates_tested = 0

        # Evaluate each candidate
        for feature_name, feature_series, description in candidates:
            # Check time limit
            if self.max_minutes:
                elapsed_minutes = (time.time() - start_time) / 60
                if elapsed_minutes >= self.max_minutes:
                    print(f"   Time limit reached ({self.max_minutes} min)")
                    print(f"   Tested {candidates_tested} of {len(candidates)} candidates\n")
                    break

            try:
                # Wrap in dataframe for concat
                new_feature_df = pd.DataFrame({feature_name: feature_series})

                score, std = self._evaluate_feature(X, new_feature_df, y)
                improvement = score - baseline_score

                candidates_tested += 1

                # Only keep if it improves the score
                if improvement > 0:
                    self.results.append({
                        "feature": feature_name,
                        "score": score,
                        "std": std,
                        "improvement": improvement,
                        "description": description
                    })
            except Exception:
                # Best effort: a candidate that fails to evaluate is skipped.
                # Catching Exception rather than everything keeps Ctrl-C /
                # kernel interrupt working during a long search.
                continue

        # Check if there were no features that improved the performance
        if not self.results:
            print("No improvements found over baseline.")
            return pd.DataFrame()

        # Sort results by improvement
        self.results.sort(key=lambda x: x["improvement"], reverse=True)
        results_df = pd.DataFrame(self.results[:self.max_results])

        # Print the results
        self._print_results(results_df, baseline_score, baseline_std)

        return results_df

In [73]:
# Read the training set, then split it into the target (y) and the predictors (X)
df = pd.read_csv("train.csv")
y = df.loc[:, "Survived"]
X = df.drop("Survived", axis=1)

X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [74]:
# Basic preprocessing

# Drop identifier-like and free-text columns in one call.
# errors="ignore" keeps this cell re-runnable: X is reassigned from itself, so on
# a second run these columns are already gone and a strict drop raises KeyError.
X = X.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"], errors="ignore")

# Encode categorical variables
X = pd.get_dummies(X, drop_first=True, dtype=int)

# Fill missing values
X = X.fillna(X.median())
y = y.fillna(y.median())

# EduAutoFE.fit shuffles its candidates with the global `random` module; seed it
# so a time-limited run evaluates the same candidates every time.
random.seed(42)

# Use the tool
model = EduAutoFE(max_minutes=1)
results = model.fit(X, y)

Detected: Binary Classification (2 classes)
   Model: Logistic Regression
   Why: Good for learning how features affect binary outcomes

Calculating baseline performance...

Generating candidate transformations...
   Generated 52 candidates

Evaluating candidates (max 1 minutes)...
   This may take a moment...


                          RESULTS                           

Baseline Accuracy: 0.7857 +/- 0.0184
Best improvement: +0.0224

Top 10 Transformations:
------------------------------------------------------------

1. log(Age)
Accuracy: 0.8081 +/- 0.0237 (+0.0224)
Log transformation of Age. Compresses the long tail in the high part of heavy-tailed distributions and expands the low part, making data more normally distributed. 
Common applications: data that spreads over several orders of magnitude, such as prices, populations, incomes, number of reviews, word frequencies, and sales figures.

2. sqrt(Age)
Accuracy: 0.8036 +/- 0.0210 (+0.0179)
Square root of Age. Used for Poisson-dis