In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTClassifier

# Load datasets
train_df = pd.read_csv(r"C:\Users\satyamanishankar\handson\code\data\train.csv")
test_df = pd.read_csv(r"C:\Users\satyamanishankar\handson\code\data\test.csv")

# Rename target variable for TPOT compatibility
train_df.rename(columns={'Survived': 'class'}, inplace=True)

# Drop high-cardinality categorical features
drop_cols = ['Name', 'Ticket', 'Cabin']
train_df = train_df.drop(columns=[col for col in drop_cols if col in train_df.columns], errors='ignore')
test_df = test_df.drop(columns=[col for col in drop_cols if col in test_df.columns], errors='ignore')

# Encode categorical variables
label_enc = LabelEncoder()
for col in ['Sex', 'Embarked']:
    if col in train_df.columns:
        train_df[col] = label_enc.fit_transform(train_df[col].astype(str))
        test_df[col] = label_enc.transform(test_df[col].astype(str))

# Fill missing values with median
train_df.fillna(train_df.median(numeric_only=True), inplace=True)
test_df.fillna(test_df.median(numeric_only=True), inplace=True)

# Split features and target
X = train_df.drop(columns=['class'])
y = train_df['class']

# Split dataset for training/testing
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Run TPOT to find the best model
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)

# Evaluate best model on validation set
test_score = tpot.score(X_val, y_val)
print(f"Best Model Test Score: {test_score:.4f}")

# Export best pipeline
tpot.export("best_tpot_pipeline.py")

# Predict on test set
test_predictions = tpot.predict(test_df)

# Save predictions
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': test_predictions})
submission.to_csv("titanic_predictions.csv", index=False)
print("Predictions saved to titanic_predictions.csv")


is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor




is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier


Version 0.12.2 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8131685216192258

Generation 2 - Current best internal CV score: 0.8244361272530287


  File "c:\Users\satyamanishankar\python3.12\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\satyamanishankar\python3.12\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\satyamanishankar\python3.12\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\satyamanishankar\python3.12\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



Generation 3 - Current best internal CV score: 0.8244361272530287

Generation 4 - Current best internal CV score: 0.8244361272530287

Generation 5 - Current best internal CV score: 0.8244361272530287

Best pipeline: ExtraTreesClassifier(RandomForestClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.4, min_samples_leaf=10, min_samples_split=4, n_estimators=100), bootstrap=False, criterion=entropy, max_features=0.35000000000000003, min_samples_leaf=18, min_samples_split=9, n_estimators=100)
Best Model Test Score: 0.8212
Predictions saved to titanic_predictions.csv


