# ***Feature Optimization for Classification Problems using Recursive Feature Elimination (RFE)***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.feature_selection import RFE

# Ignore warnings
# Called with only the action, this filter matches every warning category,
# message and module, so nothing raised after this cell is shown.
# NOTE(review): that also hides deprecation / convergence warnings from the
# libraries used below - consider narrowing it with `category=` or `module=`.
warnings.filterwarnings('ignore')


In [None]:
#Load dataset
# NOTE(review): absolute Google Drive path - the notebook only runs where this
# exact location is mounted; consider moving it to a configurable data directory.
file_path = '/content/drive/MyDrive/titanic_final_dataset(1).csv'
df = pd.read_csv(file_path)
print("Dataset loaded successfully!")

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare: wrapping it in print() appended a stray "None" to the output.
df.info()

# Plain-text preview of the first rows.
print(df.head())


Dataset loaded successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   passenger_id      900 non-null    int64  
 1   survived          900 non-null    object 
 2   ticket_class      898 non-null    object 
 3   name              900 non-null    object 
 4   sex               897 non-null    object 
 5   age               764 non-null    float64
 6   siblings_spouses  746 non-null    object 
 7   parents_children  749 non-null    object 
 8   ticket            897 non-null    object 
 9   fare              852 non-null    float64
 10  cabin             255 non-null    object 
 11  embarked          679 non-null    object 
 12  profession        782 non-null    object 
 13  travel_notes      791 non-null    object 
dtypes: float64(2), int64(1), object(11)
memory usage: 98.6+ KB
None
   passenger_id survived ticket_class        

In [None]:
#Handle missing values
# Count the NaN entries per column before any imputation is applied.
missing_before = df.isna().sum()
print("Missing values before filling:\n", missing_before)

Missing values before filling:
 passenger_id          0
survived              0
ticket_class          2
name                  0
sex                   3
age                 136
siblings_spouses    154
parents_children    151
ticket                3
fare                 48
cabin               645
embarked            221
profession          118
travel_notes        109
dtype: int64


In [None]:
# Fill missing numerical with median
# The result is assigned back rather than mutated in place so the statement
# behaves the same under every pandas version.
df = df.fillna(df.median(numeric_only=True))

# Fill missing categorical with mode
for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    if modes.empty:
        # mode() ignores NaN, so a column with no observed values has no mode
        # to fill with; leave it untouched instead of raising on modes[0].
        continue
    # `df[col].fillna(..., inplace=True)` operates on a temporary Series:
    # pandas 2.x emits a FutureWarning for it and with Copy-on-Write enabled
    # the parent frame is left unchanged. Assigning the column back is the
    # supported form.
    df[col] = df[col].fillna(modes.iloc[0])

print("Missing values after filling:\n", df.isnull().sum())


Missing values after filling:
 passenger_id        0
survived            0
ticket_class        0
name                0
sex                 0
age                 0
siblings_spouses    0
parents_children    0
ticket              0
fare                0
cabin               0
embarked            0
profession          0
travel_notes        0
dtype: int64


In [None]:
#Encode categorical columns
# Columns the analysis explicitly treats as categorical (the target included).
categorical_columns = ['survived', 'ticket_class', 'sex', 'siblings_spouses', 'parents_children', 'embarked']

# Every other column that still holds strings has to be encoded as well:
# all non-target columns are later passed to StandardScaler, which cannot
# convert raw strings to floats, so a fresh run would fail at the scaling step.
# NOTE(review): label codes for free-text / high-cardinality columns are
# arbitrary integers - confirm these columns are wanted as features at all.
categorical_columns += [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in categorical_columns
]

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels with `inverse_transform`.
LabelEncoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    LabelEncoders[col] = le

In [None]:
#Feature scaling
# Target is the encoded `survived` column; every remaining column is a predictor.
y = df['survived']
X = df.drop('survived', axis=1)

# Standardise each predictor column (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows here, i.e. before any
# train/test split happens.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
#Train/Test split
# Split the *unscaled* features first. The row partition depends only on the
# number of rows, `stratify` and `random_state`, so it is the same 80/20
# stratified split as splitting the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Fitting on the full dataset lets test-set means/variances leak
# into training and makes the held-out scores optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # NumPy array, as downstream cells expect
X_test = scaler.transform(X_test_raw)

In [None]:
#Baseline Logistic Regression
# Reference model trained on the full feature set.
model_lr = LogisticRegression(solver="liblinear", max_iter=1000)
model_lr.fit(X_train, y_train)

# Hard class labels feed the threshold metrics; column 1 of predict_proba
# (second entry of `classes_`) feeds ROC AUC.
y_pred = model_lr.predict(X_test)
y_proba = model_lr.predict_proba(X_test)[:, 1]

print("Baseline Logistic Regression")
label_metrics = (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
)
for caption, metric_fn in label_metrics:
    print(caption, metric_fn(y_test, y_pred))
print("ROC AUC  :", roc_auc_score(y_test, y_proba))


Baseline Logistic Regression
Accuracy : 0.5666666666666667
Precision: 0.3157894736842105
Recall   : 0.08450704225352113
F1-score : 0.13333333333333333
ROC AUC  : 0.46905284920532364


In [None]:
#Baseline Decision Tree
# Second reference model on the full feature set; the seed fixes tie-breaking
# between equally good splits so the tree is reproducible.
model_tree = DecisionTreeClassifier(random_state=42)
model_tree.fit(X_train, y_train)

y_pred = model_tree.predict(X_test)
y_proba = model_tree.predict_proba(X_test)[:, 1]

print("\nBaseline Decision Tree")
# (caption, metric, prediction vector) - ROC AUC is the only metric scored on
# probabilities rather than hard labels.
tree_report = [
    ("Accuracy :", accuracy_score, y_pred),
    ("Precision:", precision_score, y_pred),
    ("Recall   :", recall_score, y_pred),
    ("F1-score :", f1_score, y_pred),
    ("ROC AUC  :", roc_auc_score, y_proba),
]
for caption, metric_fn, y_hat in tree_report:
    print(caption, metric_fn(y_test, y_hat))


Baseline Decision Tree
Accuracy : 0.5
Precision: 0.379746835443038
Recall   : 0.4225352112676056
F1-score : 0.4
ROC AUC  : 0.4864969634319679


In [None]:
# Choose Logistic Regression as base estimator
# RFE refits this estimator on progressively smaller feature sets.
estimator = LogisticRegression(max_iter=1000)

# Ask RFE to select best 5 features
# step=1 is the library default, spelled out here: one feature is dropped per round.
rfe = RFE(estimator=estimator, n_features_to_select=5, step=1)
rfe.fit(X_train, y_train)


In [None]:
# Print selected features
# The fitted selector exposes a boolean mask over the training columns;
# X still carries the matching column names.
support_mask = rfe.get_support()
selected_features = X.columns[support_mask]
print("\nSelected Features by RFE:", selected_features.tolist())


Selected Features by RFE: ['name', 'sex', 'parents_children', 'cabin', 'profession']


In [None]:
# Optimized Model (only selected features)

# X_train / X_test are indexed positionally below, so the selected column
# names are translated back to their positions in X.
selected_feature_indices = [X.columns.get_loc(name) for name in selected_features]
X_train_sel = X_train[:, selected_feature_indices]
X_test_sel = X_test[:, selected_feature_indices]

# Refit a logistic regression on the reduced feature matrix.
# NOTE(review): the baseline logistic regression cell passes solver="liblinear"
# while this model uses the default solver, so the two scores differ by more
# than the feature subset alone - consider aligning the two configurations.
model_opt = LogisticRegression(max_iter=1000)
model_opt.fit(X_train_sel, y_train)
y_pred_opt = model_opt.predict(X_test_sel)
y_proba_opt = model_opt.predict_proba(X_test_sel)[:, 1]


print("\n=== Optimized Model (Selected Features) ===")
opt_label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for caption, metric_fn in opt_label_metrics:
    print(caption, metric_fn(y_test, y_pred_opt))
print("ROC AUC:", roc_auc_score(y_test, y_proba_opt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_opt))


=== Optimized Model (Selected Features) ===
Accuracy: 0.5944444444444444
Precision: 0.42857142857142855
Recall: 0.08450704225352113
F1 Score: 0.1411764705882353
ROC AUC: 0.4880475513632253

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.93      0.73       109
           1       0.43      0.08      0.14        71

    accuracy                           0.59       180
   macro avg       0.52      0.51      0.44       180
weighted avg       0.54      0.59      0.50       180



In [None]:
# Compare Results
# Side-by-side listing of the full feature set and the RFE-selected subset.
all_feature_names = X.columns.tolist()
kept_feature_names = list(selected_features)

print("\n>>> Comparison:")
print("Baseline used all features:", all_feature_names)
print("Optimized used selected features:", kept_feature_names)


>>> Comparison:
Baseline used all features: ['passenger_id', 'ticket_class', 'name', 'sex', 'age', 'siblings_spouses', 'parents_children', 'ticket', 'fare', 'cabin', 'embarked', 'profession', 'travel_notes']
Optimized used selected features: ['name', 'sex', 'parents_children', 'cabin', 'profession']
