In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score


from sklearn.linear_model import LogisticRegression

In [2]:
# Read the training CSV into a DataFrame (adjust the path to wherever the file lives)
df = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Hold out 20% of the rows for a final check; the fixed random_state keeps
# the partition identical from run to run.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Input columns, grouped by how they are preprocessed
num_features = ['Age', 'Fare']
cat_features = ['Sex', 'Embarked', 'Pclass']

# Numeric branch: fill missing values with the column median, then standardise
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown='ignore' so unseen categories do not raise)
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

In [5]:
# Model inputs and target taken from the training split
X_train = train_data[[*num_features, *cat_features]]
y_train = train_data['Survived']

# End-to-end model: preprocessing followed by logistic regression
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Estimate accuracy with 5-fold cross-validation, using the training split only
scores = cross_val_score(
    lr_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Logistic Regression CV Accuracy:", scores.mean())

Logistic Regression CV Accuracy: 0.7850684526740864


In [6]:
# Fit the pipeline on the full training split, then measure accuracy on the
# held-out split that cross-validation never saw.
lr_pipeline.fit(X_train, y_train)

X_test = test_data[[*num_features, *cat_features]]
y_test = test_data['Survived']
y_pred = lr_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7877094972067039


In [8]:
# Load test.csv and predict with the pipeline fitted earlier in the notebook
test_raw = pd.read_csv("test.csv")

# Select the same feature columns, in the same order, that the pipeline was trained on
X_submission = test_raw[num_features + cat_features]

submission_preds = lr_pipeline.predict(X_submission)

# Two-column frame: passenger identifier plus the predicted label
submission = pd.DataFrame({
    "PassengerId": test_raw["PassengerId"],
    "Survived": submission_preds
})

# Use one variable for both the write and the message so the printed
# filename always matches the file that was actually written.
submission_path = "submission_lr_no_fe.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission file created: {submission_path}")



Submission file created: submission.csv


#### When submitted, these predictions scored an accuracy of 0.76794, slightly below the 0.7877 measured on the local hold-out split.