In [145]:
import numpy as np
import pandas as pd

# Model-related imports
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Pre-processing and feature extraction
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

# Model evaluation and hyperparameter tuning
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Read the three input CSVs from the working directory, in the same order
# as before: test set, training set, then the sample submission.
test, train, baseline = [
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'train.csv', 'gender_submission.csv')
]

In [147]:
def preprocess_data_advanced(data):
    """Engineer features, impute, one-hot encode and scale a passenger frame.

    The work is done on a copy, so the DataFrame passed in is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns read below: 'Name', 'SibSp', 'Parch', 'Age',
        'Cabin', 'Fare', 'Embarked', 'Sex' and 'Pclass'. Any other columns are
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * 'Sex', 'Pclass', 'Embarked', 'AgeGroup', 'Title' and 'Deck' are
          replaced by one-hot indicator columns,
        * 'Age', 'SibSp', 'Parch', 'Fare' and 'FamilySize' are standardised,
        * 'Cabin' is dropped.

    Notes
    -----
    The imputation statistics (mean / mode) and the StandardScaler are fitted
    on whatever frame is passed in, so calling this separately on train and
    test data scales each with its own statistics.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    data = data.copy()

    # Title: the word that directly precedes a '.' in the name (raw string so
    # that '\.' is a regex escape rather than an invalid Python string escape).
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    title = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title = title.replace(rare_titles, 'Other')
    data['Title'] = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Family size: siblings/spouses + parents/children + the passenger.
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # Age groups use left-closed bins: [0, 12), [12, 20), [20, 40), [40, 60), [60, inf).
    # NOTE: binning happens before Age is imputed (as in the original code), so
    # rows with a missing Age get no AgeGroup and therefore all-zero AgeGroup_*
    # indicator columns after get_dummies.
    bins = [0, 12, 20, 40, 60, np.inf]
    labels = ['Child', 'Teenager', 'Adult', 'MiddleAge', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

    # Deck: first character of the cabin string, 'M' when the cabin is missing.
    data['Deck'] = data['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

    # Handle missing values. Assign the result back instead of calling
    # fillna(inplace=True) on a column selection, which is deprecated chained
    # assignment and has no effect under pandas Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].mean())
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])  # most frequent value

    # Log transformation on Fare: log(1 + fare).
    data['Fare'] = np.log1p(data['Fare'])

    # Encode categorical variables as one-hot indicator columns.
    data = pd.get_dummies(data, columns=['Sex', 'Pclass', 'Embarked', 'AgeGroup', 'Title', 'Deck'])

    # Standardise the numeric features.
    features_to_scale = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
    scaler = StandardScaler()
    data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

    # The raw cabin string is no longer needed once 'Deck' has been derived.
    data = data.drop(columns='Cabin')

    return data

In [148]:
# Replace the raw frames with their preprocessed versions.
# NOTE: because the names are rebound, this cell is not safe to run twice
# without reloading the CSVs first.
train = preprocess_data_advanced(train)
test = preprocess_data_advanced(test)

# Sanity check: list any columns that still hold missing values.
train_nan_columns = train.columns[train.isna().any()].tolist()
test_nan_columns = test.columns[test.isna().any()].tolist()
print("Columns with NaN values in training data:", train_nan_columns)
print("Columns with NaN values in test data:", test_nan_columns)


Columns with NaN values in training data: []
Columns with NaN values in test data: []


In [149]:
def align_columns(train, test):
    """Give ``test`` exactly the columns of ``train``, in the same order.

    Columns present in ``train`` but absent from ``test`` are added and
    filled with 0; columns that exist only in ``test`` are dropped.

    Parameters
    ----------
    train : pandas.DataFrame
        Reference frame whose columns define the target layout.
    test : pandas.DataFrame
        Frame to align. It is not modified; a new frame is returned.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``train`` itself (unchanged) and the aligned copy of ``test``.
    """
    # reindex adds the missing columns (filled with 0), drops the extras and
    # fixes the order in one step, without writing into the caller's frame.
    aligned_test = test.reindex(columns=train.columns, fill_value=0)

    return train, aligned_test

# Give the test frame the same columns, in the same order, as the training frame.
train, test = align_columns(train=train, test=test)


In [192]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Base models (hyperparameters kept exactly as originally chosen).
rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)
gb_clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=8, random_state=1)
lr_clf = LogisticRegression(max_iter=10000, random_state=1)

# Soft voting: the ensemble combines the predicted class probabilities of the three models.
vote_clf = VotingClassifier(estimators=[('rf', rf_clf), ('gb', gb_clf), ('lr', lr_clf)], voting='soft')

# Identifier / free-text columns and the target are not model features.
non_feature_cols = ['PassengerId', 'Name', 'Ticket', 'Survived']

# Define X and y here so the cell runs on a fresh kernel instead of relying on
# variables left over from cells that are no longer in the notebook.
# NOTE(review): X is built with the same column drop that is applied to the
# test frame below - confirm this matches the features used originally.
X = train.drop(columns=non_feature_cols)
y = train['Survived']

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

vote_clf.fit(X_train, y_train)

accuracy = vote_clf.score(X_val, y_val)
print("Validation Accuracy:", accuracy)

# errors='ignore' covers both cases: 'Survived' present in test (e.g. added as
# a zero column by align_columns) or absent.
X_test = test.drop(columns=non_feature_cols, errors='ignore')

ensemble_predictions = vote_clf.predict(X_test)

# Write the submission file, then show the frame as the cell output.
ensemble_result = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': ensemble_predictions})
ensemble_result.to_csv('submission_ensemble.csv', index=False)

ensemble_result


Validation Accuracy: 0.8435754189944135


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
