In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Loading datasets

In [2]:

# Read the training and testing CSV files into DataFrames (train first, then test)
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))


Data Preprocessing

In [3]:

# Columns treated as irrelevant for modelling; removed from both datasets
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone from train_data/test_data and a strict drop would raise KeyError.
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

# Separate target variable 'Survived' from features in training data
X = train_data.drop(columns='Survived')
y = train_data['Survived']

# Hold out 20% of the rows as a validation set; random_state=42 fixes the shuffle
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



Data Transformation - One-Hot Encoding and filling in missing values (most frequent value for categorical columns, mean for numerical columns)

In [4]:

# Feature columns, grouped by the kind of preprocessing they receive
numerical_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_cols = ['Sex', 'Embarked']

# Numerical columns: replace missing values with the column mean
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode; handle_unknown='ignore' is passed so categories not
# seen during fit do not raise at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer ('cat' first, then 'num')
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols),
])

Using a Random Forest Classifier to build the model, and training it

In [5]:

# Random Forest Classifier with a fixed random_state
model = RandomForestClassifier(random_state=42)

# Chain preprocessing and the classifier into one pipeline and fit it
# on the training split (Pipeline.fit returns the fitted pipeline itself)
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Predict labels for the held-out validation split
y_pred = clf.predict(X_val)


Use the validation split held out from the training data to estimate accuracy locally. The training data was split into training and validation sets earlier in the notebook.

In [6]:

# Score the validation predictions: overall accuracy plus the per-class report
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f'Accuracy: {accuracy * 100:,.3f}%')
print(report)

# Predict on the testing dataset; it is used as-is because it has no 'Survived' column
X_test = test_data
y_pred_test = clf.predict(X_test)

# y_pred_test is now available for further analysis or submission


Accuracy: 81.564%
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.81      0.82      0.81       179



Saving the test-set predictions from the model trained above to a CSV file

In [7]:
# PassengerId was dropped from test_data during preprocessing, so it is read
# back from the original file; usecols limits parsing to that single column
# instead of loading the whole CSV a second time.
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']

# Pair each PassengerId with its predicted 'Survived' value
results = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': y_pred_test})

# Save the DataFrame to a CSV file without the index column
results.to_csv('predictions.csv', index=False)