In [6]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')


# 1. Check for missing values

In [None]:
training_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Findings
- Age has missing values - 277
- Cabin has missing values - 687
- Embarked has missing values - 2

### Use Data Wrangler to do
- fill median for age values
- drop Cabin
- drop row for embarked with null values


# 2. Clean train and data 

In [None]:
def clean_data(df):
    # Drop column: 'Cabin'
    df = df.drop(columns=['Cabin'])

    # Replace missing values with the median of each column in: 'Age'
    df = df.fillna({'Age': df['Age'].median()})

    # Drop rows with missing data in column: 'Embarked'
    df = df.dropna(subset=['Embarked'])
    
    return df


csv_train_df = pd.read_csv(r'data/train.csv')

clean_train_df = clean_data(csv_train_df.copy())
clean_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [36]:
csv_test_df = pd.read_csv(r'data/test.csv')
clean_test_df = clean_data(csv_test_df.copy())
clean_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 32.8+ KB


# 3. Feature Engineering

In [35]:
def feature_engineering(df):
    # Drop column: 'PassengerId', 'Name', 'Ticket'
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])

    # One-hot encode column: 'Embarked'
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked', drop_first=True, dtype=int)

    # One-hot encode column: 'Sex'
    insert_loc = df.columns.get_loc('Sex')
    df = pd.concat([df.iloc[:,:insert_loc], pd.get_dummies(df.loc[:, ['Sex']], drop_first=True), df.iloc[:,insert_loc+1:]], axis=1)
    
    # Log transform 'Fare' to reduce skewness
    df['log_Fare'] = np.log1p(df['Fare'])
    df = df.drop(columns=['Fare'])

    # Created column 'FamilySize' from formula
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df = df.drop(columns=['SibSp', 'Parch'])

    return df


train_df = feature_engineering(clean_train_df.copy())
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    889 non-null    int64  
 1   Pclass      889 non-null    int64  
 2   Sex_male    889 non-null    bool   
 3   Age         889 non-null    float64
 4   Embarked_Q  889 non-null    int64  
 5   Embarked_S  889 non-null    int64  
 6   log_Fare    889 non-null    float64
 7   FamilySize  889 non-null    int64  
dtypes: bool(1), float64(2), int64(5)
memory usage: 56.4 KB


In [37]:
test_df = feature_engineering(clean_test_df.copy())
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Sex_male    418 non-null    bool   
 2   Age         418 non-null    float64
 3   Embarked_Q  418 non-null    int64  
 4   Embarked_S  418 non-null    int64  
 5   log_Fare    417 non-null    float64
 6   FamilySize  418 non-null    int64  
dtypes: bool(1), float64(2), int64(4)
memory usage: 20.1 KB


# 4. Prepare for model training

In [38]:
from sklearn.model_selection import train_test_split

continuous_features = ['Age', 'log_Fare', 'FamilySize']

X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y # Important for classification to keep proportion of 'Survived' similar in both sets
)


In [None]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

# Fit the scaler ONLY on the training data (X_train)
scaler.fit(X_train[continuous_features])

# Transform both the training and testing sets
X_train[continuous_features] = scaler.transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

test_df[continuous_features] = scaler.transform(test_df[continuous_features])

print("✅ Scaling complete. Features are now ready for modeling.")
print("\nX_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

✅ Scaling complete. Features are now ready for modeling.

X_train shape: (711, 7)
X_test shape: (178, 7)
test shape: (418, 7)


In [56]:
from sklearn.ensemble import RandomForestClassifier

# 1. Choose a model
model = RandomForestClassifier(n_estimators=500, random_state=42)

# 2. Train the model using the prepared training data
model.fit(X_train, y_train)

print("\nModel training complete.")


Model training complete.


In [57]:
# Assuming your trained model is named 'model'
# and your prepared test features are named 'X_test'

# Generate predictions for the test features
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7978


In [None]:
y_pred = model.predict(test_df)

# Create a new DataFrame with the required columns
submission_df = pd.DataFrame({
    'PassengerId': clean_test_df['PassengerId'], # Get the Passenger IDs from the original file
    'Survived': y_pred                  # Your model's predicted output (0 or 1)
})

# Ensure the 'Survived' column is of integer type (0 or 1) as required by most platforms
submission_df['Survived'] = submission_df['Survived'].astype(int)

# Save the DataFrame to a CSV file without the index
submission_df.to_csv('submission/titanic_submission.csv', index=False)

# Display the first few rows to verify the format
print("Submission:")
submission_df.info()

Submission:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [61]:
survived_count = submission_df['Survived'].sum()

print(f"\nTotal predicted Survivors in the submission file: {survived_count}")


Total predicted Survivors in the submission file: 116
