# 1. Importing Libraries/Datasets

In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [24]:
df = pd.read_csv('train.csv')

In [25]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 2. Feature Engineering

In [26]:
# Drop unnecessary columns, such as Name, Ticket, and Cabin
df = df[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].copy()

In [27]:
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

# 3. Data Cleaning and Train/Test Setup

In [32]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
dtype: int64

177 of the observations are missing a value for Age. I will not drop the column because Age is clearly a very important value for predicting survival rate in this situation, since a common mindset in dangerous events is to prioritize children's lives. Therefore, I will use a small model to predict the Age values for the missing values. 

In [35]:
# Model will be trained on rows with an Age value
age_train = df[df['Age'].notnull()]

In [36]:
age_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22.0,1,0,7.2500
1,2,1,1,0,38.0,1,0,71.2833
2,3,1,3,0,26.0,0,0,7.9250
3,4,1,1,0,35.0,1,0,53.1000
4,5,0,3,1,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
885,886,0,3,0,39.0,0,5,29.1250
886,887,0,2,1,27.0,0,0,13.0000
887,888,1,1,0,19.0,0,0,30.0000
889,890,1,1,1,26.0,0,0,30.0000


In [38]:
# Model will predict on test data where Age is not included
age_test = df[df['Age'].isnull()]

In [39]:
age_test

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
5,6,0,3,1,,0,0,8.4583
17,18,1,2,1,,0,0,13.0000
19,20,1,3,0,,0,0,7.2250
26,27,0,3,1,,0,0,7.2250
28,29,1,3,0,,0,0,7.8792
...,...,...,...,...,...,...,...,...
859,860,0,3,1,,0,0,7.2292
863,864,0,3,0,,8,2,69.5500
868,869,0,3,1,,0,0,9.5000
878,879,0,3,1,,0,0,7.8958


In [42]:
X_age_train = age_train[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']]
y_age_train = age_train['Age']

In [45]:
X_age_test = age_test[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']]

In [46]:
# Model fit and predict
model = RandomForestRegressor(n_estimators=100, random_state=1)
model.fit(X_age_train, y_age_train)
predictions = model.predict(X_age_test)

In [49]:
# Maybe consider going back and re-evaluating how accurate the model is on 
# a subset of the known Age values

In [52]:
df.loc[df['Age'].isnull(), 'Age'] = predictions

In [56]:
# Now, we have an Age value for all observations. 

In [57]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22.000000,1,0,7.2500
1,2,1,1,0,38.000000,1,0,71.2833
2,3,1,3,0,26.000000,0,0,7.9250
3,4,1,1,0,35.000000,1,0,53.1000
4,5,0,3,1,35.000000,0,0,8.0500
...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.000000,0,0,13.0000
887,888,1,1,0,19.000000,0,0,30.0000
888,889,0,3,0,16.873333,1,2,23.4500
889,890,1,1,1,26.000000,0,0,30.0000


In [58]:
X_train = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y_train = df['Survived']

In [59]:
test_df = pd.read_csv('test.csv')

In [64]:
test_df = test_df.set_index('PassengerId')

In [66]:
test_df = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

In [68]:
test_df.isnull().sum()

Pclass     0
Sex        0
Age       86
SibSp      0
Parch      0
Fare       1
dtype: int64

In [69]:
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)


In [74]:
test_df['Sex'] = test_df['Sex'].map({'male': 1, 'female': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Sex'] = test_df['Sex'].map({'male': 1, 'female': 0})


In [78]:
X_age_test = test_df[test_df['Age'].isna()]

In [81]:
X_age_test = X_age_test[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']]

In [82]:
predictions = model.predict(X_age_test)

In [86]:
test_df[test_df['Age'].isna()]

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
902,3,1,,0,0,7.8958
914,1,0,,0,0,31.6833
921,3,1,,2,0,21.6792
925,3,0,,1,2,23.4500
928,3,0,,0,0,8.0500
...,...,...,...,...,...,...
1300,3,0,,0,0,7.7208
1302,3,0,,0,0,7.7500
1305,3,1,,0,0,8.0500
1308,3,1,,0,0,8.0500


In [87]:
test_df.loc[test_df['Age'].isnull(), 'Age'] = predictions

In [90]:
test_df

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,1,34.500000,0,0,7.8292
893,3,0,47.000000,1,0,7.0000
894,2,1,62.000000,0,0,9.6875
895,3,1,27.000000,0,0,8.6625
896,3,0,22.000000,1,1,12.2875
...,...,...,...,...,...,...
1305,3,1,30.591286,0,0,8.0500
1306,1,0,39.000000,0,0,108.9000
1307,3,1,38.500000,0,0,7.2500
1308,3,1,30.591286,0,0,8.0500


In [91]:
X_test = test_df

In [93]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05


In [94]:
X_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,1,34.5,0,0,7.8292
893,3,0,47.0,1,0,7.0
894,2,1,62.0,0,0,9.6875
895,3,1,27.0,0,0,8.6625
896,3,0,22.0,1,1,12.2875


In [96]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# 4. Model Designing/Testing (on Train data)

In [98]:
# Split first dataset into train and test splits
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state = 1)

## Logistic Regression, Attempt 1

In [100]:
model = LogisticRegression(random_state = 1)

In [101]:
model.fit(X_train, y_train)

In [102]:
y_pred = model.predict(X_test)

In [106]:
accuracy = accuracy_score(y_pred, y_test)
cm = confusion_matrix(y_pred, y_test)
classification = classification_report(y_pred, y_test)

In [111]:
print(accuracy)

0.8212290502793296


In [112]:
print(cm)

[[94 20]
 [12 53]]


In [110]:
print(classification)

              precision    recall  f1-score   support

           0       0.89      0.82      0.85       114
           1       0.73      0.82      0.77        65

    accuracy                           0.82       179
   macro avg       0.81      0.82      0.81       179
weighted avg       0.83      0.82      0.82       179



With an accuracy of just over 82%, this is certainly not bad - but I think with some adjustments, I could improve the results. But first, I want to train this model on the whole train dataset and then apply it to the Kaggle test dataset for my first submission. 

In [114]:
model = LogisticRegression(random_state=1)
X_train = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y_train = df['Survived']
X_test = test_df
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [117]:
test_df['Survived'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Survived'] = predictions


In [119]:
test_df = test_df['Survived']

In [120]:
test_df

PassengerId
892     0
893     0
894     0
895     0
896     1
       ..
1305    0
1306    1
1307    0
1308    0
1309    0
Name: Survived, Length: 418, dtype: int64

In [122]:
test_df.to_csv('titanic_predictions.csv', index=True)

## Kaggle Score for this model: Score: 0.75837

## Planned Adjustments: 
- Split Age into ranges (child versus adult)
- I will consider introducing an interaction feature between age and gender — for example, by creating a variable that multiplies a binary "child" indicator (e.g., Age < 16) with gender. This could help capture the overlapping effect of being both young and female, since both individually correlate with higher survival rates. Without this interaction, the model might be overestimating the effect when both conditions are present, leading to potential overfitting. By explicitly modeling this interaction, we give the model a better opportunity to learn the combined influence, rather than treating them as entirely separate signals.
- Explore potential multicollinearity. Especially look out for Fare and Pclass; I have a feeling they are highly correlated since higher class tickets would cost more. Fare could also correlate with age, since children's tickets would likely be cheaper. 
- Make the two sibling and parent features into one "family size" feature
- 