## Imports

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import tarfile
import urllib.request
import os
import matplotlib.pyplot as plt

## Getting the data

In [None]:
def load_titanic_train_data(
    archive_path="/kaggle/input/titanic-data/titanic.tgz",
    extract_dir="/kaggle/working",
):
    """Extract the Titanic archive and return the training split.

    Parameters
    ----------
    archive_path : str or path-like, optional
        Location of the gzip-compressed tar archive. Defaults to the Kaggle
        input path used by this notebook.
    extract_dir : str or path-like, optional
        Directory the archive is unpacked into. Defaults to the Kaggle
        working directory.

    Returns
    -------
    pandas.DataFrame
        Contents of ``<extract_dir>/titanic/train.csv``.

    Raises
    ------
    FileNotFoundError
        If the archive, or ``titanic/train.csv`` after extraction, is missing.
    """
    extract_dir = Path(extract_dir)

    with tarfile.open(archive_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # The "data" filter rejects members that would be written outside
            # extract_dir (absolute paths, "..", unsafe links).
            tar.extractall(path=extract_dir, filter="data")
        else:
            # Older interpreters have no extraction filters.
            tar.extractall(path=extract_dir)

    return pd.read_csv(extract_dir / "titanic" / "train.csv")

## Preprocessing

In [None]:
# Unpack the archive and load the training split.
titanic = load_titanic_train_data()

# test.csv is read from the directory the call above extracts into, so this
# line only works after load_titanic_train_data() has run.
# NOTE(review): assumes the archive also contains titanic/test.csv — confirm.
titanic_df_test = pd.read_csv(Path("/kaggle/working/titanic/test.csv"))

In [None]:
# Column dtypes and non-null counts: shows which columns contain missing values.
titanic.info()

In [None]:
# Impute missing ages with the median age of the training passengers.
median = titanic["Age"].median()
titanic["Age"] = titanic["Age"].where(titanic["Age"].notna(), median)

In [None]:
# Impute missing ports of embarkation with the most frequently occurring port.
mode_embarked = titanic["Embarked"].mode().iloc[0]
titanic["Embarked"] = titanic["Embarked"].where(titanic["Embarked"].notna(), mode_embarked)

titanic.head()

### **Encode Sex as numbers: male = 1, female = 0**

In [None]:
# Encode Sex numerically: male -> 1, female -> 0.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell leaves an already-encoded column intact instead of
# silently turning every value into NaN.
sex_codes = {"male": 1, "female": 0}
titanic["Sex"] = titanic["Sex"].map(lambda value: sex_codes.get(value, value))
titanic.head()

In [None]:
# Remove the columns that are not used as features:
#   Cabin       - too many null values to tackle
#   Name        - treated as redundant
#   Ticket      - treated as redundant
#   PassengerId - treated as redundant
titanic = titanic.drop(columns=["Cabin", "Name", "Ticket", "PassengerId"])

titanic.head()

In [None]:
# Keep the categorical column as a one-column DataFrame for one-hot encoding.
titanic_cat = titanic.loc[:, ["Embarked"]]

titanic.head()

In [None]:
# Alternative (not used): one-hot encoding with scikit-learn's OneHotEncoder


# from sklearn.preprocessing import OneHotEncoder

# cat_encoder = OneHotEncoder()
# titanic_cat_1hot = cat_encoder.fit_transform(titanic_cat) # this returns a SciPy sparse matrix

# titanic_cat_1hot.toarray() # converting to a dense array ie. NumPy array

In [None]:
# One-hot encode Embarked: one indicator column per distinct value.
embarked_1hot_encoded = pd.get_dummies(titanic_cat)

In [None]:
# Swap the raw Embarked column for its one-hot encoded indicator columns.
titanic = pd.concat(
    [titanic.drop(columns="Embarked"), embarked_1hot_encoded],
    axis=1,
)

titanic.head()

## Splitting the dataset into X_train and y_train

In [None]:
# Separate the target column from the feature columns.
y_train = titanic.loc[:, "Survived"]  # labels (0 or 1)
X_train = titanic.drop(columns="Survived")

X_train.shape

In [None]:
# Display the target Series (the Survived column of the training frame).
y_train

In [None]:
# Show the last rows of the fully preprocessed training frame.
titanic.tail()

## Preparing the test set

In [None]:
# X_test is a second name for titanic_df_test here; no copy is made.
X_test = titanic_df_test

In [None]:
# Column dtypes and non-null counts of the raw test frame.
X_test.info()

In [None]:
# Build the test feature matrix with the same steps applied to the training data.
# Starting from the untouched titanic_df_test (rather than the current X_test)
# lets this cell be re-run without failing on already-dropped columns.
X_test = titanic_df_test.dropna(subset=["Fare"])  # rows without a Fare are removed

# Retained rows with all original columns (including Name); used later to show
# predictions next to the passenger details.
X_test_dropna_ = X_test

X_test = X_test.drop(columns=["Cabin", "Ticket", "Name", "PassengerId"])

# Impute with the statistic learned from the training data, not from the test
# set, so both sets go through the same transformation.
train_age_median = X_train["Age"].median()
X_test["Age"] = X_test["Age"].fillna(train_age_median)

X_test["Sex"] = X_test["Sex"].map({"male": 1, "female": 0})

test_cat = X_test[["Embarked"]]
test_embarked_1hot_encoded = pd.get_dummies(test_cat)

X_test = pd.concat(
    [X_test.drop(columns="Embarked"), test_embarked_1hot_encoded],
    axis=1,
)

# Match the training columns exactly (same set, same order). An Embarked value
# that is absent from the test set becomes an all-zero indicator column.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

X_test.head()

In [None]:
# Re-check dtypes and non-null counts after preparing the test features.
X_test.info()

## Training using SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Boolean target: True for passengers who survived.
y_train_true = (y_train == 1)

# SGD is sensitive to feature scale, and Age/Fare span a much wider range than
# the 0/1 indicator columns. Standardising inside a pipeline means predict()
# applies the same scaling that was learned during fit().
sgd_clf = make_pipeline(StandardScaler(), SGDClassifier(random_state=42))
sgd_clf.fit(X_train, y_train_true)

In [None]:
# Predictions for the same rows the classifier was fitted on (training-set predictions).
y_pred = sgd_clf.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluation
# y_pred was produced from X_train, so these are training-set scores and will
# be optimistic compared with performance on unseen passengers.
print("Accuracy:", accuracy_score(y_train, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred))
print("Classification Report:\n", classification_report(y_train, y_pred))

## Training using Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the whole training set; fit() returns the
# fitted estimator, so construction and fitting are chained.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the rows the forest was fitted on.
y_pred = rf_clf.predict(X_train)

In [None]:
# Evaluation: report each metric for y_pred against the training labels.
for heading, result in [
    ("Accuracy:", accuracy_score(y_train, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_train, y_pred)),
    ("Classification Report:\n", classification_report(y_train, y_pred)),
]:
    print(heading, result)

In [None]:
# Predict survival for every row of the prepared test features.
rf_predictions = rf_clf.predict(X_test)

## Evaluating on a held-out validation set

In [None]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data to estimate performance on unseen rows.
X_train_new, X_val, y_train_new, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit an unfitted copy with the same hyper-parameters. Refitting rf_clf itself
# would replace the model trained on the full training set, which other cells
# use for test-set predictions and feature importances.
rf_val_clf = clone(rf_clf)
rf_val_clf.fit(X_train_new, y_train_new)
y_val_pred = rf_val_clf.predict(X_val)


# Evaluation
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))

In [None]:
import matplotlib.pyplot as plt

# Label the forest's feature_importances_ with the feature names and chart the
# ten largest as horizontal bars.
feat_importances = pd.Series(rf_clf.feature_importances_, index=X_train.columns)
ax = feat_importances.nlargest(10).plot.barh()
ax.set_title("Top Feature Importances")
plt.show()

## Predicting survival using the test set

In [None]:
## predicting using the test set

# n = int(input())
n = 66  # position of the row in X_test to inspect

# Double brackets keep the row as a one-row DataFrame, as predict() expects.
single_test_row_df = X_test.iloc[[n]]

predicted_class = rf_clf.predict(single_test_row_df)

# Rows were removed from X_test, so position n and index label n are not always
# the same passenger. Look the name up by the selected row's own index label.
passenger_label = single_test_row_df.index[0]
passenger_name = titanic_df_test.loc[passenger_label, "Name"]

# predict() returns an array; compare its single element, not the array itself.
if predicted_class[0] == 1:
    print(f"{passenger_name} is one of the Survivors")
else:
    print(f"{passenger_name} did not Survive")

In [None]:
rf_pred = rf_predictions

# Attach each prediction to its passenger by giving the predictions the same
# index as the rows they were computed from. Concatenating a freshly indexed
# (0..n-1) frame instead would shift predictions onto the wrong passengers
# wherever rows had been dropped. A length mismatch raises a ValueError.
survived_prediction = pd.Series(rf_predictions, index=X_test_dropna_.index).map({1: True, 0: False})

X_pred = X_test_dropna_.copy()
X_pred["Survived (prediction)"] = survived_prediction

X_pred.head()