# Titanic Passenger Survival

# import necessary libraries

In [None]:

import numpy as np                 
import scipy.stats                
import csv                         
import pandas as pd                
import matplotlib.pyplot as plt    
import math
import seaborn as sns
from scipy.stats import chi2_contingency
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Data Collection and Data Analysis

In [None]:
# Load the Kaggle Titanic training split, using the PassengerId column as the row index.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv", index_col='PassengerId')
# Load the test split. No index_col is given here, so PassengerId stays an
# ordinary column (the submission cell at the end reads test_data['PassengerId']).
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Preview the first 15 training rows (rendered as the cell output in a notebook).
train_data.head(15)

In [None]:
# Preview the first 15 rows of the test split.
test_data.head(15)

We can begin by removing certain features that don't contribute to the relationships we are trying to establish. For example, the TicketId is unique to each passenger and doesn't provide any meaningful insights. Although we could categorize ticket values by their last digit and assign each passenger to a bucket ranging from 0 to 9, for this report, we will ignore the TicketId.

In [None]:
# Summary statistics for every training column; include="all" also covers non-numeric columns.
train_data.describe(include="all")

In [None]:
# Same summary for the test split, to compare counts and spot missing values.
test_data.describe(include="all")

From the summary tables above, we can gather several insights about the data. Firstly, the training dataset has missing entries for passenger ages; these gaps need to be filled so that the missing values do not distort the analysis. Similarly, the test dataset has missing values for Age and one missing value for Fare. These issues must be resolved before processing the data.

Secondly, the tables provide summary statistics for each feature. For instance, the average survival rate in the training dataset is approximately 38.38%, with a high standard deviation of about 0.4865. The mean of Pclass (1, 2, 3) is around 2.265, indicating a higher proportion of passengers in the second and third classes. These statistics will help us identify which features are crucial for our analysis.

Additionally, the mean age is around 29.69 with a standard deviation of approximately 14.18, showing significant deviation from the mean. SibSp, representing the number of siblings or spouses aboard, has a mean of about 0.44, while Parch, indicating the number of parents or children aboard, is also noteworthy. The Fare feature, representing the amount paid by passengers, has a mean of around 35.627. Other features are non-numerical. Furthermore, there are null values in the Age, Cabin, and Embarked sections, which need to be addressed.

In [None]:
# Print the dtype of each training column to see which ones are non-numeric.
print(train_data.dtypes)


The relationship between the embarkation point, class, and survival provides valuable insights into patterns that might exist among these variables. By analyzing this plot, we can identify any correlations between where passengers boarded, their class, and their survival outcomes.

# Testing Feature relation to Survival rates:

During this process, we will first identify the features that significantly impact the survival rate of the passengers. Then, we will clean the dataset to retain only these relevant features for further analysis

# Did the class of the passengers affect their survival rate?


In [None]:
# Visualization: count passengers for every (class, outcome) combination.
pivot_class_survived = train_data.pivot_table(
    index='Pclass',
    columns='Survived',
    aggfunc='size',
    fill_value=0,
)

# Survival rate by class = survivors / everyone travelling in that class.
class_totals = pivot_class_survived[0] + pivot_class_survived[1]
pivot_class_survived['Survival Rate'] = pivot_class_survived[1] / class_totals
print(pivot_class_survived)

# Bar chart of the per-class survival rate.
rate_by_class = pivot_class_survived['Survival Rate']
rate_by_class.plot(kind='bar', color='skyblue')
plt.title('Survival Rate by Passenger Class')
plt.ylabel('Survival Rate')
plt.show()

# Chi-square test of independence on the raw counts only
# (the derived 'Survival Rate' column is left out of the test).
observed_counts = pivot_class_survived[[0, 1]]
chi2, p, dof, ex = chi2_contingency(observed_counts)
print(f"Chi-Square Statistic: {chi2}, p-value: {p}")


The above observation is significant as it indicates that passengers in higher classes had a better chance of survival compared to those in lower classes. The p-value is less than 0.05, which is the threshold for statistical significance, indicating a strong relationship. Additionally, the high chi-square value suggests a significant deviation from the null hypothesis, which posits that there is no relationship between class and survival rate.

**Does sex influence the survival rate? Given that P-class impacts survival rates, we can further analyze each class by separating passengers based on gender and calculating their respective survival rates**

In [None]:
# Mean of the 0/1 Survived flag (i.e. the survival rate) for every class/sex combination.
train_data[['Survived', 'Sex', 'Pclass']].groupby(['Pclass', 'Sex']).mean()

The averages for each class reveal that upper-class females had the highest survival rate. Additionally, the data indicates that females, in general, had a higher survival rate compared to their male counterparts.

In [None]:
# Observed counts: one row per sex, one column per survival outcome.
contingency_table = pd.crosstab(index=train_data['Sex'], columns=train_data['Survived'])

# Chi-square test of independence between sex and survival.
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Report the statistic and its p-value, one per line.
print(f"Chi-Square Statistic: {chi2}", f"p-value: {p}", sep="\n")

There is an even higher Chi-square value and a lower p-value, indicating a stronger correlation with the survival rate compared to the feature Pclass.

In [None]:
# Survival rate per (class, sex) pair, flattened back into plain columns so
# seaborn can address them by name.
survival_subset = train_data[['Survived', 'Sex', 'Pclass']]
grouped_data = survival_subset.groupby(['Pclass', 'Sex']).mean().reset_index()

# Grouped bar chart: one cluster per class, one bar per sex.
plt.figure(figsize=(10, 6))
sns.barplot(data=grouped_data, x='Pclass', y='Survived', hue='Sex')

plt.xlabel('Passenger Class')
plt.ylabel('Survival Rate')
plt.title('Survival Rate by Passenger Class and Sex')
plt.legend(title='Sex')

plt.show()

# Does age contribute to the survival rates?


The data is already in a form we can analyze, so we start by visualizing how each age group contributes to survival rates.

In [None]:
# Age distributions split by sex (grid rows) and survival outcome (grid columns).
# Expects train_data to contain the 'Survived', 'Sex' and 'Age' columns.
#
# FacetGrid creates its own figure, so the size is set through height/aspect.
# A preceding plt.figure(figsize=...) call would not size the grid; it would
# only leave an empty extra figure behind. height=5 on the 2x2 grid gives
# roughly the 10x10 inch canvas that was originally requested.
g = sns.FacetGrid(train_data, row='Sex', col='Survived', height=5, aspect=1)

# density=True normalizes each histogram, so panels with different passenger
# counts can be compared by shape.
g.map(plt.hist, 'Age', bins=30, density=True)

# Adding labels and titles for clarity
g.set_xlabels('Age')
g.set_ylabels('Density')
g.set_titles('Sex: {row_name}, Survived: {col_name}')

# Display the plot
plt.show()

In [None]:
# Age bucket edges and the label of each bucket; adjust as you see fit.
age_bins = [0, 18, 30, 40, 50, 60, 120]
age_labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '60+']

# Create a new column for the age groups.
# The buckets are closed on the right so they match their labels:
# [0, 18], (18, 30], (30, 40], (40, 50], (50, 60], (60, 120].
# With right=False an 18-year-old would fall into '19-30' and a 30-year-old
# into '31-40', contradicting the labels. include_lowest=True keeps age 0 in
# the first bucket. Passengers with a missing Age get a missing AgeGroup.
train_data['AgeGroup'] = pd.cut(
    train_data['Age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

# Create the contingency table (rows with a missing AgeGroup are not counted).
contingency_table = pd.crosstab(train_data['AgeGroup'], train_data['Survived'])

# Apply the Chi-Square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Output the results
print(f"Chi-Square Statistic: {chi2}")
print(f"p-value: {p_value}")

In [None]:
# Tally passengers by sex once and read both counts from the same result.
sex_counts = train_data['Sex'].value_counts()
num_females = sex_counts['female']
num_males = sex_counts['male']

print(f"Number of females: {num_females}", f"Number of males: {num_males}", sep="\n")

From the above observations, it is evident that there is a relationship between age and survival, but it appears to be less significant. When ages are categorized into buckets and subjected to chi-square analysis with associated p-values, the results indicate a correlation between age and survival rate. However, this relationship is not as pronounced as the other features we have identified thus far.

# Do passengers having siblings or a spouse influence their survival rate?

In [None]:
# Observed counts: one row per SibSp value (siblings/spouses aboard),
# one column per survival outcome.
contingency_table = pd.crosstab(index=train_data['SibSp'], columns=train_data['Survived'])

# Chi-square test of independence between SibSp and survival.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Report the statistic and its p-value, one per line.
print(f"Chi-Square Statistic: {chi2}", f"p-value: {p_value}", sep="\n")

# Does the presence of parents influence the survival rate?

In [None]:
# Observed counts: one row per Parch value (parents/children aboard),
# one column per survival outcome.
contingency_table = pd.crosstab(index=train_data['Parch'], columns=train_data['Survived'])

# Chi-square test of independence between Parch and survival.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Report the statistic and its p-value, one per line.
print(f"Chi-Square Statistic: {chi2}", f"p-value: {p_value}", sep="\n")

# Fare Rate influence on Survival rates

In [None]:
# Define fare bins and labels
fare_bins = [0, 25, 50, 100, 200, np.inf]
fare_labels = ['0-25', '26-50', '51-100', '101-200', '200+']

# Bucket every fare. pd.cut builds right-closed intervals and leaves the first
# one open on the left by default, so a fare of exactly 0 would match no bucket
# and drop out of the analysis. include_lowest=True makes the first bucket
# [0, 25], as its label says.
train_data['FareBin'] = pd.cut(
    train_data['Fare'],
    bins=fare_bins,
    labels=fare_labels,
    include_lowest=True,
)

# Create the contingency table
contingency_table_fare = pd.crosstab(train_data['FareBin'], train_data['Survived'])

# Perform the Chi-Square Test
chi2_fare, p_value_fare, dof_fare, expected_fare = chi2_contingency(contingency_table_fare)
print(f"Chi-Square Statistic for Fare: {chi2_fare}")
print(f"p-value for Fare: {p_value_fare}")

In [None]:
# Percentage of survivors inside each fare bucket: survivors / bucket size * 100.
passengers_per_bin = contingency_table_fare.sum(axis=1)
survival_rate_by_fare = (contingency_table_fare[1] / passengers_per_bin) * 100

# Bar plot with one bar per fare bucket.
plt.figure(figsize=(10, 6))
sns.barplot(x=survival_rate_by_fare.index, y=survival_rate_by_fare.values)
plt.xlabel('Fare Bins')
plt.ylabel('Survival Rate (%)')
plt.title('Survival Rate by Fare Bins')
plt.show()

# Does where a passenger embarked influence their survival rate?

In [None]:
# Observed counts: one row per embarkation port, one column per survival outcome.
contingency_table_embarked = pd.crosstab(
    index=train_data['Embarked'],
    columns=train_data['Survived'],
)

# Chi-square test of independence between embarkation port and survival.
embarked_test = chi2_contingency(contingency_table_embarked)
chi2_embarked, p_value_embarked, dof_embarked, expected_embarked = embarked_test

# Report the statistic and its p-value, one per line.
print(
    f"Chi-Square Statistic for Embarked: {chi2_embarked}",
    f"p-value for Embarked: {p_value_embarked}",
    sep="\n",
)

In [None]:
# Two count plots per embarkation port: split by passenger class, then by outcome.
#
# The hue values are mapped to readable names before plotting so that seaborn
# builds the legend from the data itself. Calling plt.legend(labels=[...])
# afterwards pairs the labels with artists purely by drawing order, which can
# attach the wrong colour to a label.
class_labels = {1: '1st Class', 2: '2nd Class', 3: '3rd Class'}
class_plot_data = train_data.assign(PclassLabel=train_data['Pclass'].map(class_labels))

plt.figure(figsize=(12, 8))
ax = sns.countplot(
    x='Embarked',
    hue='PclassLabel',
    hue_order=list(class_labels.values()),  # keep 1st/2nd/3rd order in the legend
    data=class_plot_data,
)

plt.title('Passenger Count by Embarkation Point and Class')
plt.xlabel('Embarkation Point')
plt.ylabel('Count')
# Reposition and retitle the legend seaborn created, keeping its handles intact.
sns.move_legend(ax, 'upper right', title='Passenger Class')
plt.show()

outcome_labels = {0: 'No', 1: 'Yes'}
outcome_plot_data = train_data.assign(SurvivedLabel=train_data['Survived'].map(outcome_labels))

plt.figure(figsize=(12, 8))
ax = sns.countplot(
    x='Embarked',
    hue='SurvivedLabel',
    hue_order=list(outcome_labels.values()),
    data=outcome_plot_data,
)

# The bars are passenger counts, not rates, so the title says so.
plt.title('Survival Count by Embarkation Point')
plt.xlabel('Embarkation Point')
plt.ylabel('Count')
sns.move_legend(ax, 'upper right', title='Survived')
plt.show()

# Data Cleaning Checklist:
# To-do List:
# 
# 1-Fill in missing values for Age.
# 2-Prepare Embarkment data for insertion into models by categorizing it.
# 3-Encode gender: Male as 1 and Female as 0.

In [None]:
# Clean the data:

# Encode sex numerically: female -> 0, male -> 1.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Fill missing ages with the median age. The result is assigned back to the
# column: calling fillna(..., inplace=True) on a column selection is a chained
# assignment, which pandas deprecates and which has no effect on the frame
# once copy-on-write is enabled.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# One-hot encode the embarkation port into Embarked_* columns.
# Rows with a missing Embarked value get 0/False in every Embarked_* column.
embarked_dummies = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
train_data = pd.concat([train_data, embarked_dummies], axis=1)

# Drop the original Embarked column (now encoded) together with the free-text
# columns that are not used as features.
train_data = train_data.drop(['Embarked', 'Ticket', 'Name', 'Cabin'], axis=1)

In [None]:
# Summary statistics again, this time for the cleaned training frame.
train_data.describe(include="all")

In [None]:
# Print the column dtypes after cleaning to check which columns are still non-numeric.
print(train_data.dtypes)

We note the remaining categorical columns, which will need to be encoded before they can be used in any model.

# Correlation Matrix
Now that we've analyzed and cleaned the data, let's gain a comprehensive overview of correlations using a heatmap:

In [None]:
# Keep only numeric and boolean columns; other dtypes are left out of the matrix.
numeric_cols = train_data.select_dtypes(include=[np.number, bool])

# Pairwise correlation between every remaining pair of columns.
correlation_matrix = numeric_cols.corr()

# Annotated heatmap: each cell shows the coefficient rounded to two decimals.
heatmap_style = {
    'annot': True,
    'fmt': ".2f",
    'cmap': 'coolwarm',
    'square': True,
    'linewidths': .5,
}
plt.figure(figsize=(10, 10))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Correlation Matrix Heatmap')
plt.show()

# 
# Training
# Using scikit-learn's built-in models.

In [None]:
# Baseline model: scikit-learn logistic regression on the raw numeric features.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = train_data[features]
y = train_data['Survived']

# Hold out 10% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Train the model on the new training data.
# The features are not scaled (Age and Fare span a much wider range than the
# rest), so the solver's default of 100 iterations can end before convergence.
# A larger budget lets it finish.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Validate the model on the new validation set
predictions = model.predict(X_val)
accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy: {accuracy}")

Our Implementation of Logistic Regression:
We use the sigmoid function to map the linear score z = Xθ into the range (0, 1). To maximize the log-likelihood we compute its gradient, which is the dot product of X.T and the prediction error (y - sigmoid(z)). After obtaining this gradient, we move θ a small step in its direction and repeat the process for a fixed number of iterations.

In [None]:

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``z``, with every value in [0, 1].
    """
    # Ensure z is a NumPy array to handle operations correctly
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow. Evaluating
    # exp(-z) directly overflows for large negative z.
    decay = np.exp(-np.abs(z))
    # Two algebraically equal forms of the sigmoid, each used where it is stable:
    #   z >= 0: 1 / (1 + exp(-z))
    #   z <  0: exp(z) / (1 + exp(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] unwraps a 0-d result into a NumPy scalar and leaves real arrays as they are.
    return result[()]

def log_likelihood(X, y, theta):
    """Return the Bernoulli log-likelihood of labels ``y`` under weights ``theta``.

    Args:
        X: Feature matrix; its product with ``theta`` is the linear score.
        y: 0/1 labels, broadcastable against the predicted probabilities.
        theta: Weight vector.

    Returns:
        The sum over all samples of y*log(p) + (1-y)*log(1-p).
    """
    eps = 1e-9  # keeps both logarithms away from log(0)
    prob = sigmoid(np.dot(X, theta))
    per_sample = y * np.log(prob + eps) + (1 - y) * np.log(1 - prob + eps)
    return np.sum(per_sample)

def compute_gradient(X, y, theta):
    """Return the gradient of the log-likelihood with respect to ``theta``.

    The gradient is X.T @ (y - sigmoid(X @ theta)), summed over all samples
    (it is not divided by the number of samples).
    """
    residual = y - sigmoid(np.dot(X, theta))  # prediction error per sample
    return np.dot(X.T, residual)


def gradient_descent(X, y, theta, learning_rate, num_iterations):
    """Fit logistic-regression weights by repeated gradient steps.

    Despite the name, each step *adds* the gradient returned by
    ``compute_gradient`` (the log-likelihood gradient), so the likelihood is
    maximized.

    Args:
        X: Feature matrix.
        y: Labels, shaped so they broadcast against ``X @ theta``.
        theta: Initial weights. Not modified; a float copy is updated instead.
        learning_rate: Step size applied to the summed gradient.
        num_iterations: Number of update steps to perform.

    Returns:
        The updated weights as a new float array with the shape of ``theta``.
    """
    # Work on a float copy: the caller's array stays untouched, and an integer
    # start vector no longer makes the in-place float update fail.
    theta = np.array(theta, dtype=float)
    for _ in range(num_iterations):
        theta += learning_rate * compute_gradient(X, y, theta)
    return theta

def predict(X, theta):
    """Return the predicted probability sigmoid(X @ theta) for every row of ``X``.

    The values are probabilities, not class labels; callers apply their own threshold.
    """
    return sigmoid(np.dot(X, theta))

In [None]:
# Train the from-scratch logistic regression on the engineered feature set.
# Relies on names from earlier cells: the cleaned train_data (with AgeGroup and
# FareBin columns), gradient_descent() and the logistic predict().

# Model inputs: raw columns, the Embarked one-hot columns and the AgeGroup/FareBin
# dummies created just below (the first level of each is dropped as the baseline).
feature_names = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'AgeGroup_19-30', 'AgeGroup_31-40', 'AgeGroup_41-50', 'AgeGroup_51-60', 'AgeGroup_60+',
    'FareBin_26-50', 'FareBin_51-100', 'FareBin_101-200', 'FareBin_200+'
]

# NOTE(review): AgeGroup was binned in an earlier cell, before the missing ages
# were imputed. Those passengers therefore get all-zero AgeGroup dummies and look
# like the dropped '0-18' baseline. Confirm this is intended.
train_data = pd.get_dummies(train_data, columns=['AgeGroup', 'FareBin'], drop_first=True)

# Prepare data
X = train_data[feature_names].values
y = train_data['Survived'].values
X = np.hstack((np.ones((X.shape[0], 1)), X))  # Add intercept
X = np.asarray(X, dtype=np.float64)

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Standardize every non-intercept column using training-set statistics only.
# On the raw scale (Age up to ~80, Fare in the hundreds) the very first update
# saturates the sigmoid and the iterates never settle.
feature_mean = X_train[:, 1:].mean(axis=0)
feature_std = X_train[:, 1:].std(axis=0)
feature_std[feature_std == 0] = 1.0  # constant column: leave its scale alone

X_train_scaled = X_train.copy()
X_train_scaled[:, 1:] = (X_train[:, 1:] - feature_mean) / feature_std
X_val_scaled = X_val.copy()
X_val_scaled[:, 1:] = (X_val[:, 1:] - feature_mean) / feature_std

# Train the model
theta = np.zeros((X_train_scaled.shape[1], 1))
# compute_gradient sums over all samples, so the step is divided by the sample
# count. With standardized columns a per-sample step of 0.1 stays below the
# stability limit of gradient ascent for the logistic likelihood.
learning_rate = 0.1 / X_train_scaled.shape[0]
num_iterations = 8000
theta_final = gradient_descent(
    X_train_scaled, y_train.reshape(-1, 1), theta, learning_rate, num_iterations
)

# Predict and validate
probabilities = predict(X_val_scaled, theta_final)
# Flatten the (n, 1) column so the predictions have the same 1-D shape as y_val.
predictions = (probabilities > 0.5).astype(int).ravel()
accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy: {accuracy}")

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree: fit() returns the estimator itself, so construction and
# training are chained. The fixed seed keeps the tree reproducible.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the fitted tree on the held-out validation rows.
tree_predictions = tree_model.predict(X_val)
tree_accuracy = accuracy_score(y_true=y_val, y_pred=tree_predictions)
print(f"Decision Tree Validation Accuracy Sk-learn: {tree_accuracy}")

# Decision Tree from scratch:

In [None]:
# Data preparation for the from-scratch decision tree.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Load datasets (fresh copy of the raw training split; PassengerId stays a column here)
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')

# Preprocessing
# Fill missing values. The filled column is assigned back: calling
# fillna(..., inplace=True) on a column selection is a chained assignment,
# which pandas deprecates and which has no effect on the frame under copy-on-write.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
# 'S' is used because it is the majority port, so the fill has the least impact on accuracy.
train_data['Embarked'] = train_data['Embarked'].fillna('S')

# Convert categorical variables to numeric.
# Note: this encoding (male=0, female=1) is the reverse of the one used in the
# earlier cleaning cell (female=0, male=1).
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
train_data = pd.get_dummies(train_data, columns=['Embarked'])   # creates three columns: C, Q, S
# Alternative: map the ports to integer codes instead of one-hot columns.
#train_data['Embarked'] = train_data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})


# Select the model inputs and the target from the prepared frame.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q','Embarked_S']
X = train_data[features]
y = train_data['Survived']

# Split the data into new training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

class Node:
    """One node of the binary decision tree.

    An internal node stores the split (``feature`` index and ``threshold``)
    and its two children. A leaf stores only ``value``, the predicted label.
    ``value`` is keyword-only.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split definition; meaningful for internal nodes only.
        self.feature, self.threshold = feature, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Predicted label; stays None on internal nodes.
        self.value = value

    def is_leaf_node(self):
        """Return True when the node carries a prediction (``value`` is not None)."""
        # Compared against None explicitly so that a label of 0 still counts as a leaf.
        return self.value is not None

def calculate_entropy(y):
    """Return the Shannon entropy (in bits) of the label array ``y``.

    ``y`` is passed to ``np.bincount``, so it must hold non-negative integers.
    """
    class_counts = np.bincount(y)
    proportions = class_counts / len(y)
    # Classes that never occur contribute nothing and would give log2(0).
    present = proportions[proportions > 0]
    return -np.sum(present * np.log2(present))

def best_split(X, y, num_features):
    """Find the (feature, threshold) pair with the highest information gain.

    Candidate thresholds are the midpoints between consecutive distinct values
    of each feature column.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: Label array aligned with the rows of ``X``.
        num_features: Number of leading columns of ``X`` to consider.

    Returns:
        ``(feature_index, threshold)`` of the best split. ``(None, None)`` when
        no feature has two distinct values, which includes empty input.
    """
    best_gain = -1
    split_idx, split_threshold = None, None
    for feature_idx in range(num_features):
        column = X[:, feature_idx]
        # np.unique already returns the distinct values in sorted order, so no
        # separate sort of the (value, label) pairs is needed.
        unique_values = np.unique(column)
        for lower, upper in zip(unique_values[:-1], unique_values[1:]):
            threshold = (lower + upper) / 2
            lhs = y[column <= threshold]
            rhs = y[column > threshold]
            gain = information_gain(y, lhs, rhs)
            # Strict comparison: the first split reaching a given gain wins ties.
            if gain > best_gain:
                best_gain = gain
                split_idx = feature_idx
                split_threshold = threshold
    return split_idx, split_threshold

def information_gain(y, lhs, rhs):
    """Return the entropy reduction from splitting ``y`` into ``lhs`` and ``rhs``.

    A split that leaves either side empty yields a gain of 0.
    """
    entropy_before = calculate_entropy(y)
    n_left, n_right = len(lhs), len(rhs)
    if not (n_left and n_right):
        return 0
    total = len(y)
    # Child entropies weighted by the share of samples each side receives.
    entropy_after = (
        (n_left / total) * calculate_entropy(lhs)
        + (n_right / total) * calculate_entropy(rhs)
    )
    return entropy_before - entropy_after

def build_tree(X, y, depth=0, max_depth=10):
    """Recursively grow a decision tree and return its root ``Node``.

    Growth stops with a majority-label leaf when ``max_depth`` is reached,
    when all labels agree, or when ``best_split`` finds no usable split.

    Args:
        X: 2-D array of samples by features.
        y: Label array aligned with the rows of ``X``.
        depth: Depth of the node being built (0 for the root).
        max_depth: Maximum depth at which splitting is still allowed.
    """
    _, num_features = X.shape

    def majority_leaf():
        # Leaf predicting the most frequent label among the samples that reached here.
        return Node(value=Counter(y).most_common(1)[0][0])

    is_pure = len(np.unique(y)) == 1
    if is_pure or depth >= max_depth:
        return majority_leaf()

    split_idx, threshold = best_split(X, y, num_features)
    if split_idx is None:
        return majority_leaf()

    column = X[:, split_idx]
    goes_left = column <= threshold
    goes_right = column > threshold
    left = build_tree(X[goes_left], y[goes_left], depth + 1, max_depth)
    right = build_tree(X[goes_right], y[goes_right], depth + 1, max_depth)
    return Node(split_idx, threshold, left, right)

def predict(node, x):
    """Return the label the tree rooted at ``node`` assigns to sample ``x``.

    Walks from ``node`` down to a leaf, going left when the tested feature is
    at or below the node's threshold and right otherwise.
    """
    while True:
        if node.is_leaf_node():
            return node.value
        go_left = x[node.feature] <= node.threshold
        node = node.left if go_left else node.right

def decision_tree_predictions(X, tree):
    """Return an array holding the label predicted by ``tree`` for every row of ``X``."""
    labels = [predict(tree, sample) for sample in X]
    return np.array(labels)

# Convert the pandas splits to plain NumPy arrays for the from-scratch tree.
# A float dtype is requested explicitly: the feature frame mixes numeric columns
# with the one-hot Embarked_* columns, and on pandas versions where get_dummies
# returns bool columns a plain to_numpy() would give a slow object-dtype array.
X_train_np = X_train.to_numpy(dtype=float)
y_train_np = y_train.to_numpy()
X_val_np = X_val.to_numpy(dtype=float)

# Build the tree (kept shallow: at most 3 levels of splits)
tree = build_tree(X_train_np, y_train_np, max_depth=3)

# Make predictions
predictions = decision_tree_predictions(X_val_np, tree)
accuracy = accuracy_score(y_val, predictions)
print(f"decision tree implementation Accuracy: {accuracy}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: fit() returns the estimator itself, so construction and
# training are chained. The fixed seed keeps the forest reproducible.
forest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the fitted forest on the held-out validation rows.
forest_predictions = forest_model.predict(X_val)
forest_accuracy = accuracy_score(y_true=y_val, y_pred=forest_predictions)
print(f"Random Forest Validation Accuracy: {forest_accuracy}")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Linear regression used as a classifier: fit it on the 0/1 target, then threshold.
lin_model = LinearRegression().fit(X_train, y_train)

# Continuous scores for the validation rows.
lin_predictions = lin_model.predict(X_val)

# Scores above 0.5 count as "survived" (1); everything else is 0.
lin_predictions_binary = (lin_predictions > 0.5).astype(int)

# Accuracy of the thresholded predictions on the validation set.
lin_accuracy = accuracy_score(y_true=y_val, y_pred=lin_predictions_binary)
print(f"Linear Regression (as classifier) Validation Accuracy: {lin_accuracy}")

In [None]:
# Write the submission file: one row per test passenger with a 0/1 prediction.
# This is an all-zero baseline (every passenger predicted as not surviving).
# NOTE(review): none of the models trained above is used here. Confirm that
# the baseline is what should be submitted.
# The list length comes from the test frame instead of a hard-coded 418, so it
# always matches the PassengerId column.
predictions = [0] * len(test_data)
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})

submission.to_csv('submission.csv', index=False)