In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing

def _load_and_preview(heading, csv_path):
    """Read ``csv_path`` with pandas, print a short preview, and return the frame.

    The preview consists of ``heading``, the first rows rendered as a
    markdown table, and the frame's ``shape`` tuple, in that order.
    """
    frame = pd.read_csv(csv_path)
    print(heading)
    print(frame.head().to_markdown())
    print(frame.shape)
    return frame


# Blank-line separator printed between consecutive dataset previews.
_PREVIEW_GAP = '\n\n'

train_data = _load_and_preview('train_data:', "../datasets/train.csv")

print(_PREVIEW_GAP)

test_data = _load_and_preview('test_data:', "../datasets/test.csv")

print(_PREVIEW_GAP)

gender_submission = _load_and_preview('gender_submission:', "../datasets/gender_submission.csv")

In [None]:
# Survival rate by gender.
#
# Each rate is the mean of the "Survived" column for one sex, i.e. the same
# value as sum/len but computed by pandas (and NaN rather than a
# ZeroDivisionError if a group happens to be empty).
# NOTE(review): assumes "Survived" is coded 0/1 so the mean is a fraction of
# survivors - confirm against the dataset description.
# The rates are kept as fractions; they are only formatted as percentages
# when printed, so the output matches the "% of ..." labels.

# Single .loc[rows, column] lookup instead of chained indexing.
women = train_data.loc[train_data.Sex == 'female', "Survived"]
rate_women = women.mean()

print("% of women who survived:", f"{rate_women:.2%}")

print('\n')

men = train_data.loc[train_data.Sex == 'male', "Survived"]
rate_men = men.mean()

print("% of men who survived:", f"{rate_men:.2%}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Target vector: the "Survived" column of the training set.
y = train_data["Survived"]

# Model inputs. get_dummies one-hot encodes string/categorical columns
# (here presumably only "Sex" - the other columns look numeric; confirm)
# and leaves the remaining columns as they are.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])

# The test set is encoded independently, so a category that appears in only
# one of the two frames would produce a different set/order of columns than
# the model was fitted on. Reindexing onto the training columns adds any
# missing dummy column as all-zero and drops any column unseen in training.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# training the model (random_state fixed so the fit is reproducible)
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)

# making predictions
predictions = model.predict(X_test)
y_pred = pd.Series(predictions)
# NOTE(review): gender_submission.csv looks like a sample submission rather
# than ground-truth labels for the test set - confirm. If so, anything
# computed from y_test measures agreement with that baseline, not accuracy.
# Rows are compared by position, which also assumes gender_submission and
# test_data list passengers in the same order.
y_test = gender_submission["Survived"]

# Write the submission file: one row per test passenger with its prediction.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Submission successfully saved")

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# Evaluation of the fitted model.
# NOTE(review): y_test is read from gender_submission.csv, which may be a
# sample submission rather than true labels - confirm. If so, the accuracy,
# report and confusion matrix below describe agreement with that baseline;
# the cross-validation scores (computed on the training data) are the only
# estimate based on real labels.

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {accuracy:.2f}')

# classification report
# The result is stored under its own name so the imported
# classification_report function is not overwritten by its return value.
report_text = classification_report(y_test, y_pred)
print('classification report:')
print(report_text)

# cross-validation: 5-fold scores of the same estimator on the training data
scores = cross_val_score(model, X, y, cv=5)
print('cross-validation accuracy:')
print(scores)

# confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Stored as `cm` so the imported confusion_matrix function stays callable.
cm = confusion_matrix(y_test, y_pred)
confusion_matrix_df = pd.DataFrame(
    cm,
    index=['Actual not-survived', 'Actual survived'],
    columns=['Predicted not-survived', 'Predicted survived'],
)
plt.figure(figsize=(8, 6))
# fmt='d' prints the cell counts as plain integers.
sns.heatmap(confusion_matrix_df, annot=True, fmt='d', cmap='Blues')
plt.title('confusion_matrix')
plt.show()