In [None]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the three input CSVs from the data/ folder: the training set, the
# test set, and the file that is later joined onto the test set by PassengerId.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
gender_df = pd.read_csv("data/gender_submission.csv")

In [None]:
# Checking for null values: info() lists each column's non-null count and dtype
train_df.info()

In [None]:
# Summary statistics of the training dataset
train_df.describe()

In [None]:
# Detailed look at what is actually missing in the training data:
# per-column null count and percentage of rows, most-affected columns first.
null_counts = train_df.isnull().sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(train_df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

In [None]:
# Join gender_df onto the test passengers by PassengerId so the test frame
# also carries the Survived column that is used for scoring later.
test_merged_df = pd.merge(test_df, gender_df, on='PassengerId')

Dropped Columns:

In [None]:
# Columns that are not needed: remove Ticket from both frames
train_df = train_df.drop(columns=['Ticket'])
test_merged_df = test_merged_df.drop(columns=['Ticket'])

In [None]:
# Remove Cabin from both frames
train_df = train_df.drop(columns=['Cabin'])
test_merged_df = test_merged_df.drop(columns=['Cabin'])

In [None]:
# Remove Name from both frames
train_df = train_df.drop(columns=['Name'])
test_merged_df = test_merged_df.drop(columns=['Name'])

In [None]:
# Drop the rows whose Age is missing.
# Each frame is filtered with a mask built from ITSELF: a mask taken from
# test_df would only select the right rows of test_merged_df while both
# frames happen to share an identical index.
# .copy() turns the filtered results into independent frames, so assigning
# columns on them later does not trigger SettingWithCopyWarning.
train_df = train_df[train_df['Age'].notna()].copy()
test_merged_df = test_merged_df[test_merged_df['Age'].notna()].copy()

In [None]:
# #Making ports numeric
# ports = {"S": 0, "C": 1, "Q": 2}
# data = [train_df, test_merged_df]

# for dataset in data:
#     dataset['Embarked'] = dataset['Embarked'].map(ports)

In [None]:
# #Embarked feature has only 2 missing values, filled in with most common
# common_value = 'S'
# data = [train_df, test_merged_df]

# for dataset in data:
#     dataset['Embarked'] = dataset['Embarked'].fillna(common_value)

In [None]:
# Remove Embarked from both frames
train_df = train_df.drop(columns=['Embarked'])
test_merged_df = test_merged_df.drop(columns=['Embarked'])

In [None]:
#Checking dataframes after dropping columns and merging

In [None]:
# Remaining columns, non-null counts and dtypes of the training frame
train_df.info()

In [None]:
# Remaining columns, non-null counts and dtypes of the merged test frame
test_merged_df.info()

In [None]:
# First rows of the training frame
train_df.head()

In [None]:
# First rows of the merged test frame
test_merged_df.head()

In [None]:
# Fare: replace missing values with 0, then convert the column to integer
data = [train_df, test_merged_df]

for frame in data:
    frame['Fare'] = frame['Fare'].fillna(0).astype(int)

In [None]:
# Display the list holding both frames.
# NOTE(review): this renders the repr of two whole DataFrames; consider
# showing .head() of each instead to keep the output short.
data

In [None]:
# Make Sex binary: male -> 0, female -> 1.
# NOTE: not idempotent. Running this cell a second time maps the already
# encoded 0/1 values (which are not keys of `genders`) to NaN.
genders = {"male": 0, "female": 1}
data = [train_df, test_merged_df]

for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)

In [None]:
# Separate the features (X) from the Survived label (Y) for both frames.
# PassengerId and Survived are excluded from the feature matrices.
non_feature_cols = ["PassengerId", "Survived"]

X_train = train_df.drop(columns=non_feature_cols)
Y_train = train_df["Survived"]

X_test = test_merged_df.drop(columns=non_feature_cols).copy()
Y_test = test_merged_df["Survived"]

# Show the four shapes as a quick sanity check
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

In [None]:
# Keep the feature column labels; they are paired with the random forest's
# feature importances further down.
feature_names = X_test.columns
X_test.head()

In [None]:
# from sklearn.preprocessing import StandardScaler
# X_scaler = StandardScaler().fit(X_train)
# Y_scaler = StandardScaler().fit(Y_train)

In [None]:
# X_train_scaled = X_scaler.transform(X_train)
# Y_train_scaled = Y_scaler.transform(Y_train)
# X_test_scaled = X_scaler.transform(X_test)
# Y_test_scaled = Y_scaler.transform(Y_test)

# Model Building and Training

Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression allowed up to 1000 solver iterations; fit() returns the
# estimator itself, so construction and training are chained.
classifier = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
classifier

In [None]:
# Logistic regression predictions for the test features
prediction = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the logistic regression predictions on the test set, in percent
logistic_regression = 100 * accuracy_score(Y_test, prediction)
logistic_regression

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the forest, its score and its feature importances
# reproducible across Restart & Run All.
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42)
random_forest.fit(X_train, Y_train)

# Score on the held-out test data, like every other model that goes into the
# comparison table. Scoring on the data the forest was trained on gives an
# optimistic number that cannot be compared with the others.
ran_forest = random_forest.score(X_test, Y_test)*100

In [None]:
# Random forest score, in percent
ran_forest

In [None]:
# Feature importances of the fitted forest, one value per feature column
importances = random_forest.feature_importances_
importances

In [None]:
# Pair every importance with its feature name and order the pairs from the
# most to the least important feature.
ranked_features = list(zip(random_forest.feature_importances_, feature_names))
ranked_features.sort(reverse=True)
ranked_features

In [None]:
import seaborn as sns

# Importances indexed by feature name, largest first
feature_imp = pd.Series(random_forest.feature_importances_, index=feature_names).sort_values(ascending=False)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
fig.tight_layout()

Support Vector Machine Classifier

In [None]:
from sklearn.svm import SVC

# The fitted model gets its own name. Binding the instance to `SVC` would
# replace the imported class with an object, so no later cell could build
# another SVC without re-importing it.
# The classifier is created with its default settings (no kernel is passed).
svc_model = SVC()
svc_model.fit(X_train, Y_train)
predictions = svc_model.predict(X_test)

In [None]:
 # Accuracy of the SVM predictions on the test set, in percent
from sklearn.metrics import accuracy_score
SVCAC = accuracy_score(Y_test,predictions)*100
SVCAC

K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Odd neighbour counts 1, 3, ..., 19. Defined once so the training loop and
# both plotted curves are guaranteed to use the same x values.
k_values = range(1, 20, 2)

# Accuracy on the training and on the test data for every k
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    train_score = knn.score(X_train, Y_train)
    test_score = knn.score(X_test, Y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves are labelled so the figure can be read on its own. The y axis
# shows training AND testing accuracy, so it gets a neutral label.
plt.plot(k_values, train_scores, marker='o', label="Training accuracy")
plt.plot(k_values, test_scores, marker="x", label="Testing accuracy")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Refit with k = 9 and keep its test-set accuracy in percent.
# `knn` ends up holding that number rather than the model, because the
# results table reads the K Nearest Neighbors score from this name.
knn_model = KNeighborsClassifier(n_neighbors=9)
knn_model.fit(X_train, Y_train)
knn = knn_model.score(X_test, Y_test)*100
knn

Decision Tree Model

In [None]:
from sklearn import tree

# A fixed seed keeps the fitted tree, and therefore its predictions and
# score, the same on every run of the notebook.
decision_tree = tree.DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)

# Decision tree predictions for the test features (reuses the `prediction` name)
prediction = decision_tree.predict(X_test)

In [None]:
# Accuracy of the decision tree predictions on the test set, in percent.
# Note: this rebinds `decision_tree` from the fitted model to a plain number.
decision_tree = accuracy_score(Y_test,prediction)*100
decision_tree

In [None]:
# Gather every model's percentage score, then rank the models best-first
model_scores = {
    "Logistic Regression": logistic_regression,
    "Random Forest": ran_forest,
    "Support Vector Machine": SVCAC,
    "K Nearest Neighbors": knn,
    "Decision Tree": decision_tree,
}
results = pd.DataFrame({
    "Model": list(model_scores.keys()),
    "Score": list(model_scores.values()),
})

results_df = results.sort_values(by="Score", ascending=False).set_index("Model")
results_df


# Random Forest Predictions

In [None]:
### First example passenger to predict (Pclass = 1, 2, or 3)
Pclass = 3
Sex = 0      # encoded with the genders mapping: 0 = male, 1 = female
Age = 22
SibSp = 1
Parch = 0
Fare = 7

# A single row with the values in the order Pclass, Sex, Age, SibSp, Parch, Fare.
# NOTE(review): presumably this is the column order of X_train; confirm against feature_names.
person_row = [Pclass, Sex, Age, SibSp, Parch, Fare]
new_person = np.array([person_row])
new_person

In [None]:
# Second example passenger to predict
Pclass = 3
Age = 45
SibSp = 1
Parch = 2
Fare = 20
Sex = 0      # encoded with the genders mapping: 0 = male, 1 = female

# A single row with the values in the order Pclass, Sex, Age, SibSp, Parch, Fare
new_person2 = np.array([[Pclass, Sex, Age, SibSp, Parch, Fare]])

# Display the array built in THIS cell (new_person2, not new_person)
new_person2

In [None]:
# Random forest prediction for the first example passenger.
# A predicted label of 0 is reported as "perish", anything else as "survive".
rf_predictions_1 = random_forest.predict(new_person)
survived_1 = rf_predictions_1 != 0
print(
    f"This person would probably survive on the Titanic"
    if survived_1
    else f"This person would probably perish on the Titanic"
)

In [None]:
# Raw predicted label array for the first example passenger
print(rf_predictions_1)

In [None]:
# Random forest prediction for the second example passenger.
# A predicted label of 0 is reported as "perish", anything else as "survive".
rf_predictions_2 = random_forest.predict(new_person2)
survived_2 = rf_predictions_2 != 0
print(
    f"This person would probably survive on the Titanic"
    if survived_2
    else f"This person would probably perish on the Titanic"
)

In [None]:
# Raw predicted label array for the second example passenger
print(rf_predictions_2)