In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV Files and Perform Basic Data Cleaning

In [None]:
# Load the training, test and gender-submission CSV files from the local data/ folder.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
gender_df = pd.read_csv("data/gender_submission.csv")

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
train_df.head()

In [None]:
test_df.head()

In [None]:
# Preview only the first rows instead of rendering the whole frame.
gender_df.head()

# Establish the training sets

In [None]:
# Columns used for modelling. 'Survived' is kept for now so that rows with
# missing values can be dropped from features and target together.
training_columns = ['Pclass', 'Sex', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare']
X_train = train_df[training_columns]

In [None]:
# Discard every row that has a missing value in any of the selected columns
X_train = X_train.dropna(axis=0, how='any')

In [None]:
# The target is the 'Survived' column of the cleaned training frame
y_train = X_train.loc[:, 'Survived']

In [None]:
# Turn the target into an (n, 1) column vector. np.asarray accepts both a
# pandas Series and an ndarray, so re-running this cell does not fail the way
# `.values` does once y_train is already an array.
y_train = np.asarray(y_train).reshape(-1, 1)

In [None]:
y_train.shape

In [None]:
# Keep only the predictor columns; the target has already been copied to y_train
predictor_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X_train = X_train[predictor_columns]

In [None]:
X_train.shape

# Clean data for the test sets

In [None]:
# Join the test features with the 'Survived' labels from gender_df on PassengerId.
# NOTE(review): presumably gender_submission.csv is a sample submission rather
# than ground-truth outcomes -- confirm before trusting the test scores.
merged_test_df = pd.merge(test_df, gender_df, on='PassengerId')

In [None]:
# Preview only the first rows of the merged frame.
merged_test_df.head()

In [None]:
# Restrict the merged test frame to the same columns that were kept for training.
test_columns = ['Pclass', 'Sex', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare']
merged_test_df = merged_test_df.loc[:, test_columns]

In [None]:
# Preview only the first rows of the trimmed frame.
merged_test_df.head()

In [None]:
# Discard every test row that has a missing value in any remaining column
merged_test_df = merged_test_df.dropna(axis=0, how='any')

# Establish X and y test sets

In [None]:
# Test predictors: same columns, in the same order, as the training predictors.
test_predictor_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X_test = merged_test_df.loc[:, test_predictor_columns]

In [None]:
X_test.shape

In [None]:
# Test target: the 'Survived' column of the cleaned, merged test frame.
y_test = merged_test_df.loc[:, 'Survived']

In [None]:
# Turn the target into an (n, 1) column vector. np.asarray accepts both a
# pandas Series and an ndarray, so re-running this cell does not fail the way
# `.values` does once y_test is already an array.
y_test = np.asarray(y_test).reshape(-1, 1)

In [None]:
# Sanity check: inspect the shape of the test target array
y_test.shape

# Encode the gender data

In [None]:
# One-hot encode the 'Sex' column of the test predictors.
# A copy is encoded so the frame X_test was sliced from is never touched.
X_test = pd.get_dummies(X_test.copy(), columns=["Sex"])
X_test.shape

In [None]:
X_test.head()

In [None]:
# One-hot encode the 'Sex' column of the training predictors.
# A copy is encoded so the frame X_train was sliced from is never touched.
X_train = pd.get_dummies(X_train.copy(), columns=["Sex"])
X_train.head()

# Scale the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit one StandardScaler per array, using the training data only.
# NOTE(review): y_train holds the class labels; none of the classifiers in the
# visible cells consumes the scaled targets -- confirm y_scaler is needed.
X_scaler, y_scaler = (StandardScaler().fit(values) for values in (X_train, y_train))

In [None]:
# Apply the train-fitted scalers to both the training and the test arrays.
X_train_scaled, X_test_scaled = X_scaler.transform(X_train), X_scaler.transform(X_test)
y_train_scaled, y_test_scaled = y_scaler.transform(y_train), y_scaler.transform(y_test)

# Create, fit and validate the models

Support Vector Machine Classifier (RBF Kernel)

In [None]:
# Support vector machine classifier with an RBF kernel
from sklearn.svm import SVC

# Flatten the (n, 1) target column to 1-D before fitting.
y_train = y_train.ravel()
svm_model = SVC(kernel="rbf")
# Trained on the unscaled (but one-hot encoded) training predictors.
svm_model.fit(X_train, y_train)

In [None]:
# Display names for the report, listed in ascending label order:
# class 0 (did not survive) first, class 1 (survived) second.
target_names = ["die", "survive"]

In [None]:
# Build and print the per-class precision/recall report for the SVM
from sklearn.metrics import classification_report

smv_predictions = svm_model.predict(X_test)
svm_report = classification_report(y_test, smv_predictions, target_names=target_names)
print(svm_report)

In [None]:
from sklearn.metrics import accuracy_score

# SVM test accuracy, expressed as a percentage
svm_accuracy_fraction = accuracy_score(y_test, smv_predictions)
svm = svm_accuracy_fraction * 100
svm

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed keeps the forest, its accuracy and its feature importances
# reproducible across kernel restarts.
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42)
random_forest.fit(X_train, y_train)
# Test accuracy as a percentage
ran_forest = random_forest.score(X_test, y_test)*100

In [None]:
ran_forest

In [None]:
importances = random_forest.feature_importances_
importances

In [None]:
# Take the names from the frame the forest was fitted on, so each name is
# guaranteed to line up with its entry in feature_importances_.
feature_names = X_train.columns

In [None]:
# Pair each importance with its feature name and list them, largest first
importance_by_feature = zip(random_forest.feature_importances_, feature_names)
sorted(importance_by_feature, reverse=True)

In [None]:
import seaborn as sns

# Importances as a labelled Series, largest first, so the bars appear in rank order.
feature_imp = pd.Series(
    random_forest.feature_importances_, index=feature_names
).sort_values(ascending=False)

# Horizontal bar chart of the importances, saved alongside the other figures.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_imp, y=feature_imp.index, palette='hls', ax=ax)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
fig.tight_layout()
fig.savefig("resource/images/Random_Forest.jpg")

Decision Tree Model

In [None]:
from sklearn import tree

# A fixed seed keeps the fitted tree (and its accuracy) reproducible across runs.
decision_tree_classifier = tree.DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(X_train, y_train)
prediction = decision_tree_classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Decision-tree test accuracy, expressed as a percentage
decision_tree_fraction = accuracy_score(y_test, prediction)
decision_tree = decision_tree_fraction * 100
decision_tree

Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Unfitted logistic-regression estimator with the solver capped at 1000 iterations
logreg_classifier = LogisticRegression(max_iter=1_000)
logreg_classifier

In [None]:
# Make sure the training target is a flat 1-D array before fitting
y_train = y_train.ravel()

In [None]:
y_train.shape

In [None]:
# Train the logistic regression on the (unscaled) training predictors
logreg_classifier.fit(X=X_train, y=y_train)

In [None]:
# Class predictions of the logistic regression for every test row
prediction = logreg_classifier.predict(X=X_test)

In [None]:
# Logistic-regression test accuracy, expressed as a percentage
logreg_fraction = accuracy_score(y_test, prediction)
logistic_regression = logreg_fraction * 100
logistic_regression

K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep over odd neighbour counts 1, 3, ..., 19 and record the accuracy of
# each model on the scaled training and test sets.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train.ravel())
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so label each line and use a neutral y-label
# instead of one that names only the test score.
plt.plot(k_values, train_scores, marker='o', label="Training accuracy")
plt.plot(k_values, test_scores, marker="x", label="Testing accuracy")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()

plt.savefig("resource/images/KNN.jpg")

In [None]:
# Final KNN model with 9 neighbours, trained on the scaled predictors
knn_classifier = KNeighborsClassifier(n_neighbors=9)
knn_classifier.fit(X_train_scaled, y_train.ravel())
# Score on the *scaled* test predictors: the model was fitted on scaled
# inputs, so unscaled features would put the distances on a different footing.
knn = knn_classifier.score(X_test_scaled, y_test)*100
knn

In [None]:
# Collect each model's test accuracy (in percent) in one table
model_names = ["Logistic Regression", "Random Forest", "Support Vector Machine", "K Nearest Neighbors", "Decision Tree"]
model_scores = [logistic_regression, ran_forest, svm, knn, decision_tree]
results = pd.DataFrame({"Model": model_names, "Score": model_scores})

# Best model first, indexed by model name
results_df = results.sort_values(by="Score", ascending=False).set_index("Model")
results_df

# Make predictions

In [None]:
# Generate a new data point
import numpy as np

### Prediction person data (Pclass = 1, 2, or 3)
Pclass, Age, SibSp, Parch, Fare = 1, 20, 1, 0, 75
Sex_female, Sex_male = 1, 0

# Single-row 2-D array in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
feature_row = [Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male]
new_person = np.array([feature_row])
new_person

In [None]:
### Prediction person number 2
Pclass, Age, SibSp, Parch, Fare = 3, 45, 1, 2, 20
Sex_female, Sex_male = 0, 1

# Single-row 2-D array in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
feature_row = [Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male]
new_person2 = np.array([feature_row])
new_person2

In [None]:
# Predict whether or not the new people survive
new_people = [new_person, new_person2]


def _outcome(prediction):
    """Translate a single-sample class prediction (0 or 1) into 'perish' or 'survive'."""
    return "perish" if prediction[0] == 0 else "survive"


for person in new_people:
    print("--------------------------------------------")
    print(f"Here are the predictions that our classifiers made for {person}")

    svm_predictions = svm_model.predict(person)
    print(f"Support vector machine: {_outcome(svm_predictions)}")

    rf_predictions_1 = random_forest.predict(person)
    print(f"Random forest: {_outcome(rf_predictions_1)}")

    lg_predictions = logreg_classifier.predict(person)
    print(f"Logistic regression: {_outcome(lg_predictions)}")

    # The KNN model was fitted on scaled predictors, so the new sample has to
    # go through the same scaler before prediction.
    knn_predictions = knn_classifier.predict(X_scaler.transform(person))
    print(f"K-nearest neighbors: {_outcome(knn_predictions)}")

    dt_predictions = decision_tree_classifier.predict(person)
    print(f"Decision tree: {_outcome(dt_predictions)}")

In [None]:
# Nicole's prediction
Pclass, Age, SibSp, Parch, Fare = 2, 30, 1, 1, 100
Sex_female, Sex_male = 1, 0

# Feature order: Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
nicole = dict(
    name='Nicole',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
nicole['name']

In [None]:
# Anne's prediction
Pclass, Age, SibSp, Parch, Fare = 2, 48, 0, 2, 100
Sex_female, Sex_male = 1, 0

# Feature order: Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
anne = dict(
    name='Anne',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Mariano's prediction
Pclass, Age, SibSp, Parch, Fare = 2, 28, 1, 2, 200
Sex_female, Sex_male = 0, 1

# Feature order: Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
mariano = dict(
    name='Mariano',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Kendall's prediction

In [None]:
# Matt's prediction
Pclass, Age, SibSp, Parch, Fare = 2, 28, 2, 2, 150
Sex_female, Sex_male = 0, 1

# Feature order: Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
matt = dict(
    name='Matt',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Rose
Pclass, Age, SibSp, Parch, Fare = 1, 17, 1, 1, 2250
Sex_female, Sex_male = 1, 0

# Feature order: Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
rose = dict(
    name='Rose from The Titanic',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Jack
Pclass, Age, SibSp, Parch, Fare = 3, 20, 0, 0, 27.5
Sex_female, Sex_male = 0, 1

# Feature order: Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male
jack = dict(
    name='Jack from The Titanic',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Make predictions with the logistic regression model
people_to_predict = [nicole, anne, mariano, matt, rose, jack]

for person in people_to_predict:
    data = person['data']
    name = person['name']
    # predict() expects a 2-D array, so wrap the single feature row in a list.
    prediction = logreg_classifier.predict(np.array([data]))
    survived = prediction[0] == 1

    print('--------------------------------------------------')
    # The profile is the same for both outcomes; only the last line differs.
    print(f'Our model predicts {name}')
    print(f'-- Passenger Class: {data[0]}')
    print(f'-- Age: {data[1]}')
    print(f'-- Number of siblings or spouses: {data[2]}')
    print(f'-- Number of parents or children: {data[3]}')
    print(f'-- Fare paid: {data[4]}')
    # data[5] is the Sex_female flag: 0 means male, anything else female.
    print('-- Sex: Male' if data[5] == 0 else '-- Sex: Female')

    verdict = 'WOULD' if survived else 'WOULD NOT'
    print(f'{verdict} have survived the sinking of the titanic.')