# Introduction to Data Science

We'll use the Titanic passenger data from [Vanderbilt University's Department of Biostatistics](https://hbiostat.org/data/repo/titanic3.csv) to build a model that predicts whether a given passenger would have survived that fateful night of April 14–15, 1912.





<!-- Using this data
provides information about the survival of passengers on the Titanic and characteristics about the passengers such as age and ticket class. , the tutorial will -->

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw passenger table from the local data directory and preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer="./data/titanic3.csv")
data.head(n=5)

In [None]:
# Column dtypes, non-null counts and memory footprint. "deep" asks pandas to
# inspect object columns so their contents are counted in the reported size.
data.info(memory_usage="deep")

In [None]:
# Summarise every column, not just the numeric ones. By default describe()
# skips non-numeric columns, and at this point in the notebook columns such as
# "age" and "fare" are presumably still strings (they are only cast to float
# in a later cell), so they would otherwise be missing from the summary.
data.describe(include="all")

In [None]:
# The file marks missing entries with a literal "?". Turn those into NaN and
# then store age and fare as floats so they can be used numerically.
data = (
    data
    .replace(to_replace="?", value=np.nan)
    .astype({"age": "float64", "fare": "float64"})
)

In [None]:
# One row of five panels, each split by sex:
#   - distribution of age and of fare for survivors vs. non-survivors (violins)
#   - survival rate against sibsp, parch and pclass (point plots)
fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(30, 5))
age_ax, sibsp_ax, parch_ax, pclass_ax, fare_ax = axs

sns.violinplot(data=data, x="survived", y="age", hue="sex", ax=age_ax)
for column, panel in (("sibsp", sibsp_ax), ("parch", parch_ax), ("pclass", pclass_ax)):
    sns.pointplot(data=data, x=column, y="survived", hue="sex", ax=panel)
sns.violinplot(data=data, x="survived", y="fare", hue="sex", ax=fare_ax)

In [None]:
# Encode the passenger's sex as a number (male -> 1, female -> 0) so it can
# take part in the correlation table and be fed to the model.
# Only the "sex" column is touched: a frame-wide replace would also rewrite any
# other cell that happens to equal "male"/"female", and relying on replace() to
# turn a string column into an integer one is deprecated in pandas 2.2+.
# Values outside the mapping (including the numeric codes left by a previous
# run of this cell) pass through unchanged, so re-running the cell is harmless.
sex_codes = {"male": 1, "female": 0}
data["sex"] = data["sex"].map(lambda value: sex_codes.get(value, value))

In [None]:
# Strength (absolute value) of the correlation between each numeric column
# and survival; the sign is dropped because only the magnitude matters here.
survival_corr = data.corr(numeric_only=True)[["survived"]]
survival_corr.abs()

In [None]:
# Collapse sibsp (siblings/spouses aboard) and parch (parents/children aboard)
# into a single 0/1 flag: 1 if the passenger had any relative aboard, else 0.
# Computed column-wise rather than with a per-row Python lambda; rows where
# either count is missing compare as False and therefore get 0, exactly as the
# row-wise version did.
data["relatives"] = ((data["sibsp"] + data["parch"]) > 0).astype("int64")

# Re-check the correlations with survival now that the new flag is present.
data.corr(numeric_only=True).abs()[["survived"]]

In [None]:
# Keep only the columns used for modelling (five features plus the target)
# and discard every row that has a missing value in any of them.
model_columns = ["sex", "pclass", "age", "relatives", "fare", "survived"]
data = data.loc[:, model_columns].dropna(axis="index", how="any")

# Train & Evaluate

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible from run to run.
feature_columns = ["sex", "pclass", "age", "relatives", "fare"]
x_train, x_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data["survived"],
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only, then apply that
# same scaling to both the training and the held-out rows.
sc = StandardScaler()
sc.fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian naive Bayes classifier on the scaled training features;
# fit() returns the estimator itself, which is then displayed.
model = GaussianNB().fit(X_train, y_train)
model

In [None]:
from sklearn import metrics

# Predict survival for the held-out rows and report the fraction predicted
# correctly.
predict_test = model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predict_test)
print(test_accuracy)