## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Data

In [None]:
# Read the passenger table and use PassengerId as the row index.
titanic_df = pd.read_csv("data/titanic.csv", index_col="PassengerId")

# Preview the first rows, followed by the (rows, columns) shape.
display(titanic_df.head(), titanic_df.shape)

[Data Dictionary](https://www.kaggle.com/competitions/titanic/data)

### Converting object values to numeric

In [None]:
# Distinct values found in the Embarked column (before encoding).
titanic_df["Embarked"].unique()

In [None]:
# Numeric code assigned to each Embarked letter.
embarked = {'S': 1,
            'C': 2,
            'Q': 3}

# Encode only while the column still holds the raw letters. Mapping a column
# that is already numeric would turn every value into NaN, so this guard keeps
# the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(titanic_df["Embarked"]):
    titanic_df["Embarked"] = titanic_df["Embarked"].map(embarked)

In [None]:
# Numeric code assigned to each Sex label.
sex = {"male": 0, "female": 1}

# Encode only while the column still holds the raw labels. Re-running an
# unguarded map on the already-encoded column would replace every value with
# NaN, and the later dropna() would then remove every row.
if not pd.api.types.is_numeric_dtype(titanic_df["Sex"]):
    titanic_df["Sex"] = titanic_df["Sex"].map(sex)

### Correlation

In [None]:
# Pairwise correlation of the numeric columns. At this point the frame still
# contains text columns (Name, Ticket, Cabin); numeric_only=True skips them
# explicitly instead of letting pandas >= 2.0 raise on the strings.
titanic_df.corr(numeric_only=True)

### Missing values

In [None]:
# Number of missing entries in each column.
titanic_df.isnull().sum()

### Dropping missing values

In [None]:
# Remove the columns that are not used in the rest of the analysis, then drop
# every row that still has a missing value.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps the cell safe to re-run: a second run no longer raises KeyError for
# columns that are already gone.
unused_columns = ["Cabin", "Ticket", "Embarked", "SibSp", "Parch", "Name"]
titanic_df = titanic_df.drop(columns=unused_columns, errors="ignore").dropna()

display(titanic_df.head())
display(titanic_df.shape)

### Reordering remaining columns

In [None]:
# Desired column layout: the features first, the Survived target last.
order = ["Pclass", "Sex", "Age", "Fare", "Survived"]
titanic_df = titanic_df.loc[:, order]

# Show a single row to confirm the new column order.
titanic_df.head(1)

### Some statistics

In [None]:
# Summary statistics of the ticket fare, rounded to three decimals.
titanic_df["Fare"].describe().round(decimals=3)

### Exploring Sex column

In [None]:
# How it is encoded: show the label -> code dict that was used to map the Sex column.
sex


#### Interesting questions:

- How many women and men were on the Titanic? ✅
- How many women and men died or survived in the event? ✅
- Is it true that sex is related, to some degree, to survival? ✅
- What was their average age in those events? ✅
- Which Pclass did they most frequently belong to?
- How much was their fare? Is it related to survival?


### - How many women and men were on the Titanic?

In [None]:
# Passenger count per encoded Sex value, as a {code: count} dict.
sex_value_counts = titanic_df["Sex"].value_counts()
values = dict(sex_value_counts)

In [None]:
# Total number of passengers counted across all Sex values.
sum(values.values())

In [None]:
def make_autopct(values):
    """Build a label formatter suitable for the ``autopct`` argument of ``plt.pie``.

    Parameters
    ----------
    values : dict
        Mapping whose values are the absolute counts behind the pie wedges.

    Returns
    -------
    callable
        A function that takes a wedge percentage and returns a string showing
        the percentage with two decimals followed by the absolute count in
        parentheses.
    """
    def format_wedge(pct):
        # The total is read from `values` at call time, so the formatter
        # reflects the mapping's contents at the moment the label is built.
        grand_total = sum(values.values())
        # Convert the percentage back into the (rounded) absolute count.
        count = int(round(grand_total * pct / 100.0))
        return f'{pct:.2f}%  ({count:d})'

    return format_wedge

In [None]:
# Passenger count per encoded Sex value; value_counts() orders by frequency.
sex_counts = titanic_df["Sex"].value_counts()

# Derive each wedge's label from the code it actually represents, instead of
# relying on the order of the `sex` dict happening to match the frequency
# order returned by value_counts().
code_to_label = {code: label for label, code in sex.items()}
wedge_labels = [code_to_label[code] for code in sex_counts.index]

# Colours are tied to the label for the same reason.
label_colors = {"male": "powderblue", "female": "plum"}

plt.pie(
    x=sex_counts,
    labels=wedge_labels,
    autopct=make_autopct(dict(sex_counts)),
    # Keep the first wedge in place and pull the remaining one(s) out slightly.
    explode=[0.05 if position else 0 for position in range(len(sex_counts))],
    startangle=90,
    colors=[label_colors[label] for label in wedge_labels]
);

text = {'fontsize': 11,
        'verticalalignment': 'top',
        'horizontalalignment': 'left'}

# Annotate the chart with the overall number of passengers shown.
plt.text(
    x=0.8,
    y=1.5,
    s=f"Total = {sex_counts.sum()}",
    fontdict=text
);

### - How many women and men died/survived in the event?

In [None]:
# Recall the label -> code mapping used for the Sex column before reading the counts below.
sex

In [None]:
# Number of passengers for every (Sex, Survived) combination.
titanic_df.value_counts(subset=["Sex", "Survived"])

In [None]:
# Same counts as a dict keyed by (Sex, Survived) tuples for easy lookup.
sex_survival_counts = titanic_df.value_counts(subset=["Sex", "Survived"])
gender_survivals = dict(sex_survival_counts)
gender_survivals

In [None]:
# Bar labels, in the same order as the Sex codes (0 = male, 1 = female).
genres = ['Male', 'Female']

# Keys of `gender_survivals` are (Sex, Survived) tuples. A combination that
# never occurs is absent from the dict, so fall back to 0 instead of None;
# None cannot be drawn by plt.barh or used as the `left` offset of a stacked bar.
survivals = [gender_survivals.get((0, 1), 0), gender_survivals.get((1, 1), 0)]
not_survivals = [gender_survivals.get((0, 0), 0), gender_survivals.get((1, 0), 0)]

In [None]:
# Figure defaults; note that rcParams changes also apply to every later figure.
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True

# Stacked horizontal bars: survivors first, non-survivors stacked to their right.
plot1 = plt.barh(genres, survivals, color='limegreen')
plot2 = plt.barh(genres, not_survivals,  left=survivals, color='red')

# Label the figure so it can be read on its own.
plt.xlabel("Number of passengers")
plt.title("Survivors and non-survivors by sex")
plt.legend([plot1, plot2], ['Survivors', 'Non-survivors']);

### Is it true that sex is related, to some degree, to survival?

In [None]:
# Pearson correlation between the encoded Sex column and Survived.
titanic_df.loc[:, ['Sex', 'Survived']].corr(method='pearson')

Looking at the previous chart together with the correlation matrix, sex and survival appear to be related, although not very strongly.

TODO: run a p-value (statistical significance) test for the relationship between sex and survival.

### What was their average age in those events?

###### Note: Age is fractional if less than 1. If the age is estimated, it is in the form xx.5

In [None]:
# Summary statistics of passenger age.
titanic_df.Age.describe()

#### Creating age groups

In [None]:
def classify_group_age(age):
    """Return the age-group number (1-7) that ``age`` falls into.

    Groups are defined by inclusive upper bounds:

    ====== ===============
    group  age range
    ====== ===============
    1      age <= 1
    2      1 < age <= 11
    3      11 < age <= 15
    4      15 < age <= 19
    5      19 < age <= 30
    6      30 < age <= 50
    7      everything else
    ====== ===============

    Group 7 is the fall-through case, so a value for which no comparison
    holds (for example NaN) also ends up there.
    """
    upper_bounds = (1, 11, 15, 19, 30, 50)
    # Bounds are ascending, so the first one that is not exceeded
    # identifies the group.
    for group, bound in enumerate(upper_bounds, start=1):
        if age <= bound:
            return group
    return len(upper_bounds) + 1


In [None]:
# Label every passenger with the number of the age group they belong to.
titanic_df['Age Group'] = titanic_df['Age'].map(classify_group_age)

In [None]:
# Passenger counts keyed by (Survived, Age Group).
age_by_outcome = titanic_df[['Age Group', 'Survived']].groupby('Survived')
age_survivals = dict(age_by_outcome.value_counts())
age_survivals

In [None]:
# Age-group numbers that actually occur in the data, in ascending order.
age_group = sorted(titanic_df['Age Group'].unique().tolist())
age_group

In [None]:
# Preview the first five rows, now including the 'Age Group' column.
titanic_df.head(n=5)

In [None]:
# Number of deaths (Survived == 0) per age group, one entry per element of
# `age_group` and in the same order.
# Reindexing on `age_group` with fill_value=0 keeps the list aligned even when
# an age group has no deaths; a plain groupby count would drop that group and
# shift every later value by one position.
c = (
    titanic_df.loc[titanic_df['Survived'] == 0, 'Age Group']
    .value_counts()
    .reindex(age_group, fill_value=0)
    .tolist()
)
c

In [None]:
# Number of survivors (Survived == 1) per age group, one entry per element of
# `age_group` and in the same order.
# Reindexing on `age_group` with fill_value=0 keeps the list aligned even when
# an age group has no survivors; a plain groupby count would drop that group
# and shift every later value by one position.
d = (
    titanic_df.loc[titanic_df['Survived'] == 1, 'Age Group']
    .value_counts()
    .reindex(age_group, fill_value=0)
    .tolist()
)
d

In [None]:
# Column name -> values, one entry per age group (survivors from `d`, deaths from `c`).
survivals_group_dict = dict(
    zip(
        ('Age Group', 'Total Survivals', 'Total Death'),
        (age_group, d, c),
    )
)

In [None]:
# Inspect the column-wise dict before it is turned into a DataFrame.
survivals_group_dict

In [None]:
# Turn the column-wise dict into a table; the name is kept, so from here on
# `survivals_group_dict` refers to a DataFrame.
survivals_group_dict = pd.DataFrame(data=survivals_group_dict)

In [None]:
# Show the per-age-group table of survivors and deaths.
survivals_group_dict

In [None]:
# Two bars per age group, each offset by half a bar width from the group's tick.
bar_width = 0.4
group_positions = np.array(age_group)

plt.bar(
    group_positions + bar_width / 2,
    survivals_group_dict['Total Death'],
    width=bar_width,
    label='Death',
    color='grey',
);
plt.bar(
    group_positions - bar_width / 2,
    survivals_group_dict['Total Survivals'],
    width=bar_width,
    label='Survived',
    color='green',
);

# One tick per age group, labelled with the group number.
plt.xticks(ticks=survivals_group_dict['Age Group'], labels=age_group)
plt.xlabel("Age Groups")
plt.ylabel("Total")
plt.title("Number of deaths and survivals in each age group")
plt.legend();

In [None]:
# Look at the raw Fare column.
titanic_df.Fare

In [None]:
# Inspect the current state of the working frame.
titanic_df

In [None]:
# jointplot is a two-variable plot. Passing only the Fare Series leaves it to
# plot the values against the index (PassengerId), which says nothing about
# fares. Plot Fare against Age explicitly instead.
sns.jointplot(data=titanic_df, x="Age", y="Fare");

In [None]:
# Distribution (histogram) of ticket fares.
sns.displot(titanic_df["Fare"]);

In [None]:
# `sns.relpl` does not exist and raises AttributeError; the relational plot
# function is `sns.relplot`. Show Fare against Age and colour the points by
# Survived, to look at whether fare is related to survival.
sns.relplot(data=titanic_df, x="Age", y="Fare", hue="Survived");