<details><summary><b>LICENSE</b></summary>

Copyright 2015 Donne Martin

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

</details>

# Matplotlib applied

* Applying Matplotlib Visualizations to Kaggle: Titanic
* Bar Plots, Histograms, subplot2grid
* Normalized Plots
* Scatter Plots, subplots
* Kernel Density Estimation Plots

# Challenge
* This assignment is modified from https://github.com/donnemartin/data-science-ipython-notebooks/blob/master/matplotlib/matplotlib-applied.ipynb
* This is an assignment for learning about data cleaning, visualization, and plotting


## Applying Matplotlib Visualizations to Kaggle: Titanic
Prepare the Titanic data for plotting:

In [35]:
%matplotlib inline
import pandas as pd
import numpy as np
import pylab as plt
import seaborn

# Set the global default size of matplotlib figures (applies to every figure
# that does not pass its own figsize)
plt.rc("figure", figsize=(10, 5))
# Apply seaborn's default aesthetic parameters to all subsequent plots
seaborn.set()

In [None]:
# Load the Titanic training CSV; the path is relative to the working directory
df_train = pd.read_csv("../../assets/data/titanic_train.csv")

In [None]:
def label_encode(df, column_name, encoded_column_name):
    """Add an integer-coded copy of a column to ``df`` (modified in place).

    The distinct values of ``column_name`` are sorted and numbered from 0 in
    that order; each row receives the number of its value.

    Args:
        df (DataFrame): Frame to read from and write the new column into.
        column_name (string): Name of the column to encode.
        encoded_column_name (string): Name of the integer column to create
            (or overwrite) in ``df``.
    """
    # Sorted distinct values define the code assigned to each category
    categories = np.sort(df[column_name].unique())
    code_by_category = {category: code for code, category in enumerate(categories)}

    # Look every row up in the mapping and store the result as integers
    encoded = df[column_name].map(code_by_category)
    df[encoded_column_name] = encoded.astype(int)

In [None]:
def one_hot_encode(df, column_name, encoded_column_name_prefix):
    """Return a new frame: ``df`` plus one dummy column per distinct value.

    Args:
        df (DataFrame): Source frame; it is not modified.
        column_name (string): Column whose values are turned into dummies.
        encoded_column_name_prefix (string): Prefix given to each dummy
            column name.

    Returns:
        DataFrame: ``df`` with the dummy columns appended on the right.
    """
    dummy_columns = pd.get_dummies(
        df[column_name], prefix=encoded_column_name_prefix
    )
    # Column-wise concatenation keeps the original columns first
    return pd.concat([df, dummy_columns], axis=1)


def clean_fare(df, ticket_price):
    """Fill missing values of one column with that column's mean, in place.

    Only the ``ticket_price`` column is touched. Missing values in any other
    column are left alone so that later cleaning steps can handle them.

    Args:
        df (DataFrame): Frame to modify in place.
        ticket_price (string): Name of the fare column to clean.
    """
    fares = df[ticket_price]
    if fares.isnull().any():
        # mean() skips missing entries, so it is the average of known fares
        df[ticket_price] = fares.fillna(fares.mean())


def clean_age(df, column_name, column_value_fill, column_value, class_column="Pclass"):
    """Create a filled copy of a column using per-group medians, in place.

    Rows are grouped by the pair (``column_value``, ``class_column``). Each
    missing entry of ``column_name`` is replaced by the median of its group and
    the result is stored in ``column_value_fill``; ``column_name`` itself is
    left untouched.

    Args:
        df (DataFrame): Frame to modify in place.
        column_name (string): Column containing the values to fill.
        column_value_fill (string): Name of the column that receives the
            filled copy.
        column_value (string): First grouping column.
        class_column (string): Second grouping column. Defaults to "Pclass".
    """
    # transform() returns a result aligned to df's own index; a groupby-apply
    # would prepend the group keys to the index on recent pandas versions and
    # could no longer be assigned back as a column.
    group_medians = df.groupby([column_value, class_column])[column_name].transform(
        "median"
    )
    df[column_value_fill] = df[column_name].fillna(group_medians)

In [None]:
def clean_data(df):
    """Run every cleaning step on the Titanic frame and return the result.

    Steps, in order: fill missing Embarked with "S", label-encode Sex and
    Embarked, one-hot encode Embarked, fill missing Fare, build AgeFill from
    Age, and add FamilySize.

    The Embarked fill and the two label-encoded columns are written into the
    frame that was passed in. The one-hot step returns a new frame, and the
    remaining steps are applied to (and returned with) that new frame.

    Args:
        df (DataFrame): Raw Titanic data.

    Returns:
        DataFrame: The cleaned frame including all derived columns.
    """
    # Missing ports of embarkation are assigned to 'S', the port the
    # vast majority of passengers embarked from.
    df["Embarked"] = df["Embarked"].fillna("S")

    # Numeric codes for the categorical columns
    for source_column, code_column in (
        ("Sex", "Sex_Val"),
        ("Embarked", "Embarked_Val"),
    ):
        label_encode(df, source_column, code_column)

    # Dummy variables for Embarked; from here on df is a new frame
    df = one_hot_encode(df, "Embarked", "Embarked_Val")

    # Missing fares are replaced by the average fare
    clean_fare(df, "Fare")

    # Age stays intact; AgeFill is a copy whose gaps are filled with the
    # median Age of the passenger's (Sex_Val, Pclass) group. The median is
    # used rather than the mean because the Age histogram seems to be
    # right skewed.
    clean_age(df, "Age", "AgeFill", "Sex_Val")

    # FamilySize = Parch (parents or children on board)
    #            + SibSp (siblings or spouses on board)
    parents_children = df["Parch"]
    siblings_spouses = df["SibSp"]
    df["FamilySize"] = parents_children + siblings_spouses

    return df


df_train = clean_data(df_train)

# Bar Plots, Histograms, subplot2grid

In [None]:
# Size of matplotlib figures that contain subplots (reused by later cells)
figsize_with_subplots = (10, 10)

# Create the figure and define the subplot grid shape: 3 rows by 2 columns
fig = plt.figure(figsize=figsize_with_subplots)
fig_dims = (3, 2)

# Select the top-left cell; the death and survival counts are drawn into it
# by the first create_sub_plot_2_grid call below
plt.subplot2grid(fig_dims, (0, 0))


def create_sub_plot_2_grid(
    df, column_name, plot_title, plot_kind, plot_color="b", plot_align="center"
):
    """Plot how often each value of a column occurs.

    No axes object is passed to the plot call, so the target axes is chosen by
    pandas (presumably the one last selected with ``plt.subplot2grid``).

    Args:
        df (DataFrame): Frame holding the data.
        column_name (string): Column whose value counts are plotted.
        plot_title (string): Title of the plot.
        plot_kind (string): Plot kind forwarded to pandas, e.g. "bar".
        plot_color (string): Color forwarded to the plot. Defaults to "b".
        plot_align (string): Alignment forwarded to the plot. Defaults to
            "center".
    """
    value_frequencies = df[column_name].value_counts()
    value_frequencies.plot(
        kind=plot_kind,
        title=plot_title,
        color=plot_color,
        align=plot_align,
    )


# Survived counts as red bars in the cell selected above, (0, 0)
create_sub_plot_2_grid(
    df_train, "Survived", "Death and Survival Counts", "bar", "r", "center"
)

# Plot Pclass counts in cell (0, 1)
plt.subplot2grid(fig_dims, (0, 1))

create_sub_plot_2_grid(df_train, "Pclass", "Passenger Class Counts", "bar")
# Plot Sex counts in cell (1, 0)
plt.subplot2grid(fig_dims, (1, 0))

create_sub_plot_2_grid(df_train, "Sex", "Gender Counts", "bar")
# Keep the tick labels of this subplot horizontal
plt.xticks(rotation=0)

# Plot Embarked counts in cell (1, 1)
plt.subplot2grid(fig_dims, (1, 1))

create_sub_plot_2_grid(df_train, "Embarked", "Ports of Embarkation Counts", "bar")

# Plot the Age histogram in cell (2, 0); cell (2, 1) of the grid stays unused.
# The raw Age column is used here, not the filled AgeFill copy.
plt.subplot2grid(fig_dims, (2, 0))
df_train["Age"].hist()
plt.title("Age Histogram")

In [None]:
# Get the unique values of FamilySize and their maximum
family_sizes = np.sort(df_train["FamilySize"].unique())
family_size_max = max(family_sizes)

# FamilySize of passengers who died (df1) and who survived (df2)
df1 = df_train[df_train["Survived"] == 0]["FamilySize"]
df2 = df_train[df_train["Survived"] == 1]["FamilySize"]
# Stacked histogram with family_size_max + 1 bins spanning 0..family_size_max
plt.hist([df1, df2], bins=family_size_max + 1, range=(0, family_size_max), stacked=True)
# Legend order follows the order of the data list: df1 first, then df2
plt.legend(("Died", "Survived"), loc="best")
plt.title("Survivors by Family Size")

# Normalized Plots

In [None]:
# Count passengers for every (Pclass, Survived) combination:
# rows are passenger classes, columns are Survived values
pclass_xt = pd.crosstab(df_train["Pclass"], df_train["Survived"])

# Normalize the cross tab so that each row (passenger class) sums to 1:
pclass_xt_pct = pclass_xt.div(pclass_xt.sum(1).astype(float), axis=0)

# One stacked bar per passenger class, split by Survived value
pclass_xt_pct.plot(kind="bar", stacked=True, title="Survival Rate by Passenger Classes")
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")

In [None]:
def get_survival_rate_by_gender(gender, df=None):
    """Return per-class survival fractions for passengers of one gender.

    Args:
        gender (string): Value of the "Sex" column to select, e.g. "female".
        df (DataFrame): Frame with "Sex", "Pclass" and "Survived" columns.
            When omitted, the module-level ``df_train`` is used.

    Returns:
        DataFrame: Indexed by "Pclass", one column per "Survived" value; each
        row is normalized so that it sums to 1.
    """
    if df is None:
        df = df_train

    gender_df = df[df["Sex"] == gender]
    # Both crosstab inputs come from the filtered frame, so the result does
    # not depend on index alignment with the unfiltered data.
    gender_xt = pd.crosstab(gender_df["Pclass"], gender_df["Survived"])
    gender_xt_pct = gender_xt.div(gender_xt.sum(axis=1).astype(float), axis=0)
    return gender_xt_pct


# Plot survival rate by passenger class for female passengers
gender_xt_pct = get_survival_rate_by_gender("female")


# One stacked bar per passenger class, split by Survived value
gender_xt_pct.plot(
    kind="bar", stacked=True, title="Female Survival Rate by Passenger Class"
)
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")

In [None]:
# Plot survival rate by passenger class for male passengers
# (gender_xt_pct is rebound here, replacing the previous cell's table)
gender_xt_pct = get_survival_rate_by_gender("male")
gender_xt_pct.plot(
    kind="bar", stacked=True, title="Male Survival Rate by Passenger Class"
)
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")

# Scatter Plots, subplots

In [None]:
# Set up a grid of plots: 2 rows, 1 column.
# axes[0] is used for the scatter plot and axes[1] for the histogram below.
fig, axes = plt.subplots(2, 1, figsize=figsize_with_subplots)


def get_Histogram_of_AgeFill_segmented_by_Survived(df, column_number, column_name="Age"):
    """Return one column for the passengers with a given "Survived" value.

    Despite the function name, the column returned by default is the raw "Age"
    column; pass ``column_name="AgeFill"`` to get the filled ages instead.

    Args:
        df (DataFrame): Frame with a "Survived" column and ``column_name``.
        column_number (int): Value of "Survived" to select (the notebook
            passes 0 and 1).
        column_name (string): Column to return. Defaults to "Age".

    Returns:
        Series: Values of ``column_name`` for the matching rows, keeping the
        original index labels.
    """
    return df.loc[df["Survived"] == column_number, column_name]


# Ages of passengers who died (df1) and who survived (df2).
# NOTE(review): the helper returns the raw "Age" column, while max_age and the
# scatter plot below use "AgeFill" -- confirm which column is intended here.
df1 = get_Histogram_of_AgeFill_segmented_by_Survived(df_train, 0)
df2 = get_Histogram_of_AgeFill_segmented_by_Survived(df_train, 1)
max_age = int(max(df_train["AgeFill"]))

# Stacked histogram on the lower axes: max_age / 10 bins (truncated to an int)
# spanning 1..max_age
axes[1].hist([df1, df2], bins=int(max_age / 10), range=(1, max_age), stacked=True)
# Legend order follows the order of the data list: df1 first, then df2
axes[1].legend(("Died", "Survived"), loc="best")
axes[1].set_title("Survivors by Age Groups Histogram")
axes[1].set_xlabel("Age")
axes[1].set_ylabel("Count")

# Scatter plot of Survived (x) against AgeFill (y) on the upper axes
axes[0].scatter(df_train["Survived"], df_train["AgeFill"])
axes[0].set_title("Survivors by Age Plot")
axes[0].set_xlabel("Survived")
axes[0].set_ylabel("Age")

# Kernel Density Estimation Plots

In [None]:
# Draw one age density curve per passenger class:
def get_the_unique_values_of_Pclass(df):
    """Plot a kernel density estimate of AgeFill for every Pclass value.

    The distinct Pclass values are visited in sorted order and one curve is
    drawn per value. Nothing is returned.

    Args:
        df (DataFrame): Frame with "Pclass" and "AgeFill" columns.
    """
    for class_value in np.sort(df["Pclass"].unique()):
        ages_in_class = df.loc[df["Pclass"] == class_value, "AgeFill"]
        ages_in_class.plot(kind="kde")


# Draw the density curves, one per passenger class in sorted Pclass order
get_the_unique_values_of_Pclass(df_train)
plt.title("Age Density Plot by Passenger Class")
plt.xlabel("Age")
# Labels are given in the order the curves were drawn
# (presumably Pclass takes exactly the values 1, 2, 3 -- verify against the data)
plt.legend(("1st Class", "2nd Class", "3rd Class"), loc="best")

## Acknowledgments


Thanks to Donne Martin for creating the open-source project <a href="https://github.com/donnemartin/data-science-ipython-notebooks">data-science-ipython-notebooks</a>, which inspired the majority of the content in this chapter.