# Learning Objectives
0. Utilize the *Titanic Dataset*.
0. Learn all of the methods in `pandas` for dataframe manipulation.
0. Create visualizations for the *Titanic Dataset*.

# Initialization

In [None]:
# import all important modules.
import numpy
import pandas

# plots
import matplotlib.pyplot as plot
import seaborn

In [None]:
# load the titanic csv into a dataframe object.
dataframe = pandas.read_csv(filepath_or_buffer='./data/titanic.csv')

In [None]:
# create mini-helper function.
# it will quickly peek at a dataframe.
def peek(dataframe = None):
	"""Print the first five rows and the shape of a dataframe.

	Parameters
	----------
	dataframe : pandas.DataFrame, optional
		the frame to inspect. when omitted, the notebook-level variable
		named `dataframe` is looked up at call time.

	Raises
	------
	KeyError
		if no argument is given and no global `dataframe` exists yet.
	"""
	if dataframe is None:
		# resolve the default lazily: a `dataframe = dataframe` default is
		# bound once, when the `def` runs, and would keep pointing at the
		# old object after the loading cell is re-run.
		dataframe = globals()['dataframe']

	print("DATAFRAME HEAD:")
	display(dataframe.head(5))
	print('\n---\n')
	print("DATAFRAME SHAPE:")
	display(dataframe.shape)

In [None]:
# take a first look at the freshly loaded dataframe.
peek(dataframe)

---

## Titanic Dataset Description
According to the dataset's documentation:

```md
VARIABLE DESCRIPTIONS:
survival        Survival
                (0 = No; 1 = Yes)
pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
name            Name
sex             Sex
age             Age
sibsp           Number of Siblings/Spouses Aboard
parch           Number of Parents/Children Aboard
ticket          Ticket Number
fare            Passenger Fare
cabin           Cabin
embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)
```

---

## Titanic Dataset Cleanup

In [None]:
# add a numeric "Gender" column derived from "Sex":
# female becomes zero, male becomes one.
gender_map = dict(female=0, male=1)

# translate every "Sex" label through the mapping and store it as integers.
dataframe['Gender'] = (
	dataframe['Sex']
	.map(gender_map)
	.astype(int)
)

In [None]:
# peek again.
# the frame should now carry one extra column.
peek(dataframe)

---

## Titanic Dataset Exploration

***What is the passenger age distribution?***

In [None]:
# draw the age histogram and keep the returned axes so it can be labelled.
age_axes = dataframe['Age'].hist(bins=16)

# name the plot and both axes so the figure can be read on its own.
age_axes.set(
	title='Passenger age distribution',
	xlabel='Age',
	ylabel='Number of passengers',
)
plot.show()

***How many of the age values are missing?***  
*Hint: missing age values are `null`.*

In [None]:
# flag every missing age, then add up the flags.
dataframe['Age'].isnull().sum()

***Where is the mean/median age? Are there any outliers?***  
*Hint: create a box-and-whisker plot.*

In [None]:
# the box itself only marks the median; `showmeans` adds a marker for the
# mean so that both values asked for in the question are visible.
dataframe['Age'].plot.box(showmeans=True)

***List all valid ages.***

In [None]:
# drop the missing ages and hand back what is left as a numpy array.
dataframe['Age'].dropna().to_numpy()

***How many unique values does the "Embarked" column have?***

In [None]:
# summarise the "Embarked" column with pandas' `describe`.
dataframe['Embarked'].describe()

In [None]:
# count the distinct values in the "Embarked" column.
# note: `nunique` leaves out missing values unless `dropna=False` is passed.
dataframe['Embarked'].nunique()

***How large is each "Embarked" category?***  
*Hint: use a bar-chart.*

In [None]:
# tally how many rows fall into each "Embarked" category.
# note: `value_counts` leaves out missing values by default.
dataframe['Embarked'].value_counts()

In [None]:
# draw the "Embarked" category counts as vertical bars.
dataframe['Embarked'].value_counts().plot.bar()

In [None]:
# the `barh` accessor draws the same counts as horizontal bars.
dataframe['Embarked'].value_counts().plot.barh()

***Use seaborn to plot the previous problem.***

In [None]:
# let seaborn count the rows per "Embarked" category and draw the bars.
seaborn.countplot(data=dataframe, x="Embarked")

***Analyze the Sex Distribution.***

In [None]:
# number of passengers per sex, as a bar chart.
dataframe['Sex'].value_counts().plot.bar()

In [None]:
# the same counts, shown as slices of a pie.
dataframe['Sex'].value_counts().plot.pie()

### Continue Exploring Below...
...but only consider passengers that definitely embarked at Gate "C"!

In [None]:
# keep only the rows whose "Embarked" value is 'C'.
# the rest of this section works on this subset.
embarked_at_c = dataframe.loc[dataframe['Embarked'] == 'C']

In [None]:
# inspect the head and shape of the filtered dataframe.
peek(dataframe=embarked_at_c)

***What is the passenger age distribution from Gate "C"?***  
*Hint: create a histogram.*

In [None]:
# activity:
# histogram of the ages of the passengers whose "Embarked" value is 'C'.
embarked_at_c['Age'].hist(bins=16)

***What does the estimated age density (KDE) look like?***

In [None]:
# kernel density estimate of the ages in the 'C' subset.
embarked_at_c['Age'].plot.kde()

### Plot how many of the passengers were children, youth, middle-aged and old, based on their Sex, for those who 'Embarked' in section 'C'

In [None]:
# keep the passengers whose "Embarked" value is 'C' and split them by "Sex".
gender_group = dataframe.loc[dataframe['Embarked'] == 'C'].groupby('Sex')

# test output here: each item is a (sex, ages) pair.
for sex_and_ages in gender_group['Age']:
	print(sex_and_ages)

In [None]:
# 16-bin histogram of "Age" for each "Sex" group; alpha=0.5 makes the bars half transparent.
gender_group['Age'].hist(bins=16, alpha=0.5)

In [None]:
# the same histogram through the generic `plot` call, this time with a legend.
gender_group['Age'].plot(bins=16, kind='hist', legend=True, alpha=0.5)

In [None]:
# within each "Sex" group, count how often every age value occurs.
gender_group['Age'].value_counts()


In [None]:
# # import the pandas library
# import pandas as pd
# import numpy as numpy

# ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
#          'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
#          'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
#          'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
#          'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
# dataframe = pd.DataFrame(ipl_data)

# grouped = dataframe.groupby('Year')
# dataframe.groupby('Year')['Points'].agg(numpy.mean)

# https://www.tutorialspoint.com/python_pandas/python_pandas_groupby.htm

### What is the average Age for female and male (based on sex) for those who have 'Embarked' on section 'C'?

In [None]:
# average age per sex among the passengers that embarked at 'C'.
# the aggregation is named by string: recent pandas versions emit a
# FutureWarning when a numpy callable such as `numpy.mean` is passed here.
gender_group['Age'].agg('mean')

### Another way we can do the above task

In [None]:
# the same per-sex average, obtained by applying a function to each group's ages.
gender_group['Age'].apply(lambda ages: numpy.mean(ages))

### Which Age is the oldest for female and male (based on sex) for those who have 'Embarked' on section 'C'?

In [None]:
# oldest age per sex among the passengers that embarked at 'C'.
# the aggregation is named by string: recent pandas versions emit a
# FutureWarning when a numpy callable such as `numpy.max` is passed here.
gender_group['Age'].agg('max')

### Plot the Fare paid against the passengers' Age

In [None]:
# scatter "Fare" against "Age"; `fit_reg=False` leaves out the regression line.
seaborn.regplot(
	data=dataframe,
	x="Age",
	y="Fare",
	fit_reg=False,
)

In [None]:
# the same scatter drawn with pandas' own plotting interface.
dataframe.plot(kind='scatter', x="Age", y="Fare")

### Plot the percentage that survived for each Sex, split by Passenger Class

In [None]:
# one bar per sex and passenger class, with "Survived" on the y axis.
seaborn.barplot(
	data=dataframe,
	x="Sex",
	y="Survived",
	hue="Pclass",
)

### Plot how many male or female were in different Passenger classes

In [None]:
# row counts per sex, with one coloured bar per passenger class.
seaborn.countplot(data=dataframe, x="Sex", hue="Pclass")

In [None]:
# row counts per sex, with one coloured bar per "Survived" value.
seaborn.countplot(data=dataframe, x="Sex", hue="Survived")

In [None]:
# cross-tabulate sex against survival and serialise the table as json.
pandas.crosstab(
	index=dataframe['Sex'],
	columns=dataframe['Survived'],
).to_json()

### Verify the values obtained for the percentage

In [None]:
# survival counts for the women travelling in first class.
(
	dataframe
	.loc[(dataframe['Sex'] == 'female') & (dataframe['Pclass'] == 1), 'Survived']
	.value_counts()
)

In [None]:
# share of first-class women who survived, computed from the data itself
# rather than from counts retyped out of the previous output.
first_class_women = dataframe[(dataframe['Sex'] == 'female') & (dataframe['Pclass'] == 1)]

# "Survived" is coded 0 = no, 1 = yes, so its mean is the surviving fraction.
first_class_women['Survived'].mean()

In [None]:
# the same survival counts as a plain dict keyed by the "Survived" value.
dict(
	dataframe
	.loc[(dataframe['Sex'] == 'female') & (dataframe['Pclass'] == 1), 'Survived']
	.value_counts()
)

### Stack plot of count based on Sex for different Passenger Class

In [None]:
# count the passenger classes within each sex, pivot the classes into
# columns, and stack them into one bar per sex.
(
	dataframe
	.groupby(['Sex'])['Pclass']
	.value_counts()
	.unstack()
	.plot.bar(stacked=True)
)

### Stack plot of count based on Sex and Survival for different Passenger Class

In [None]:
# same idea, but with one stacked bar per (sex, survived) combination.
(
	dataframe
	.groupby(['Sex', 'Survived'])['Pclass']
	.value_counts()
	.unstack()
	.plot.bar(stacked=True)
)

### Sometimes it is hard to read values from a plot. What is the number of females and males in each Passenger Class?

In [None]:
# alternative spelling with groupby:
# dataframe.groupby(['Sex'])['Pclass'].value_counts().unstack()
# crosstab yields the same sex-by-class table of counts.
pandas.crosstab(index=dataframe['Sex'], columns=dataframe['Pclass'])

In [None]:
# table of counts: one row per sex, one column per "Survived" value.
pandas.crosstab(index=dataframe['Sex'], columns=dataframe['Survived'])

In [None]:
# table of counts: one row per sex, one column per "Embarked" value.
pandas.crosstab(index=dataframe['Sex'], columns=dataframe['Embarked'])

### How can the above crosstab be expressed as percentages and presented graphically?

In [None]:
# normalise the sex-by-port counts within each row (`normalize='index'`)
# and draw the resulting shares as an annotated heatmap.
seaborn.heatmap(
	pandas.crosstab(
		index=dataframe['Sex'],
		columns=dataframe['Embarked'],
		normalize='index',
	),
	cmap="YlGnBu",
	annot=True,
)

## Question:

What percent of passengers embarked at C?

In [None]:
# Answer:

# count the passengers per port once and reuse the result below.
embarked_counts = dataframe['Embarked'].value_counts()
print(dict(embarked_counts))

# number of passengers that embarked at 'C'.
dict(embarked_counts)['C']

In [None]:
# total number of passengers with a known port: the sum of all category counts.
dataframe['Embarked'].value_counts().sum()

In [None]:
# share of 'C' among all passengers with a known port.
dataframe['Embarked'].value_counts()['C'] / dataframe['Embarked'].value_counts().sum()

#### OR

In [None]:
# rows that embarked at 'C' divided by rows with a non-missing "Embarked" value.
len(dataframe.loc[dataframe['Embarked'] == 'C']) / len(dataframe['Embarked'].dropna())

What percent of female passengers embarked at C?

In [None]:
# sex-by-port table of counts, used to read off the numbers for this question.
pandas.crosstab(index=dataframe['Sex'], columns=dataframe['Embarked'])

In [None]:
# number of female passengers that embarked at 'C'.
len(dataframe.loc[(dataframe['Sex'] == 'female') & (dataframe['Embarked'] == 'C')])

In [None]:
# number of female passengers overall.
len(dataframe.loc[dataframe['Sex'] == 'female'])

In [None]:
# manual check with hard-coded counts.
# NOTE(review): 73 and 314 look like the outputs of the two cells above — confirm.
73/ 314

In [None]:
# females that embarked at 'C' divided by all females.
(
	len(dataframe.loc[(dataframe['Sex'] == 'female') & (dataframe['Embarked'] == 'C')])
	/ len(dataframe.loc[dataframe['Sex'] == 'female'])
)

This question is different from above:
What percent of passengers embarked at C were female?