# Lesson 11: Making Sense of Data

We generally use different types of charts to visualize quantitative (numerical) data and qualitative (ordinal or nominal) data.

For quantitative data, we most often use histograms, box plots, and scatter plots.

We can use the [`seaborn` plotting library](https://seaborn.pydata.org/index.html) to create these plots in Python. We will use a dataset containing information about passengers aboard the Titanic.

In [None]:
import pandas as pd
import numpy as np

%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# List the names of the example datasets that seaborn can load by name.
print(sns.get_dataset_names())

In [None]:
# Load seaborn's 'titanic' example dataset and echo the resulting frame.
titanic = sns.load_dataset(name='titanic')
titanic

In [None]:
# Summarize the frame: column names, non-null counts, and dtypes.
titanic.info()

In [None]:
# Pull out the 'age' column as a Series.
titanic['age']

In [None]:
# Scatter plot of fare against age using matplotlib directly.
plt.scatter(x=titanic['age'], y=titanic['fare']);

In [None]:
# Load seaborn's 'anscombe' example dataset and preview its first five rows.
ans = sns.load_dataset(name='anscombe')
ans.head(n=5)

In [None]:
# The same age-vs-fare scatter plot, drawn with seaborn.
sns.scatterplot(x='age', y='fare', data=titanic);

In [None]:
# All Anscombe points on one set of axes, with no distinction between datasets.
sns.scatterplot(x='x', y='y', data=ans);

In [None]:
# The column that labels which dataset each row belongs to.
ans['dataset']

In [None]:
# Same scatter plot, but color each point by its 'dataset' label.
sns.scatterplot(x='x', y='y', hue='dataset', data=ans);

In [None]:
# Scatter plot with a fitted regression line over all Anscombe rows.
# ci=None is the documented way to turn the confidence band off; ci=False is
# not None, so seaborn would still bootstrap and draw a degenerate band.
sns.lmplot(data=ans, x='x', y='y', ci=None);

In [None]:
# One scatter panel per dataset, laid out as columns of a facet grid.
sns.relplot(x='x', y='y', col='dataset', kind='scatter', data=ans);

In [None]:
# Same faceted scatter plots, wrapping to a new row after every two panels.
sns.relplot(
    x='x',
    y='y',
    col='dataset',
    col_wrap=2,
    kind='scatter',
    data=ans,
);

In [None]:
# Print summary statistics (x/y correlation, means, standard deviations)
# for each Anscombe dataset in turn.
grp = ans.groupby('dataset')
for name, contents in grp:
    # `contents` already holds this group's rows, so no get_group() lookups are needed.
    print(name)
    # Correlate the two numeric columns directly. Calling DataFrame.corr() on the
    # whole group would also include the string 'dataset' column, which raises on
    # pandas >= 2.0 (numeric_only now defaults to False).
    print('Correlation\t', round(contents['x'].corr(contents['y']), 4))
    print('Mean x\t\t', contents['x'].mean())
    print('Mean y\t\t', round(contents['y'].mean(), 2))
    print('Std x\t\t', round(contents['x'].std(), 4))
    print('Std y\t\t', round(contents['y'].std(), 2))
    print('\n')

In [None]:
# Read the births table from disk and preview its first five rows.
births = pd.read_csv(filepath_or_buffer='data/baby.csv')
births.head(n=5)

In [None]:
# (rows, columns) of the births table.
births.shape

In [None]:
# Tally how many rows fall under each distinct 'Maternal Smoker' value.
births['Maternal Smoker'].value_counts()

In [None]:
# Bar chart of the smoker / non-smoker counts via the pandas plotting accessor;
# rot=0 keeps the tick labels horizontal.
births['Maternal Smoker'].value_counts().plot.bar(rot=0);

In [None]:
# seaborn does the counting itself: one bar per 'Maternal Smoker' value.
sns.countplot(x='Maternal Smoker', data=births);

In [None]:
# Read the majors table from disk and echo it.
majors = pd.read_csv(filepath_or_buffer='data/majors.csv')
majors

In [None]:
# Build the majors table by hand: each dict key becomes a column and the
# lists supply one value per row.
d = {
    'major': ['Data Science', 'History', 'Biology', 'Business'],
    'gpas': [3.25, 3.1, 2.95, 3.42],
}

majors = pd.DataFrame(d)
majors

In [None]:
# Bar chart through the pandas plotting API: one bar per major, height from 'gpas'.
majors.plot(kind='bar', x='major', y='gpas', rot=0);

In [None]:
# The same bar chart drawn with matplotlib directly (positions first, then heights).
plt.bar(majors['major'], majors['gpas']);

In [None]:
# And once more with seaborn.
sns.barplot(x='major', y='gpas', data=majors);

In [None]:
# Rug plot: one tick mark per observation in the 'Birth Weight' column.
sns.rugplot(births['Birth Weight']);

In [None]:
# Rug plot from the frame plus a column name, with taller ticks (height=0.1).
sns.rugplot(x='Birth Weight', height=0.1, data=births);

In [None]:
# Bin edges every 5 units, starting at 50 and stopping before 200.
bins = range(50, 200, 5)
# Count histogram of birth weights; black edges separate adjacent bars.
plt.hist(x=births['Birth Weight'], edgecolor='black', bins=bins);

In [None]:
# Same bin edges as the count histogram, but normalized to a density (density=True).
plt.hist(
    x=births['Birth Weight'],
    bins=bins,
    edgecolor='white',
    density=True,
);

In [None]:
# Density histogram with much wider bins: edges every 20 units from 50, stopping before 200.
plt.hist(
    x=births['Birth Weight'],
    bins=np.arange(50, 200, 20),
    edgecolor='white',
    density=True,
);

In [None]:
# Density histogram with hand-picked, unequal-width bins.
plt.hist(
    x=births['Birth Weight'],
    bins=[50, 100, 120, 140, 200],
    density=True,
    edgecolor='white',
);

In [None]:
# seaborn histogram of birth weights with a kernel density estimate overlaid.
# kde takes a boolean; the string 'True' only worked because any non-empty
# string is truthy.
sns.histplot(data=births, x='Birth Weight', edgecolor='white', kde=True);

In [None]:
# The kernel density estimate on its own, without the histogram bars.
sns.kdeplot(x='Birth Weight', data=births);

In [None]:
# Keep the rows whose 'Maternal Smoker' value is not True, then plot their
# birth-weight distribution with a KDE overlay.
non_smoking = births.loc[births['Maternal Smoker'].ne(True)]
sns.histplot(x='Birth Weight', kde=True, edgecolor='white', data=non_smoking)
plt.title('Non-Smoking Mothers');

In [None]:
# Keep the rows whose 'Maternal Smoker' value is True, then plot their
# birth-weight distribution in orange with a KDE overlay.
smoking = births.loc[births['Maternal Smoker'].eq(True)]
sns.histplot(
    x='Birth Weight',
    kde=True,
    color='orange',
    edgecolor='white',
    data=smoking,
)
plt.title('Smoking Mothers');

In [None]:
# Build a two-column frame (birth weight plus smoker status cast to a
# categorical dtype) and draw one horizontal box per smoker category.
smoke = births['Maternal Smoker'].astype('category')
bweight = births['Birth Weight']
df = pd.concat([bweight, smoke], axis='columns')
sns.boxplot(x='Birth Weight', y='Maternal Smoker', data=df);

In [None]:
# Echo the smoker column after its conversion to a categorical dtype.
smoke

In [None]:
# Same comparison with the axes swapped: categories on x, vertical boxes.
sns.boxplot(x='Maternal Smoker', y='Birth Weight', orient='v', data=df);

In [None]:
# Birth weight plotted against maternal height.
sns.scatterplot(x='Maternal Height', y='Birth Weight', data=births);

In [None]:
# Same scatter plot, coloring each point by the 'Maternal Smoker' value.
sns.scatterplot(
    x='Maternal Height',
    y='Birth Weight',
    hue='Maternal Smoker',
    data=births,
);

In [None]:
# Scatter plot of birth weight against maternal height with a fitted regression line.
# ci=None is the documented way to turn the confidence band off; ci=False is
# not None, so seaborn would still bootstrap and draw a degenerate band.
sns.lmplot(data=births, x='Maternal Height', y='Birth Weight', ci=None);

In [None]:
# Joint view: scatter plot in the middle with marginal plots on the sides;
# marginal_kws passes edgecolor='white' through to the marginal plots.
sns.jointplot(
    x='Maternal Height',
    y='Birth Weight',
    marginal_kws=dict(edgecolor='white'),
    data=births,
);

In [None]:
# Joint view using filled kernel density estimates instead of scatter points.
sns.jointplot(
    x='Maternal Height',
    y='Birth Weight',
    kind='kde',
    fill=True,
    data=births,
);