# [LAB#01] 탐색적 데이터 분석 (exploratory data analysis, EDA)

- 분석 데이터: "**제7차 한국인 인체치수 측정 데이터**"
- `pandas`의 `DataFrame` 사용법
- `matplotlib`을 이용한 그래프 그리기

In [None]:
# IPython shell escape: list the files in the current working directory.
!ls

In [None]:
import numpy as np
import scipy as sp
from scipy.stats import norm
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Default resolution (dots per inch) for every figure created in this notebook.
matplotlib.rcParams['figure.dpi'] = 300
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

---

## 데이터 읽고 전처리하기

In [None]:
# thousands=',' makes pandas treat "," as a thousands separator while parsing
# numbers. Presumably a plain pd.read_csv("2015_7th_korean-body.csv") leaves
# comma-grouped columns as strings -- confirm against the data file.
df = pd.read_csv("2015_7th_korean-body.csv", thousands=',')

In [None]:
# Show the Python type of the object returned by read_csv.
type(df)

In [None]:
# Column labels of the full table.
df.columns

In [None]:
# Print every column label next to its positional index.
for position, column_name in enumerate(df.columns):
    print(f"{position} {column_name}")

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Selecting with a single label returns one column as a Series.
df["①_003_키"]

In [None]:
# Selecting with a list of labels returns a DataFrame with those columns.
df[["①_003_키", "①_031_몸무게"]]

In [None]:
# Work on an independent copy of the two columns so later edits to `sdf`
# do not touch (or warn about touching) `df`.
sdf = df[["①_003_키", "①_031_몸무게"]].copy()

In [None]:
# The two-column selection is still a DataFrame.
type(sdf)

In [None]:
# First 10 rows.
sdf.head(10)

In [None]:
# Per-column dtypes; an `object` dtype here means the column was not parsed as numbers.
sdf.dtypes

In [None]:
# Column labels before renaming.
sdf.columns

In [None]:
# Replace the original labels with short ASCII names, in column order.
sdf.columns = ["height", "weight"]

In [None]:
# First 5 rows, now with the renamed columns.
sdf.head()

In [None]:
# Column access by label ...
sdf["height"]

In [None]:
# ... and the equivalent attribute-style access.
sdf.height

In [None]:
sdf["height"].str.replace(',', '')

In [None]:
sdf["height"] = sdf["height"].str.replace(',', '')

In [None]:
# Check the first rows after removing the separators.
sdf.head()

In [None]:
# Removing commas alone does not change a string column's dtype.
sdf.dtypes

In [None]:
# Convert both columns to 32-bit floats.
sdf = sdf.astype(np.float32)

In [None]:
# Both columns should now report float32.
sdf.dtypes

In [None]:
# First rows after the numeric conversion.
sdf.head()

---

## 결측값 처리하기

In [None]:
# Row/column count before any missing-value handling.
sdf.shape

In [None]:
# `count` excludes NaN, so a count lower than the row total reveals missing values.
sdf.describe()

In [None]:
# Boolean DataFrame of the same shape: True where a value is missing.
sdf.isna()

In [None]:
# Look at the row labelled 6420 (presumably one with a missing value -- confirm).
sdf.loc[6420]

In [None]:
# The same row label in the full table, with every original column.
df.loc[6420]

In [None]:
# np.where on the boolean mask returns two parallel arrays: the positional
# row indices (ir) and column indices (ic) of every missing cell.
ir, ic = np.where(sdf.isna())
ir, ic

In [None]:
# IPython help: show the docstring of DataFrame.isna.
sdf.isna?

In [None]:
# Number of rows that have NA in any column.
# `ir` holds one entry per missing *cell*, so a row missing both height and
# weight appears in it twice; np.unique collapses those repeats.
np.unique(ir).size

In [None]:
# Show the rows that contain a missing value.
# np.where yields positional indices, so they are looked up with .iloc
# (.loc is label-based and only agrees while the index is the default
# 0..n-1 range); np.unique keeps a row with several NaNs from being repeated.
sdf.iloc[np.unique(ir)]

In [None]:
# New DataFrame without the rows that contain any NaN; `sdf` itself is unchanged.
sdf_nona = sdf.dropna()

In [None]:
# IPython help: show the docstring of DataFrame.dropna.
sdf.dropna?

In [None]:
# Summary statistics of the rows that remain after dropping NaN.
sdf_nona.describe()

---

## pandas에 내장된 matplotlib 기능 이용하기

In [None]:
# Height-vs-weight scatter drawn through pandas' plotting front end.
# The colour is an RGBA tuple: pure blue at 10 % opacity, so overlapping
# points build up into darker regions.
sdf.plot(kind="scatter", x="height", y="weight", marker='o', color=(0, 0, 1.0, 0.1))

---

## matplotlib 함수로 그려보기

In [None]:
# The same scatter plot built directly with pyplot calls.
# RGBA: blue at 10 % opacity.
plt.scatter(sdf["height"], sdf["weight"], marker='o', color=(0, 0, 1.0, 0.1))
plt.xlabel("Height")
plt.ylabel("Weight")

In [None]:
import matplotlib.font_manager as fm

# Report every font known to matplotlib whose family name contains "Malgun".
malgun_fonts = [entry for entry in fm.fontManager.ttflist if 'Malgun' in entry.name]
for entry in malgun_fonts:
    print(f'Font: {entry.name}, Path: {entry.fname}')

In [None]:
# Select a font that can render the Korean axis labels used in later cells.
import os

font_path = r"C:\Windows\Fonts\malgun.ttf"
if os.path.exists(font_path):
    # Windows: read the family name straight from the font file.
    font_prop = fm.FontProperties(fname=font_path)
    font_name = font_prop.get_name()
else:
    # The Windows path does not exist on other systems, where
    # FontProperties.get_name() would raise FileNotFoundError. Fall back to
    # the first Korean-capable family that matplotlib already knows about.
    installed_families = {entry.name for entry in fm.fontManager.ttflist}
    korean_families = ("Malgun Gothic", "AppleGothic", "NanumGothic")
    font_name = next(
        (family for family in korean_families if family in installed_families),
        None,
    )
    font_prop = fm.FontProperties(family=font_name) if font_name else fm.FontProperties()

if font_name is not None:
    matplotlib.rc('font', family=font_name)
else:
    # Keep matplotlib's default font instead of failing the whole cell.
    print("No Korean font found; Korean labels may not render correctly.")

In [None]:
# Scatter every floating-point column against body weight, one figure each.
for col in df.columns:
    # Only float columns are plotted; everything else is skipped.
    if df[col].dtype not in (np.float32, np.float64):
        continue

    fig = plt.figure()
    plt.scatter(df[col],
                df["①_031_몸무게"],
                marker='o',
                color=(0, 0, 1.0, 0.1))
    plt.xlabel(col)
    plt.ylabel("몸무게")
    # Draw and release each figure immediately. Leaving one open figure per
    # column until the end of the cell keeps them all in memory and triggers
    # matplotlib's "more than 20 figures have been opened" warning.
    plt.show()
    plt.close(fig)

In [None]:
# Same scatter plots, restricted to the columns from position 120 onward.
for col in df.columns[120:]:
    # Only float columns are plotted; everything else is skipped.
    if df[col].dtype not in (np.float32, np.float64):
        continue

    fig = plt.figure()
    plt.scatter(df[col],
                df["①_031_몸무게"],
                marker='o',
                color=(0, 0, 1.0, 0.1))
    plt.xlabel(col)
    plt.ylabel("몸무게")
    # Draw and release each figure immediately so open figures do not pile
    # up until the end of the cell.
    plt.show()
    plt.close(fig)

---

## 확률 밀도 함수 그려보기

[Wikipedia: normal distribution](https://en.wikipedia.org/wiki/Normal_distribution)

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/Normal_Distribution_PDF.svg/1920px-Normal_Distribution_PDF.svg.png"
     width="500px">

In [None]:
# Describe the heights with a normal distribution whose parameters are the
# sample mean and standard deviation, evaluated on a 100-point grid
# spanning 1300-2100.
height = sdf.height
mu, std = height.mean(), height.std()  # pandas skips NaN in both statistics

x = np.linspace(1300, 2100, num=100)
p = norm.pdf(x, loc=mu, scale=std)  # from scipy

print("[HEIGHT] Average:", mu)
print("[HEIGHT] Std. Dev.", std)

In [None]:
# IPython help: show the docstring of scipy.stats.norm.pdf.
norm.pdf?

In [None]:
# Histogram bin width.
step = 10  # (mm)
# Number of bins needed to cover the 1300-2100 range at that width.
n_bins = (2100 - 1300) // step

In [None]:
# Height histogram (left axis, counts) overlaid with the fitted density
# curve held in `p` (right axis).
fig, ax1 = plt.subplots(figsize=(10, 4))

ax1.hist(height,
         range=(1300, 2100),
         bins=n_bins,
         histtype='bar',
         color='orange',
         edgecolor='k')

# One labelled tick per 100 mm. Ticking at every bin edge (10 mm) puts about
# 80 overlapping labels on a 10-inch axis, and an upper bound beyond 2100
# stretches the axis past the histogram range.
ax1.set_xticks(np.arange(1300, 2101, 100))
ax1.set_ylabel("Count", fontsize=16, labelpad=16)

# Second y-axis sharing the same x-axis for the density curve.
ax2 = ax1.twinx()
ax2.plot(x, p, 'k', linewidth=2)
# Start at zero so the curve's baseline lines up with the histogram's.
ax2.set_ylim(bottom=0)
# norm.pdf returns a density, not a probability.
ax2.set_ylabel("Probability density", fontsize=16, rotation=270, labelpad=16)

ax1.set_title('Height (mm)', fontsize=20, pad=16)
plt.show()

---

## 다른 확률 밀도 함수도 적용해 보기

[Wikipedia: Cauchy distribution](https://en.wikipedia.org/wiki/Cauchy_distribution)

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Cauchy_pdf.svg/600px-Cauchy_pdf.svg.png"
     width="500px">

In [None]:
# Remove missing heights before fitting a distribution to them.
# Rebind to a new Series instead of dropping in place: `height` was taken
# from a column of `sdf`, and mutating it in place can raise pandas'
# copy/view warnings.
height = height.dropna()

In [None]:
from scipy.stats import cauchy
# Fit a Cauchy distribution to the heights; fit() returns the location (x0)
# and scale (gamma) estimates.
x0, gamma = cauchy.fit(height)  # from scipy
x0, gamma

In [None]:
# Evaluate the fitted Cauchy density on the grid `x`; this overwrites the
# normal-density values previously stored in `p`.
p = cauchy.pdf(x, x0, gamma)

In [None]:
# Height histogram (left axis, counts) overlaid with the fitted density
# curve held in `p` (right axis).
fig, ax1 = plt.subplots(figsize=(10, 4))

ax1.hist(height,
         range=(1300, 2100),
         bins=n_bins,
         histtype='bar',
         color='orange',
         edgecolor='k')

# One labelled tick per 100 mm, including the upper edge at 2100. Ticking at
# every bin edge (10 mm) puts about 80 overlapping labels on a 10-inch axis.
ax1.set_xticks(np.arange(1300, 2101, 100))
ax1.set_ylabel("Count", fontsize=16, labelpad=16)

# Second y-axis sharing the same x-axis for the density curve.
ax2 = ax1.twinx()
ax2.plot(x, p, 'k', linewidth=2)
# Start at zero so the curve's baseline lines up with the histogram's.
ax2.set_ylim(bottom=0)
# cauchy.pdf returns a density, not a probability.
ax2.set_ylabel("Probability density", fontsize=16, rotation=270, labelpad=16)

ax1.set_title('Height (mm)', fontsize=20, pad=16)
plt.show()

---

## 성별로 그룹 짓기

In [None]:
# IPython help: show the docstring of DataFrame.groupby.
df.groupby?

In [None]:
# Group the rows by the value of the sex column; the result is a lazy
# GroupBy object, shown here by its repr.
gb = df.groupby(by="ⓞ_02_성별")
gb

In [None]:
# Iterating a GroupBy yields (key, sub-DataFrame) pairs.
for sex, group in gb:
    print(sex, group, type(group))

In [None]:
# Build clean height/weight tables for each sex.
col_height = "①_003_키"
col_weight = "①_031_몸무게"

# Full per-sex tables, looked up by group key.
df_male = gb.get_group("남")
df_female = gb.get_group("여")

# Keep only the two measurements and give them short names.
sdf_male = df_male[[col_height, col_weight]].copy()
sdf_male.columns = ["height", "weight"]
sdf_female = df_female[[col_height, col_weight]].copy()
sdf_female.columns = ["height", "weight"]

# No comma stripping / float conversion is applied to the height column here.

# Discard rows with a missing height or weight.
sdf_male = sdf_male.dropna()
sdf_female = sdf_female.dropna()

In [None]:
# Height/weight table for the male group.
sdf_male

In [None]:
# Height/weight table for the female group.
sdf_female

In [None]:
# Overlay the male and female height histograms on one set of axes.
fig = plt.figure(figsize=(10, 4))

step = 10
n_bins = (2100 - 1300) // step

# Options shared by both histograms.
shared_hist_options = dict(range=(1300, 2100),
                           bins=n_bins,
                           histtype='bar',
                           edgecolor='k')

# Half-transparent fills (RGBA) so the overlap between the groups stays visible.
plt.hist(sdf_male.height, color=(0, 0, 1.0, 0.5), **shared_hist_options)    # blue
plt.hist(sdf_female.height, color=(1.0, 0, 0, 0.5), **shared_hist_options)  # red

plt.title('Male & Female Heights', fontsize=20, pad=16)
plt.xlabel("Height (mm)", fontsize=16, labelpad=16)
plt.ylabel("Count", fontsize=16, labelpad=16)
# Labels are matched to the histograms in drawing order.
plt.legend(["Male", "Female"])
plt.show()

---

## 나이로 그룹 짓기

In [None]:
# Summary statistics of the rounded-age column (its min/max guide the bin edges below).
df["ⓞ_06_나이_반올림"].describe()

In [None]:
# Bin ages into decades: [10, 20), [20, 30), ..., [60, 70).
# right=False closes each bin on the left, so a 20-year-old falls into the
# bin whose left edge is 20. With pd.cut's default (right=True) the bins are
# (10, 20], (20, 30], ..., which files age 20 under the bin starting at 10
# and drops age 10 entirely. Ages outside [10, 70) become NaN.
categories = pd.cut(df["ⓞ_06_나이_반올림"], np.arange(10, 71, 10), right=False)
categories

In [None]:
# Group the full table by age bin and print summary statistics per bin.
# `observed` is passed explicitly because grouping by a categorical without
# it raises a FutureWarning on recent pandas; False is the long-standing
# default behaviour.
gb = df.groupby(categories, observed=False)
for age, group in gb:
    print(age)
    print(group.describe())

In [None]:
# Map each age bin to its own height/weight DataFrame.
dfs_age = {}

for age, group in gb:
    group = group[["①_003_키", "①_031_몸무게"]].copy()
    group.columns = ["height", "weight"]

    # `age` is the bin's Interval; its left edge serves as the dictionary key.
    dfs_age[age.left] = group

# The loop variable is deliberately not named `sdf`: at notebook top level
# that would overwrite the height/weight table `sdf` built earlier and break
# any earlier cell that is re-run afterwards.
for age, age_df in dfs_age.items():
    print(age, type(age_df))

In [None]:
# One height histogram per age bin, laid out on a 2 x 3 grid.
nrows = 2
ncols = 3

# Coarser bins than before; each panel holds only part of the data.
step = 50
n_bins = (2100 - 1300) // step

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 6))

# Pair each age group with one panel. The per-group table is named `age_df`
# rather than `sdf` so the notebook-level `sdf` table is not overwritten.
for (age, age_df), ax in zip(dfs_age.items(), axes.flatten()):
    ax.hist(age_df.height,
            range=(1300, 2100),
            bins=n_bins,
            histtype='bar',
            color="skyblue",
            edgecolor='k')
    ax.set_title("Age=%d's"%(age))
    ax.set_xlabel("Height (mm)")
    ax.set_ylabel("Count")

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()

In [None]:
# The 2-D array of Axes flattened into a 1-D array, in row-major order.
axes.flatten()

In [None]:
# Height-vs-weight scatter per age bin, one panel each (grid size comes from
# the `nrows` / `ncols` set in an earlier cell).
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 6))

# The per-group table is named `age_df` rather than `sdf` so the
# notebook-level `sdf` table is not overwritten.
for (age, age_df), ax in zip(dfs_age.items(), axes.flatten()):
    ax.scatter(age_df.height,
               age_df.weight,
               marker='o',
               color=(0, 0, 1.0, 0.1))
    ax.set_title("Age=%d's"%(age))
    ax.set_xlabel("Height (mm)")
    ax.set_ylabel("Weight (kg)")

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()

In [None]:
from matplotlib import colors as mpl_colors

nrows = 2
ncols = 3

step = 10
n_bins = (2100 - 1300) // step

# One named colour per age bin, in dictionary order.
colors = ["red", "orange", "yellow", "green", "skyblue", "blue", "purple"]

# Overlay the height histograms of all age bins on a single set of axes.
fig = plt.figure(figsize=(10, 4))

# The per-group table is named `age_df` rather than `sdf` so the
# notebook-level `sdf` table is not overwritten.
for (age, age_df), color_name in zip(dfs_age.items(), colors):
    plt.hist(age_df.height,
             range=(1300, 2100),
             bins=n_bins,
             histtype='bar',
             # to_rgba applies the opacity directly; 0.7 keeps the
             # histograms underneath visible.
             color=mpl_colors.to_rgba(color_name, alpha=0.7),
             edgecolor='k',
             # Label each histogram itself so the legend cannot drift out of
             # step with the drawing order.
             label="%d's"%(age))

plt.xlabel("Height", fontsize=16, labelpad=16)
plt.ylabel("Count", fontsize=16, labelpad=16)
plt.title('Histogram by age', fontsize=18, pad=16)
plt.legend()
plt.show()