In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = sns.load_dataset("titanic")

In [4]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [10]:
df.iloc[0]

survived                 0
pclass                   3
sex                   male
age                   22.0
sibsp                    1
parch                    0
fare                  7.25
embarked                 S
class                Third
who                    man
adult_male            True
deck                   NaN
embark_town    Southampton
alive                   no
alone                False
family                   1
Name: 0, dtype: object

In [7]:
df["family"] = df["sibsp"] + df["parch"]

In [14]:
# 필터링 (디카프리오)
# 남자
# 나이 20살 이상
# 나이 25살 미만
# 3 클래스
# 죽었음
filtered = (df["sex"] == "male")
filtered = filtered & (df["pclass"] == 3)
filtered = filtered & ((df["age"] > 20) & (df["age"] < 25))
filtered &= (df["survived"] == 0)

In [16]:
df[filtered].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,1
37,0,3,male,21.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0
51,0,3,male,21.0,0,0,7.8,S,Third,man,True,,Southampton,no,True,0
60,0,3,male,22.0,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True,0
80,0,3,male,22.0,0,0,9.0,S,Third,man,True,,Southampton,no,True,0


In [28]:
# apply
# age below 20 : young
# age below 40 : middle
# age above 40 : old
age_class = list()
for age in df["age"]:
    if age < 20:
        age_class.append("young")
    elif age < 40:
        age_class.append("middle")
    else:
        age_class.append("old")

In [21]:
df["age_class"] = df["age"].apply(lambda x: "young" if x < 20 else ("middle" if x < 40 else "old"))
df["age_class"]

0      middle
1      middle
2      middle
3      middle
4      middle
        ...  
886    middle
887     young
888       old
889    middle
890    middle
Name: age_class, Length: 891, dtype: object

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
 15  family       891 non-null    int64   
 16  age_class    891 non-null    object  
dtypes: bool(2), category(2), float64(2), int64(5), object(6)
memory usage: 94

In [30]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family,age_class,fare_class
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,1,middle,low
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,middle,moddle
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,middle,low
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,middle,moddle
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,middle,low


In [31]:
# fare
# below 10$ : low
# below 100$ : middle
# above 100$ : high
df["fare_class"] = df["fare"].apply(lambda x: "low" if x < 10 else ("moddle" if x < 100 else "high"))
df["fare_class"]

0         low
1      moddle
2         low
3      moddle
4         low
        ...  
886    moddle
887    moddle
888    moddle
889    moddle
890       low
Name: fare_class, Length: 891, dtype: object

In [32]:
# groupby
# 집계 : 여러개의 데이터를 하나의 값으로 축약
# mean, max, min, count, sum, ...
df["age"].mean()

29.69911764705882

In [35]:
# groupby : 카테고리별로 모아서 집계
df.groupby("pclass")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021CAFF9F040>

In [36]:
df.groupby("pclass")["age"].mean()

pclass
1    38.233441
2    29.877630
3    25.140620
Name: age, dtype: float64

In [37]:
df.groupby("sex")["age"].mean()

sex
female    27.915709
male      30.726645
Name: age, dtype: float64

In [43]:
# multi index 1 dim Series
df_group = df.groupby(["pclass", "sex"])["age"].mean()
df_group

pclass  sex   
1       female    34.611765
        male      41.281386
2       female    28.722973
        male      30.740707
3       female    21.750000
        male      26.507589
Name: age, dtype: float64

In [44]:
df_group.index

MultiIndex([(1, 'female'),
            (1,   'male'),
            (2, 'female'),
            (2,   'male'),
            (3, 'female'),
            (3,   'male')],
           names=['pclass', 'sex'])

In [46]:
# groupby : 1차원, 카테고리별 집계
# pivot : 2차원 집계
pd.pivot_table(df, columns="sex", index="pclass", values="age", aggfunc="count")

sex,female,male
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,85,101
2,74,99
3,102,253
