In [None]:
# -q (quiet) 옵션은 로그 안보여줌 (조용히 함)
%pip install -q seaborn

Note: you may need to restart the kernel to use updated packages.


In [41]:
# 타이타닉 데이터셋 불러오기
# (pandas에 기본 내장된 예제 데이터셋)
import seaborn as sns
import pandas as pd

# titanic 데이터 DataFrame
titanic = sns.load_dataset('titanic')

# 처음 5개 행 보기
print("타이타닉 데이터 미리보기:")
print(titanic.head())

# 데이터셋 정보 확인
print("\n데이터셋 정보:")
print(titanic.info())

# 데이터 요약 통계
print("\n데이터 요약 통계:")
print(titanic.describe())

타이타닉 데이터 미리보기:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

데이터셋 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0 

In [3]:
print(titanic.shape)
print(titanic.columns)

(891, 15)
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


In [24]:
# col 결측치 합계
print(titanic.isna().sum())

# 승객 등급 등장 빈도 (1, 2, 3 등석)
print(titanic['pclass'].value_counts())

# 성별 빈도 (남/녀)
print(titanic['sex'].value_counts())

# 생존 여부 빈도 (생존자/사망자)
print(titanic['survived'].value_counts())


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
pclass
3    491
1    216
2    184
Name: count, dtype: int64
sex
male      577
female    314
Name: count, dtype: int64
survived
0    549
1    342
Name: count, dtype: int64


In [36]:
# 성별에 따른 생존율
print(titanic.groupby('sex')['survived'].mean() * 100)
print(titanic.groupby('sex').agg({'survived': ['mean']}))  # 더 복잡한 통계 가능

# 승객 등급에 따른 생존율
print(titanic.groupby('pclass')['survived'].mean() * 100)
print(titanic.groupby('pclass').agg({'survived': ['mean']}))  # 더 복잡한 통계 가능

sex
female    74.203822
male      18.890815
Name: survived, dtype: float64
        survived
            mean
sex             
female  0.742038
male    0.188908
pclass
1    62.962963
2    47.282609
3    24.236253
Name: survived, dtype: float64
        survived
            mean
pclass          
1       0.629630
2       0.472826
3       0.242363


In [53]:
# 성별 & 승객 등급에 따른 생존율
print(titanic.groupby(['sex', 'pclass'])['survived'].mean() * 100)
print(titanic.pivot_table(
    values='survived',
    index='sex',
    columns='pclass',
    aggfunc='mean'
))

# 나이 그룹별 생존율
titanic['age_group'] = pd.cut(
    titanic['age'],
    bins=[0,12,18,35,60,100],  # 초과 ~ 이하
    labels=['아동','청소년','청년','중장년','노년']
)
titanic.head()

# 카테고리가 있는데, 해당되는 내용이 없을 때 표시한다 / 안 한다. observed 옵션
print(titanic.groupby('age_group', observed=True)['survived'].mean() * 100)

# 성별 + 나이 그룹으로 생존율 확인
print(titanic.groupby(['sex', 'age_group'])['survived'].mean() * 100)

sex     pclass
female  1         96.808511
        2         92.105263
        3         50.000000
male    1         36.885246
        2         15.740741
        3         13.544669
Name: survived, dtype: float64
pclass         1         2         3
sex                                 
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447
age_group
아동     57.971014
청소년    42.857143
청년     38.268156
중장년    40.000000
노년     22.727273
Name: survived, dtype: float64
sex     age_group
female  아동            59.375000
        청소년           75.000000
        청년            78.333333
        중장년           77.142857
        노년           100.000000
male    아동            56.756757
        청소년            8.823529
        청년            18.067227
        중장년           19.200000
        노년            10.526316
Name: survived, dtype: float64


  print(titanic.groupby(['sex', 'age_group'])['survived'].mean() * 100)


In [83]:
titanic = sns.load_dataset('titanic')
# 결측치 확인
missing = titanic.isna().sum()
# 결측 있는 항목만 확인
print(missing[missing > 0])

# 결측 비율
missing_p = titanic.isna().mean() * 100
missing_p[missing_p > 0]

age            177
embarked         2
deck           688
embark_town      2
dtype: int64


age            19.865320
embarked        0.224467
deck           77.216611
embark_town     0.224467
dtype: float64

In [80]:
# 결측치 채우기
# age: 중요한 정보 : 평균/중앙값 대체
# embarked: 가장 많은 사람들이 탄 곳으로 대체
# deck: 추측 불가능(의미 없음) -> 삭제

# 필요하다면, 카피 떠서 진행
titanic_processed = titanic.copy()

# 비어있던 행 마스킹
age_mask = titanic['age'].isna()

# 나이 결측치 채우기 (남녀상관 없이 전체 평균으로 채움)
mean_age = titanic['age'].mean()
titanic_processed['age'] = titanic['age'].fillna(mean_age)

# 결측치 있는지 재확인
titanic_processed.isna().sum()

# 평균으로 채워진 값들만 확인
titanic_processed[age_mask]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,29.699118,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,29.699118,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,29.699118,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,29.699118,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,29.699118,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,29.699118,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,29.699118,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,29.699118,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [84]:
# 전체 평균으로 대체하는 게 옳을까?
# 성별/객실별로 평균을 추정하는 건?

# 모든 사람들을 성별/객실 그룹의 평균 나이로 바꾼 Series
mean_ages = titanic.groupby(['sex', 'pclass'])['age'].transform('mean')

# titanic 의 age col 중 빈 값만, mean_ages 로 채움
titanic_processed['age'] = titanic['age'].fillna(mean_ages)

titanic_processed.loc[titanic['age'].isna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,26.507589,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,30.740707,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,21.750000,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,26.507589,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,21.750000,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,26.507589,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,21.750000,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,26.507589,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,26.507589,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [91]:
# embarked (탑승 도시) -> 최빈값(가장 많이 탄 곳)

# 비어있는 항구 개수
titanic['embarked'].isna().sum()

# 최빈값 (Series - 벡터)
mode_embarked = titanic['embarked'].mode()[0]  # Series 에서 1개 뽑기

titanic_processed['embarked'] = titanic['embarked'].fillna(mode_embarked)

# 처리 완료 확인 (빈값 0개)
titanic_processed.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [None]:
# deck 은 77% 가 비어있음 -> 삭제
titanic['deck'].isna().mean() * 100  # 빈값 비율

# 'deck' 컬럼은 삭제 - inplace 는 실행하면 2번째 실행 불가능
titanic_processed.drop('deck', axis=1, inplace=True)

titanic_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


In [26]:
survived0 = titanic[titanic['survived'] == 0]
survived1 = titanic[titanic['survived'] == 1]

print(survived0['survived'].count())
print(titanic[(titanic['survived'] == 0) & (titanic['adult_male'] == True)])

549
     survived  pclass   sex   age  sibsp  parch     fare embarked   class  \
0           0       3  male  22.0      1      0   7.2500        S   Third   
4           0       3  male  35.0      0      0   8.0500        S   Third   
5           0       3  male   NaN      0      0   8.4583        Q   Third   
6           0       1  male  54.0      0      0  51.8625        S   First   
12          0       3  male  20.0      0      0   8.0500        S   Third   
..        ...     ...   ...   ...    ...    ...      ...      ...     ...   
881         0       3  male  33.0      0      0   7.8958        S   Third   
883         0       2  male  28.0      0      0  10.5000        S  Second   
884         0       3  male  25.0      0      0   7.0500        S   Third   
886         0       2  male  27.0      0      0  13.0000        S  Second   
890         0       3  male  32.0      0      0   7.7500        Q   Third   

     who  adult_male deck  embark_town alive  alone  
0    man        T

In [36]:
print(titanic.groupby('pclass')['survived'].agg('count'))
print(survived1.groupby('pclass')['survived'].agg('count'))

pclass
1    216
2    184
3    491
Name: survived, dtype: int64
pclass
1    136
2     87
3    119
Name: survived, dtype: int64


In [24]:

print(survived1.head(10))

    survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
1          1       1  female  38.0      1      0  71.2833        C   First   
2          1       3  female  26.0      0      0   7.9250        S   Third   
3          1       1  female  35.0      1      0  53.1000        S   First   
8          1       3  female  27.0      0      2  11.1333        S   Third   
9          1       2  female  14.0      1      0  30.0708        C  Second   
10         1       3  female   4.0      1      1  16.7000        S   Third   
11         1       1  female  58.0      0      0  26.5500        S   First   
15         1       2  female  55.0      0      0  16.0000        S  Second   
17         1       2    male   NaN      0      0  13.0000        S  Second   
19         1       3  female   NaN      0      0   7.2250        C   Third   

      who  adult_male deck  embark_town alive  alone  
1   woman       False    C    Cherbourg   yes  False  
2   woman       False  NaN  Sou