## 모듈 import

In [1]:
from IPython.display import Image
import numpy as np
import pandas as pd
import seaborn as sns 

## 데이터셋 로드

In [2]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


**컬럼(columns) 설명**

- survivied: 생존여부 (1: 생존, 0: 사망)
- pclass: 좌석 등급 (1등급, 2등급, 3등급)
- sex: 성별
- age: 나이
- sibsp: 형제 + 배우자 수
- parch: 부모 + 자녀 수
- fare: 좌석 요금
- embarked: 탑승 항구 (S, C, Q)
- class: pclass와 동일
- who: 성별과 동일
- adult_male: 성인 남자 여부
- deck: 데크 번호 (알파벳 + 숫자 혼용)
- embark_town: 탑승 항구 이름
- alive: 생존여부 (yes, no)
- alone: 혼자 탑승 여부

## 상위 5개의 행 출력

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


데이터는 몇개의 행과 열로 이루어져 있는지 확인

In [5]:
df.shape

(891, 15)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


데이터의 컬럼별 결측치 확인

In [7]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

생존자와 사망자의 분포 확인

In [8]:
df['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

항구별 생존자 합계 계산

In [9]:
df.groupby('embarked')['survived'].sum()

embarked
C     93
Q     30
S    217
Name: survived, dtype: int64

항구별 생존율 계산

In [10]:
df.groupby('embarked')['survived'].mean()

embarked
C    0.553571
Q    0.389610
S    0.336957
Name: survived, dtype: float64

항구별 생존자 합계 및 생존율 계산

In [11]:
df.groupby('embarked')['survived'].agg(['sum','mean'])

Unnamed: 0_level_0,sum,mean
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,93,0.553571
Q,30,0.38961
S,217,0.336957


성별 생존자 합계 및 생존율 계산

In [12]:
df.groupby('sex')['survived'].agg(['sum','mean'])

Unnamed: 0_level_0,sum,mean
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,233,0.742038
male,109,0.188908


혼자인 경우와 혼자가 아닌 경우 합계 & 생존율 계산

In [13]:
df.groupby('alone')['survived'].agg(['sum','mean'])

Unnamed: 0_level_0,sum,mean
alone,Unnamed: 1_level_1,Unnamed: 2_level_1
False,179,0.50565
True,163,0.303538


등급 ( 별 생존자 합계 & 생존율 계산

In [14]:
df.groupby('pclass')['survived'].agg(['sum','mean'])

Unnamed: 0_level_0,sum,mean
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,136,0.62963
2,87,0.472826
3,119,0.242363


성별 , 등급별 생존자 합계 & 생존율 계산

In [15]:
df.groupby(['sex','pclass'])['survived'].agg(['sum','mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,mean
sex,pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
female,1,91,0.968085
female,2,70,0.921053
female,3,72,0.5
male,1,45,0.368852
male,2,17,0.157407
male,3,47,0.135447


혼자인 경우 / 성별 합계 & 생존율 계산

In [16]:
df.groupby(['alone','sex'])['survived'].agg(['sum','mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,mean
alone,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
False,female,134,0.712766
False,male,45,0.271084
True,female,99,0.785714
True,male,64,0.155718


who, 등급별 생존자 합계 & 생존율 계산

In [17]:
df.groupby(['who','pclass'])['survived'].agg(['sum','mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,mean
who,pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
child,1,5,0.833333
child,2,19,1.0
child,3,25,0.431034
man,1,42,0.352941
man,2,8,0.080808
man,3,38,0.119122
woman,1,89,0.978022
woman,2,60,0.909091
woman,3,56,0.491228


pivot_table 을 활용하여 , 성별 , pclass) 별 생존자 합계 ( 출력

In [18]:
df.pivot_table(index='sex',
              columns='pclass',
              values = 'survived',
              aggfunc=['sum', 'mean'])

Unnamed: 0_level_0,sum,sum,sum,mean,mean,mean
pclass,1,2,3,1,2,3
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,91,70,72,0.968085,0.921053,0.5
male,45,17,47,0.368852,0.157407,0.135447


who, 등급별 생존자 합계 & 생존율 계산 결과를 토대로 별도의 DataFrame 로 생성
인덱스 초기화 및 생존율 내림차순 정렬

In [19]:
result = df.groupby(['who', 'pclass'])['survived'].agg(['sum', 'mean'])
result = result.reset_index()
result.sort_values(by='mean', ascending=False).reset_index(drop=True)

Unnamed: 0,who,pclass,sum,mean
0,child,2,19,1.0
1,woman,1,89,0.978022
2,woman,2,60,0.909091
3,child,1,5,0.833333
4,woman,3,56,0.491228
5,child,3,25,0.431034
6,man,1,42,0.352941
7,man,3,38,0.119122
8,man,2,8,0.080808


child 의 나이는 몇 세부터 몇 세까지로 정의되어 있는지 확인

In [20]:
df.loc[df['who'] == 'child', 'age'].agg(['min', 'max'])

min     0.42
max    15.00
Name: age, dtype: float64

등급별 (pclass) / 연령별 (who) 평균 요금 비교

In [21]:
pd.DataFrame(df.groupby(['pclass', 'who'])['fare'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,fare
pclass,who,Unnamed: 2_level_1
1,child,139.382633
1,man,65.951086
1,woman,104.317995
2,child,28.323905
2,man,19.054124
2,woman,20.868624
3,child,23.22019
3,man,11.340213
3,woman,15.354351


부자는 살았을까 ? (fare 요금 기준 상위 10% 의 생존율 확인 )
- fare 요금 기준 상위 10% 기준 요금 확인 및 생존자 & 생존율 계산

In [22]:
rich = df['fare'].quantile(0.9)
rich

77.9583

In [23]:
df.loc[df['fare'] >= rich, 'survived'].agg(['count', 'mean'])

count    90.000000
mean      0.766667
Name: survived, dtype: float64

생존자의 평균 나이와 사망자의 평균 나이 비교

In [24]:
df.groupby('survived')['age'].mean()

survived
0    30.626179
1    28.343690
Name: age, dtype: float64

deck 정보가 NaN 인 경우와 채워져 있는 경우 생존율 비교

In [25]:
df.loc[df['deck'].isnull(), 'survived'].mean()

0.29941860465116277

In [26]:
df.loc[df['deck'].notnull(), 'survived'].mean()

0.6699507389162561

결측치 확인

In [27]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

결측치 대체
- embarked 컬럼은 최빈값(mode) 으로 대체

In [28]:
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

In [29]:
# 코드 검증 (Cell 실행시 에러가 나지 않아야 함)
assert 0 == df['embarked'].isnull().sum()

결측치 대체
- age 컬럼은 성별에 따른 평균 나이로 대체

In [30]:
df.groupby('sex')['age'].mean()

sex
female    27.915709
male      30.726645
Name: age, dtype: float64

In [31]:
male_mean = df.groupby('sex')['age'].mean()['male']
female_mean = df.groupby('sex')['age'].mean()['female']

df.loc[df['sex'] == 'male', 'age'] = \
    df.loc[df['sex'] == 'male', 'age'].fillna(male_mean)
df.loc[df['sex'] == 'female', 'age'] = \
    df.loc[df['sex'] == 'female', 'age'].fillna(female_mean)

In [32]:
# 코드 검증 (Cell 실행시 에러가 나지 않아야 함)
assert 27.92 == round(df.groupby('sex')['age'].mean()['female'], 2)
assert 30.73 == round(df.groupby('sex')['age'].mean()['male'], 2)

결측치 대체
- deck 컬럼 결측치는 No Data 로 대체

In [33]:
df['deck'] = df['deck'].cat.add_categories('No Data')

In [34]:
df['deck'] = df['deck'].fillna('No Data')

In [35]:
df['deck'].value_counts()

No Data    688
C           59
B           47
D           33
E           32
A           15
F           13
G            4
Name: deck, dtype: int64

중복된 컬럼 제거 (class, embark_town, alive 컬럼 제거)

In [36]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,No Data,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,No Data,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,No Data,Southampton,no,True


In [37]:
df = df.drop(['class', 'embark_town', 'alive'], axis=1)

In [38]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,deck,alone
0,0,3,male,22.0,1,0,7.25,S,man,True,No Data,False
1,1,1,female,38.0,1,0,71.2833,C,woman,False,C,False
2,1,3,female,26.0,0,0,7.925,S,woman,False,No Data,True
3,1,1,female,35.0,1,0,53.1,S,woman,False,C,False
4,0,3,male,35.0,0,0,8.05,S,man,True,No Data,True


특성 공학 (feature engineering)
- family 컬럼을 만들고 sibsp + parch 더한 값을 입력

In [39]:
df['family'] = df['sibsp'] + df['parch']

In [40]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,deck,alone,family
0,0,3,male,22.0,1,0,7.25,S,man,True,No Data,False,1
1,1,1,female,38.0,1,0,71.2833,C,woman,False,C,False,1
2,1,3,female,26.0,0,0,7.925,S,woman,False,No Data,True,0
3,1,1,female,35.0,1,0,53.1,S,woman,False,C,False,1
4,0,3,male,35.0,0,0,8.05,S,man,True,No Data,True,0


성별 & 가족 수 별 생존율 확인

In [41]:
pd.DataFrame(df.groupby(['sex', 'family'])['survived'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,family,Unnamed: 2_level_1
female,0,0.785714
female,1,0.816092
female,2,0.77551
female,3,0.842105
female,4,0.25
female,5,0.375
female,6,0.375
female,7,0.0
female,10,0.0
male,0,0.155718


생존율 TOP 5 출력

In [42]:
result = df.groupby(['sex', 'family'])['survived'].mean()
result = result.reset_index().sort_values('survived', ascending=False)
result.head().reset_index(drop=True)

Unnamed: 0,sex,family,survived
0,female,3,0.842105
1,female,1,0.816092
2,female,0,0.785714
3,female,2,0.77551
4,male,3,0.5


생존율 하위 TOP 10 출력

In [43]:
result = df.groupby(['sex', 'family'])['survived'].mean()
result = result.reset_index().sort_values('survived', ascending=False)
result.tail(10).reset_index(drop=True)

Unnamed: 0,sex,family,survived
0,female,4,0.25
1,male,6,0.25
2,male,1,0.243243
3,male,0,0.155718
4,female,10,0.0
5,male,4,0.0
6,male,5,0.0
7,female,7,0.0
8,male,7,0.0
9,male,10,0.0


apply 함수를 활용하여 남자는 1, 여자는 0 으로 값을 변경하고 gender 컬럼을 새로
만들어 적용

In [44]:
def make_bin(x):
    if x == 'male':
        return 1
    elif x == 'female':
        return 0
    
df['gender'] = df['sex'].apply(make_bin)

In [45]:
# 코드 검증
df['gender'].value_counts()

1    577
0    314
Name: gender, dtype: int64

요금을 5 구간으로 나누어 fare_bin 컬럼을 새로 만들어 적용
- 동일한 분포를 갖도록 pd.qcut() 을 사용

In [46]:
df['fare_bin'] = pd.qcut(df['fare'], 5)

In [47]:
df['fare_bin'].value_counts()

(7.854, 10.5]        184
(21.679, 39.688]     180
(-0.001, 7.854]      179
(39.688, 512.329]    176
(10.5, 21.679]       172
Name: fare_bin, dtype: int64

나이를 10 구간으로 나누어 age_bin 컬럼을 새로 만들어 적용
- 동일한 구간을 갖도록 pd.cut() 을 사용

In [48]:
df['age_bin'] = pd.cut(df['age'], 10)

In [49]:
df['age_bin'].value_counts()

(24.294, 32.252]    346
(16.336, 24.294]    177
(32.252, 40.21]     118
(40.21, 48.168]      70
(0.34, 8.378]        54
(8.378, 16.336]      46
(48.168, 56.126]     45
(56.126, 64.084]     24
(64.084, 72.042]      9
(72.042, 80.0]        2
Name: age_bin, dtype: int64

아래의 명령을 통해 설치

In [50]:
#!pip install ydata-profiling

Defaulting to user installation because normal site-packages is not writeable




탐색적 데이터 분석을 진행할 데이터 프레임 준비

In [51]:
import pandas as pd
import ydata_profiling
from sklearn.datasets import load_boston
boston = load_boston()
df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
df['target'] = boston['target']
df.head()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


프로파일 보고서 생성
- profile_report 함수를 이용하여 보고서를 생성 후 변수에 저장하여 파일로 저장

In [52]:
report = df.profile_report()
report.to_file('./boston_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]