In [1]:
import pandas as pd

## 파일 읽어오기

In [3]:
# Titanic passenger CSV hosted on the Stanford CS109 course archive (fetched over HTTP).
url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
titanic = pd.read_csv(filepath_or_buffer=url)
# Preview the first two rows to confirm the file parsed as expected.
titanic.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833


## 데이터 파악

In [4]:
# Print a structural summary: index range, per-column non-null counts, dtypes and memory usage.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Name                     887 non-null    object 
 3   Sex                      887 non-null    object 
 4   Age                      887 non-null    float64
 5   Siblings/Spouses Aboard  887 non-null    int64  
 6   Parents/Children Aboard  887 non-null    int64  
 7   Fare                     887 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


In [5]:
# Check the column names
titanic.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')

## 전처리

In [13]:
# Check for missing values: count of NaN entries per column
titanic.isna().sum()

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [19]:
# Rename column: Sex -> Gender.
# Rebind `titanic` to the renamed copy instead of mutating with inplace=True, so the
# frame shown by earlier cells is not silently altered. Re-running this cell is safe:
# rename ignores keys that are no longer present by default.
titanic = titanic.rename(columns={'Sex': 'Gender'})
titanic.head(2)

Unnamed: 0,Survived,Pclass,Name,Gender,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833


## 가설
- 무엇을 분석할 것인가? 
1. 생존자의 성별 비율은 비슷할 것이다.
2. 1등실 승객의 생존율이 높았을 것이다.
3. 사망자는 고령자가 많았을 것이다.

### [가설 1] 생존자의 성별 비율은 비슷할 것이다.

#### [방법 1]

In [60]:
# Number of survivors (Survived == 1) per gender
survived_gender = (
    titanic.loc[titanic['Survived'] == 1]
    .groupby('Gender')
    .size()
)
survived_gender

Gender
female    233
male      109
dtype: int64

In [61]:
# Share of each gender among the survivors, in percent
survived_gender.div(survived_gender.sum()).mul(100)

Gender
female    68.128655
male      31.871345
dtype: float64

#### [방법 2]

In [66]:
# Same percentages in a single step: normalized value counts of Gender among survivors
(
    titanic.loc[titanic['Survived'] == 1, 'Gender']
    .value_counts(normalize=True)
    .mul(100)
)

Gender
female    68.128655
male      31.871345
Name: proportion, dtype: float64

In [74]:
# Percentage of deaths (0) and survivals (1) within each gender
g_s = (
    titanic.groupby('Gender')['Survived']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
)
g_s

Gender  Survived
female  0           25.796178
        1           74.203822
male    0           80.977312
        1           19.022688
Name: proportion, dtype: float64

In [90]:
# Spread out: pivot the innermost index level (Survived) into columns
g_s.unstack(level=-1)

Survived,0,1
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,25.796178,74.203822
male,80.977312,19.022688


In [92]:
# An index level can be addressed by name or by position: 'Gender' is level 0 of g_s.
# Only the last expression of a cell is displayed, so instead of evaluating the
# name-based call and discarding it, assert that both spellings give the same table.
assert g_s.unstack('Gender').equals(g_s.unstack(0))
g_s.unstack(0)

Gender,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25.796178,80.977312
1,74.203822,19.022688


In [93]:
# Round trip: move Gender into the columns, then stack it back as the innermost
# row level, which yields a (Survived, Gender) index.
g_s.unstack(level='Gender').stack()

Survived  Gender
0         female    25.796178
          male      80.977312
1         female    74.203822
          male      19.022688
dtype: float64

In [95]:
# Swap only the order of the row-index levels, then sort by the new outer level
g_s.swaplevel('Gender', 'Survived').sort_index()

Survived  Gender
0         female    25.796178
          male      80.977312
1         female    74.203822
          male      19.022688
Name: proportion, dtype: float64

#### 시각화

In [39]:
# Number of survivors per passenger class (absolute counts, not survival rates)
titanic.loc[titanic['Survived'] == 1].groupby('Pclass')['Survived'].count()

Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64

In [46]:
# Number of deaths (Survived == 0) for each distinct Age value
titanic.loc[titanic['Survived'] == 0].groupby('Age')['Survived'].count()

Age
1.0     2
2.0     7
3.0     2
4.0     3
5.0     2
       ..
69.0    1
70.0    2
70.5    1
71.0    2
74.0    1
Name: Survived, Length: 80, dtype: int64