## Pandas 시작 - 파일을 DataFrame으로 로딩, 기본 API

In [1]:
import numpy as np
import pandas as pd

### read_csv()
read_csv()를 이용하여 csv 파일을 편리하게 DataFrame으로 로딩 <br />
read_csv()의 sep 인자를 콤마(,)가 아닌 다른 분리자로 변경하여 다른 유형의 파일도 로드 가능

In [2]:
# Load the Titanic training CSV into a DataFrame (read_csv splits on commas by default).
titanic_df = pd.read_csv('titanic_train.csv')
# Tab-separated alternative, kept for reference:
# titanic_df = pd.read_csv('titanic_train.tsv', sep='\t')

# Show the type of the object that read_csv returned.
print('titanic 변수 type: ', type(titanic_df))

titanic 변수 type:  <class 'pandas.core.frame.DataFrame'>


### head()
DataFrame의 맨 앞 일부 데이터만 추출

In [3]:
# Display the first 5 rows of the frame.
titanic_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### DataFrame 생성

In [4]:
# Column-oriented source data: each key is a column label, each list its values.
dic1 = {
    'Name': ['Chulmin', 'Eunkyung', 'Jinwoong', 'Soobeom'],
    'Year': [2011, 2016, 2015, 2015],
    'Gender': ['Male', 'Female', 'Male', 'Male'],
}

# Plain conversion: columns come from the dict keys, rows get the default 0..n-1 index.
data_df = pd.DataFrame(data=dic1)
print(data_df)
print("#" * 30)

# Requesting a column that is not a dict key ('Age') adds it, filled with NaN.
data_df = pd.DataFrame(
    data=dic1,
    columns=['Name', 'Year', 'Gender', 'Age'],
)
print(data_df)
print("#" * 30)

# Explicit row labels replace the default integer index.
data_df = pd.DataFrame(
    data=dic1,
    index=['one', 'two', 'three', 'four'],
)
print(data_df)
print("#" * 30)

       Name  Year  Gender
0   Chulmin  2011    Male
1  Eunkyung  2016  Female
2  Jinwoong  2015    Male
3   Soobeom  2015    Male
##############################
       Name  Year  Gender  Age
0   Chulmin  2011    Male  NaN
1  Eunkyung  2016  Female  NaN
2  Jinwoong  2015    Male  NaN
3   Soobeom  2015    Male  NaN
##############################
           Name  Year  Gender
one     Chulmin  2011    Male
two    Eunkyung  2016  Female
three  Jinwoong  2015    Male
four    Soobeom  2015    Male
##############################


### DataFrame의 Column명과 Index

In [5]:
# Print the column labels, the row Index object, and the index's underlying array of values.
print('columns: ', titanic_df.columns)
print('index: ', titanic_df.index)
print('index value: ', titanic_df.index.values)

columns:  Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
index:  RangeIndex(start=0, stop=891, step=1)
index value:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 19

### DataFrame에서 Series 추출 및 DataFrame 필터링 추출

In [6]:
# A single column name inside [] returns a Series.
series = titanic_df['Name']
print(series.head(3))
print('## type: ', type(series))

# A list of several column names inside [] returns a DataFrame made of those columns.
filtered_df = titanic_df[['Name', 'Age']]
print(filtered_df.head(3))
print('## type: ', type(filtered_df))

# A list holding just one column name still returns a DataFrame, not a Series.
one_col_df = titanic_df[['Name']]
print(one_col_df.head(3))
print('## type: ', type(one_col_df))

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
Name: Name, dtype: object
## type:  <class 'pandas.core.series.Series'>
                                                Name   Age
0                            Braund, Mr. Owen Harris  22.0
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0
2                             Heikkinen, Miss. Laina  26.0
## type:  <class 'pandas.core.frame.DataFrame'>
                                                Name
0                            Braund, Mr. Owen Harris
1  Cumings, Mrs. John Bradley (Florence Briggs Th...
2                             Heikkinen, Miss. Laina
## type:  <class 'pandas.core.frame.DataFrame'>


### shape
DataFrame의 Row와 Column의 크기를 가지고 있는 속성

In [7]:
# shape is a (rows, columns) tuple; the index is not counted as a column.
print('DataFrame 크기: ', titanic_df.shape)

DataFrame 크기:  (891, 12)


### info
DataFrame내의 Column명, 데이터 타입, Null 개수, 데이터 건수 정보 제공

In [8]:
# Print a per-column summary: name, non-null count, dtype, plus overall memory usage.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### describe()
데이터값들의 평균, 표준편차, 4분위 분포도 제공 <br />
숫자형 Column들에 대해 해당 정보 제공

In [9]:
# Summary statistics for the numeric columns; null values are left out of the calculation.
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### value_counts()
동일한 개별 데이터 값이 몇건 있는지 정보 제공 <br />
개별 데이터 값의 분포도를 제공 <br />

value_counts()는 주로 Series 객체에서 호출 (pandas 1.1.0부터는 DataFrame에서도 호출 가능)

In [10]:
# Count how many rows carry each distinct Pclass value; the result is itself a Series
# whose index holds the distinct values and whose values hold the counts.
value_counts = titanic_df['Pclass'].value_counts()
print(type(value_counts))
print(value_counts)

<class 'pandas.core.series.Series'>
3    491
1    216
2    184
Name: Pclass, dtype: int64


In [11]:
# Pull the Pclass column out as a Series and confirm its type.
titanic_pclass = titanic_df['Pclass']
print(type(titanic_pclass))

<class 'pandas.core.series.Series'>


In [12]:
# Show the first rows of the Series; the left-hand column of the output is its index.
titanic_pclass.head()

0    3
1    1
2    3
3    1
4    3
Name: Pclass, dtype: int64

### sort_values()
by = 정렬하고자 하는 Column <br />
ascending = True(오름차순), False(내림차순)

In [13]:
# Sort the whole frame by Pclass in descending order.
# NOTE: this result and the next one are not assigned to anything, and only the
# last expression of a cell is rendered, so neither of them is shown.
titanic_df.sort_values(by='Pclass', ascending = False)

# Sort a two-column subset by Age using the default ascending order.
titanic_df[['Name', 'Age']].sort_values(by='Age')
# Sort by Pclass first, then by Age within each Pclass (this is the displayed result).
titanic_df[['Name', 'Age', 'Pclass']].sort_values(by=['Pclass', 'Age'])

Unnamed: 0,Name,Age,Pclass
305,"Allison, Master. Hudson Trevor",0.92,1
297,"Allison, Miss. Helen Loraine",2.00,1
445,"Dodge, Master. Washington",4.00,1
802,"Carter, Master. William Thornton II",11.00,1
435,"Carter, Miss. Lucile Polk",14.00,1
...,...,...,...
859,"Razi, Mr. Raihed",,3
863,"Sage, Miss. Dorothy Edith ""Dolly""",,3
868,"van Melkebeke, Mr. Philemon",,3
878,"Laleff, Mr. Kristo",,3


### DataFrame과 List, Dictionary, numpy ndarray 상호 변환
### List, ndarray에서 DataFrame 변환

In [14]:
# numpy (np) is imported in the imports cell at the top of the notebook.
col_name1 = ['col1']
list1 = [1, 2, 3]
array1 = np.array(list1)
print('array1 shape: ', array1.shape)

# A 1-D list becomes a single-column DataFrame; `columns` names that column.
df_list1 = pd.DataFrame(list1, columns=col_name1)
print('1차원 List로 만든 DataFrame\n', df_list1)

# A 1-D ndarray converts the same way as the list above.
df_array1 = pd.DataFrame(array1, columns=col_name1)
print('1차원 ndaray로 만든 DataFrame\n', df_array1)

array1 shape:  (3,)
1차원 List로 만든 DataFrame
    col1
0     1
1     2
2     3
1차원 ndaray로 만든 DataFrame
    col1
0     1
1     2
2     3


In [15]:
# Two rows by three columns, first as nested lists and then as a 2-D ndarray.
col_name2 = ['col1', 'col2', 'col3']
list2 = [
    [1, 2, 3],
    [11, 12, 13],
]
array2 = np.array(list2)
print('array2 shape: ', array2.shape)

# Each inner list is one row; `columns` labels the three columns.
df_list2 = pd.DataFrame(data=list2, columns=col_name2)
print('2차원 List로 만든 DataFrame\n', df_list2)

# The 2-D ndarray produces the same row/column layout.
df_array2 = pd.DataFrame(data=array2, columns=col_name2)
print('2차원 ndaray로 만든 DataFrame\n', df_array2)

array2 shape:  (2, 3)
2차원 List로 만든 DataFrame
    col1  col2  col3
0     1     2     3
1    11    12    13
2차원 ndaray로 만든 DataFrame
    col1  col2  col3
0     1     2     3
1    11    12    13


### Dictionary에서 DataFrame 변환

In [16]:
# Key = column name, value = list (or ndarray) holding that column's data.
# The variable is called `dict1` so that the built-in `dict` type is not
# shadowed for the rest of the notebook session.
dict1 = {'col1': [1, 11], 'col2': [2, 22], 'col3': [3, 44]}

df_dict = pd.DataFrame(dict1)
print(df_dict)

   col1  col2  col3
0     1     2     3
1    11    22    44


### DataFrame을 ndarray로 변환

In [17]:
# .values exposes the frame's data as a numpy ndarray (row/column labels are not included).
array3 = df_dict.values

print('df_dict.values 타입: ', type(array3), 'df_dict.value shape: ', array3.shape)
print(array3)

df_dict.values 타입:  <class 'numpy.ndarray'> df_dict.value shape:  (2, 3)
[[ 1  2  3]
 [11 22 44]]


### DataFrame을 List와 Dictionary로 변환

In [18]:
# ndarray.tolist() turns the values into a nested Python list, one inner list per row.
list3 = df_dict.values.tolist()
print('타입: ', type(list3))
print(list3)

# to_dict('list') returns {column name: list of that column's values}.
dict3 = df_dict.to_dict('list')
print('타입: ', type(dict3))
print(dict3)

타입:  <class 'list'>
[[1, 2, 3], [11, 22, 44]]
타입:  <class 'dict'>
{'col1': [1, 11], 'col2': [2, 22], 'col3': [3, 44]}


### DataFrame의 Column Data Set Access
DataFrame의 column data set 생성과 수정은 [] 연산자를 이용 <br />
column에 새로운 값을 할당하려면 DataFrame [] 내에 column명을 입력하고 값을 할당

In [19]:
# Assigning a scalar to a new column name creates the column with that value in every row.
titanic_df['Age_0'] = 0
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


In [20]:
# New columns can be computed element-wise from existing columns.
titanic_df['Age_by_10'] = titanic_df['Age'] * 10
# SibSp + Parch + 1 (the +1 presumably counts the passenger themselves).
titanic_df['Family_No'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,220.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,380.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,260.0,1


기존 Column 값을 업데이트 하려면 해당 Column에 업데이트 값 지정

In [21]:
# Assigning to an existing column name overwrites it.
# NOTE: the column is read and written in the same statement, so re-running
# this cell adds another 100 each time.
titanic_df['Age_by_10'] = titanic_df['Age_by_10'] + 100
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,320.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,480.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,360.0,1


### DataFrame 데이터 삭제

### axis에 따른 삭제

In [22]:
# axis=1 makes drop() remove a column; the result is a new frame bound to
# titanic_drop_df, while titanic_df itself is left as it was.
titanic_drop_df = titanic_df.drop('Age_0', axis=1)
titanic_drop_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,320.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,480.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,360.0,1


drop()의 inplace 인자의 default값은 False <br />
이 경우 drop() 호출을 한 DataFrame은 아무런 영향이 없으며 method 호출 결과가 해당 컬럼이 drop 된 DataFrame을 return

In [23]:
# The original frame still contains Age_0: the earlier drop() returned a new frame.
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,320.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,480.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,360.0,1


여러개의 Column들의 삭제는 drop 인자로 List 입력 <br />
inplace = True의 경우 호출한 DataFrame에 drop이 반영, return값은 None

In [24]:
# With inplace=True the columns are removed from titanic_df itself and drop()
# returns None; the return value is captured and printed here to show that.
drop_result = titanic_df.drop(['Age_0', 'Age_by_10', 'Family_No'], axis=1, inplace=True)
print(drop_result)
titanic_df.head(3)

None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


axis=0의 경우 drop()은 row 방향으로 데이터 삭제

In [25]:
# Widen the printed output and truncate long cell text to 15 characters.
# NOTE: set_option changes pandas display settings globally, so frames shown
# in later cells are truncated the same way.
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 15)
print('#### before axis 0 drop ####')
print(titanic_df.head(3))

# axis=0 drops rows by index label; inplace=True mutates titanic_df, so
# running this cell a second time fails because labels 0-2 are already gone.
titanic_df.drop([0, 1, 2], axis=0, inplace=True)

print('#### after axis 0 drop ####')
print(titanic_df.head(3))

#### before axis 0 drop ####
   PassengerId  Survived  Pclass            Name     Sex   Age  SibSp  Parch          Ticket     Fare Cabin Embarked
0            1         0       3  Braund, Mr....    male  22.0      1      0       A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mr...  female  38.0      1      0        PC 17599  71.2833   C85        C
2            3         1       3  Heikkinen, ...  female  26.0      0      0  STON/O2. 31...   7.9250   NaN        S
#### after axis 0 drop ####
   PassengerId  Survived  Pclass            Name     Sex   Age  SibSp  Parch  Ticket     Fare Cabin Embarked
3            4         1       1  Futrelle, M...  female  35.0      1      0  113803  53.1000  C123        S
4            5         0       3  Allen, Mr. ...    male  35.0      0      0  373450   8.0500   NaN        S
5            6         0       3  Moran, Mr. ...    male   NaN      0      0  330877   8.4583   NaN        Q


### Index 객체

In [26]:
# Reload the original file to undo the in-place drops made above.
titanic_df = pd.read_csv('titanic_train.csv')

# Extract the Index object.
indexes = titanic_df.index
print(indexes)

# Convert the Index object into an array of its actual values.
print('Index 객체 Array 값\n', indexes.values)

RangeIndex(start=0, stop=891, step=1)
Index 객체 Array 값
 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232

Index는 1차원 데이터

In [27]:
# The index values form a 1-D ndarray; an Index can be sliced and indexed by
# position much like an array.
print(type(indexes.values))
print(indexes.values.shape)
print(indexes[:5].values)
print(indexes.values[:5])
print(indexes[6])

<class 'numpy.ndarray'>
(891,)
[0 1 2 3 4]
[0 1 2 3 4]
6


**Index 값은 임의 변경이 불가능**

Series 객체는 Index 객체를 포함하지만 연산 함수 적용 시 Index는 연산에서 제외 <br />
Index는 오직 식별용으로만 사용

In [28]:
# Take the Fare column as a Series (the variable name spells it "fair").
series_fair = titanic_df['Fare']
series_fair.head(5)

0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64

In [29]:
# Aggregations and arithmetic operate on the Series values only; the index is
# carried along as labels and does not take part in the computation.
print('Fair Series max: ', series_fair.max())
print('Fair Series min: ', series_fair.min())
# Python's built-in sum() iterates over the values of the Series.
print('sum() Fair Series: ', sum(series_fair))
print('Fair Series + 3\n', (series_fair + 3).head(3) )

Fair Series max:  512.3292
Fair Series min:  0.0
sum() Fair Series:  28693.949299999967
Fair Series + 3
 0    10.2500
1    74.2833
2    10.9250
Name: Fare, dtype: float64


DataFrame과 Series에서 reset_index() method를 수행하면 새롭게 인덱스를 연속 숫자 형으로 할당하며 기존 인덱스는 'index'라는 새로운 Column명으로 추가

In [30]:
# reset_index() returns a new frame with a fresh 0..n-1 index; the previous
# index is kept as a column named 'index'.
titanic_reset_df = titanic_df.reset_index(inplace=False)
titanic_reset_df.head(3)

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr....",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,1,"Cumings, Mr...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, ...",female,26.0,0,0,STON/O2. 31...,7.925,,S


In [31]:
# One more column than the original 12, because the old index became a column.
titanic_reset_df.shape

(891, 13)

In [32]:
print('#### before reset_index ####')
# value_counts() returns a Series indexed by the distinct Pclass values.
value_counts = titanic_df['Pclass'].value_counts()
print(value_counts)
print('value_counts type', type(value_counts))

# reset_index() on a Series moves its index into a column, so the result is a DataFrame.
new_value_counts = value_counts.reset_index(inplace=False)
print('#### after reset_index ####')
print(new_value_counts)
print('new_value_counts type', type(new_value_counts))

#### before reset_index ####
3    491
1    216
2    184
Name: Pclass, dtype: int64
value_counts type <class 'pandas.core.series.Series'>
#### after reset_index ####
   index  Pclass
0      3     491
1      1     216
2      2     184
new_value_counts type <class 'pandas.core.frame.DataFrame'>


## 데이터 Selection 및 Filtering

### DataFrame의 [] 연산자

numpy에서의 [] 연산자는 행, 열의 위치, 슬라이싱 범위 등을 지정 <br />
하지만, DataFrame의 [] 안에 들어갈 수 있는 것은 Column명의 문자(또는 Column명의 List 객체), 또는 Index로 변환 가능한 표현식임

In [33]:
# Start again from a freshly loaded frame.
titanic_df = pd.read_csv('titanic_train.csv')

# [] with one column name returns a Series; with a list of names, a DataFrame.
print('단일 Column data 추출\n', titanic_df['Pclass'].head(3))
print('\n여러 Column들의 data 추출\n', titanic_df[['Survived', 'Pclass']].head(3))

# A bare integer inside [] is treated as a column name and raises KeyError:
# print(titanic_df[0])

단일 Column data 추출
 0    3
1    1
2    3
Name: Pclass, dtype: int64

여러 Column들의 data 추출
    Survived  Pclass
0         0       3
1         1       1
2         1       3


앞에서 DataFrame의 [] 내에 숫자 값을 입력할 경우 KeyError가 발생했음 <br />
하지만, Pandas의 Index 형태로 변환가능한 표현식은 사용할 수 있음

titanic_df의 처음 2개의 data를 추출하고자 titanic_df[0:2]와 같은 슬라이싱은 사용 가능

In [34]:
# A slice inside [] selects rows: here the first two.
titanic_df[0:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr....",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mr...",female,38.0,1,0,PC 17599,71.2833,C85,C


[ ] 내에 조건식을 입력하여 Boolean Indexing 수행 가능

In [35]:
# A boolean condition inside [] keeps only the rows where it is True (Pclass equal to 3).
titanic_df[titanic_df['Pclass'] == 3].head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr....",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, ...",female,26.0,0,0,STON/O2. 31...,7.925,,S
4,5,0,3,"Allen, Mr. ...",male,35.0,0,0,373450,8.05,,S


### DataFrame ix[] 연산자
명칭 기반과 위치 기반 인덱싱 모두 제공 ( Debug하기 불편하여 잘 안씀 )

ix[]는 pandas 0.20.0에서 deprecated 되었고 1.0.0에서 삭제되었으므로, 대신 loc[] 또는 iloc[]를 사용

### DataFrame iloc[] 연산자
위치 기반 인덱싱 제공

In [36]:
# Small sample frame whose rows carry string labels instead of integers.
data = {
    'Name': ['Chulmin', 'Eunkyung', 'Jinwoong', 'Soobeom'],
    'Year': [2011, 2016, 2015, 2015],
    'Gender': ['Male', 'Female', 'Male', 'Male'],
}
data_df = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four'],
)
data_df

Unnamed: 0,Name,Year,Gender
one,Chulmin,2011,Male
two,Eunkyung,2016,Female
three,Jinwoong,2015,Male
four,Soobeom,2015,Male


In [37]:
# Move the string labels into a column called 'old_index' and give the frame
# a fresh integer index.
data_df_reset = (
    data_df
    .reset_index()
    .rename(columns={'index': 'old_index'})
)

# Shift the new integer index by one so that it starts at 1 instead of 0.
data_df_reset.index = data_df_reset.index + 1
data_df_reset

Unnamed: 0,old_index,Name,Year,Gender
1,one,Chulmin,2011,Male
2,two,Eunkyung,2016,Female
3,three,Jinwoong,2015,Male
4,four,Soobeom,2015,Male


In [38]:
# data_df still has its string index; reset_index() returned a separate frame.
data_df.head()

Unnamed: 0,Name,Year,Gender
one,Chulmin,2011,Male
two,Eunkyung,2016,Female
three,Jinwoong,2015,Male
four,Soobeom,2015,Male


In [39]:
# iloc takes integer positions: row 0, column 0.
data_df.iloc[0, 0]

'Chulmin'

In [40]:
# iloc is position-based, so passing the column label 'Name' raises an error.
# data_df.iloc[0, 'Name']

# The row label 'one' from the DataFrame's index is rejected as well.
# data_df.iloc['one', 0]

In [41]:
# The reset frame: integer index starting at 1, old labels kept in 'old_index'.
data_df_reset.head()

Unnamed: 0,old_index,Name,Year,Gender
1,one,Chulmin,2011,Male
2,two,Eunkyung,2016,Female
3,three,Jinwoong,2015,Male
4,four,Soobeom,2015,Male


In [42]:
# Positions ignore the index labels: position 0 is the row labelled 1, and
# column position 1 is 'Name'.
data_df_reset.iloc[0, 1]

'Chulmin'

### DataFrame loc[] 연산자
명칭 기반 인덱싱 제공

In [43]:
# loc takes labels: row label 'one', column label 'Name'.
data_df.loc['one', 'Name']

'Chulmin'

In [44]:
# Here 1 is an index label (the index was shifted to start at 1), not a position.
data_df_reset.loc[1, 'Name']

'Chulmin'

In [45]:
# loc is label-based, and this frame's index has no label 0, so this raises an error:
# data_df_reset.loc[0, 'Name']

In [46]:
# Positional slicing with iloc excludes the end position (0:1 yields one row) ...
print('위치기반 lioc slicing\n', data_df.iloc[0:1, 0])
# ... whereas label slicing with loc includes the end label ('one' through 'two').
print('\n명칭기반 loc slicing\n', data_df.loc['one':'two', 'Name'])

위치기반 lioc slicing
 one    Chulmin
Name: Name, dtype: object

명칭기반 loc slicing
 one     Chulmin
two    Eunkyung
Name: Name, dtype: object


In [47]:
# With loc, 1:2 is a label slice, so the rows labelled 1 and 2 are both returned.
print(data_df_reset.loc[1:2, 'Name'])

1     Chulmin
2    Eunkyung
Name: Name, dtype: object


### Boolean indexing
조건식을 [ ] 안에 기입하여 간편하게 필터링

In [48]:
# Reload the data, then keep only the passengers older than 60.
titanic_df = pd.read_csv('titanic_train.csv')
titanic_boolean = titanic_df[ titanic_df['Age'] > 60 ]
# The filtered result is still a DataFrame.
print(type(titanic_boolean))
titanic_boolean

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
33,34,0,2,"Wheadon, Mr...",male,66.0,0,0,C.A. 24579,10.5,,S
54,55,0,1,"Ostby, Mr. ...",male,65.0,0,1,113509,61.9792,B30,C
96,97,0,1,Goldschmidt...,male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,3,"Connors, Mr...",male,70.5,0,0,370369,7.75,,Q
170,171,0,1,Van der hoe...,male,61.0,0,0,111240,33.5,B19,S
252,253,0,1,"Stead, Mr. ...",male,62.0,0,0,113514,26.55,C87,S
275,276,1,1,"Andrews, Mi...",female,63.0,1,0,13502,77.9583,D7,S
280,281,0,3,"Duane, Mr. ...",male,65.0,0,0,336439,7.75,,Q
326,327,0,3,"Nysveen, Mr...",male,61.0,0,0,345364,6.2375,,S
438,439,0,1,"Fortune, Mr...",male,64.0,1,4,19950,263.0,C23 C25 C27,S


In [49]:
# The comparison itself yields a boolean Series with one True/False per row;
# this mask is what [] uses to select rows.
var1 = titanic_df['Age'] > 60
print(type(var1))
print(var1)

<class 'pandas.core.series.Series'>
0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool


In [50]:
# Filter the rows first, then pick the two columns to show.
titanic_df[ titanic_df['Age'] > 60 ][['Name', 'Age']].head(3)

Unnamed: 0,Name,Age
33,"Wheadon, Mr...",66.0
54,"Ostby, Mr. ...",65.0
96,Goldschmidt...,71.0


In [51]:
# Same result in the opposite order: pick the columns first, then filter the rows.
titanic_df[['Name', 'Age']][ titanic_df['Age'] > 60 ].head(3)

Unnamed: 0,Name,Age
33,"Wheadon, Mr...",66.0
54,"Ostby, Mr. ...",65.0
96,Goldschmidt...,71.0


In [52]:
# loc supports boolean indexing: the boolean mask goes in the row position and
# the list of columns to show goes in the column position.
titanic_df.loc[ titanic_df['Age'] > 60, ['Name', 'Age']].head(3)

Unnamed: 0,Name,Age
33,"Wheadon, Mr...",66.0
54,"Ostby, Mr. ...",65.0
96,Goldschmidt...,71.0


논리 연산자로 결합된 조건식도 Boolean Indexing 적용 가능

In [53]:
# Combine conditions with &; each comparison is parenthesised because &
# binds more tightly than > and ==.
titanic_df[ (titanic_df['Age'] > 60) & (titanic_df['Pclass'] == 1) & (titanic_df['Sex'] == 'female')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
275,276,1,1,"Andrews, Mi...",female,63.0,1,0,13502,77.9583,D7,S
829,830,1,1,"Stone, Mrs....",female,62.0,0,0,113572,80.0,B28,


조건식은 변수로도 할당 가능 (가독성 향상)

In [54]:
# Store each boolean mask under its own name, then combine them for the filter.
cond1 = titanic_df['Age'] > 60
cond2 = titanic_df['Pclass'] == 1
cond3 = titanic_df['Sex'] == 'female'

titanic_df[ cond1 & cond2 & cond3 ]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
275,276,1,1,"Andrews, Mi...",female,63.0,1,0,13502,77.9583,D7,S
829,830,1,1,"Stone, Mrs....",female,62.0,0,0,113572,80.0,B28,


## Aggregation Function과 GroupBy 적용
### Aggregation Function

In [55]:
# Reload the raw data for the aggregation examples.
titanic_df = pd.read_csv('titanic_train.csv')

In [56]:
# Non-null count per column (Age, Cabin and Embarked have fewer than 891).
titanic_df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

특정 Column들로 Aggregation Function 수행

In [57]:
# Row-wise alternative: titanic_df[['Age', 'Fare']].mean(axis=1)
# mean() defaults to axis=0, i.e. one mean per column.
titanic_df[['Age', 'Fare']].mean()

Age     29.699118
Fare    32.204208
dtype: float64

In [58]:
# Equivalent explicit form: titanic_df[['Age', 'Fare']].sum(axis=0)
titanic_df[['Age', 'Fare']].sum()

Age     21205.1700
Fare    28693.9493
dtype: float64

In [59]:
# count() skips nulls, which is why Age reports fewer rows than Fare.
titanic_df[['Age', 'Fare']].count()

Age     714
Fare    891
dtype: int64

### groupby()
by 인자에 Group By 하고자 하는 Column을 입력 <br />
여러개의 Column Group By 시 [] 내에 해당하는 Column명 입력

DataFrame에 groupby()를 호출하면 DataFrameGroupBy 객체 반환

In [60]:
# groupby() alone returns a DataFrameGroupBy object; no aggregation has been applied yet.
titanic_groupby = titanic_df.groupby(by='Pclass')
print(type(titanic_groupby))
print(titanic_groupby)

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000180B47D14C0>


DataFrameGroupBy 객체에 Aggregation Function 호출하여 Group By 수행

In [61]:
# Applying count() to the grouped object gives per-Pclass non-null counts.
# NOTE: titanic_groupby is rebound here; it now holds the resulting DataFrame
# rather than the DataFrameGroupBy object from the previous cell.
titanic_groupby = titanic_df.groupby('Pclass').count()
titanic_groupby

Unnamed: 0_level_0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,216,216,216,216,186,216,216,216,216,176,214
2,184,184,184,184,173,184,184,184,184,16,184
3,491,491,491,491,355,491,491,491,491,12,491


In [62]:
# The aggregated result is a DataFrame; the grouping column Pclass became its
# index, leaving 11 of the original 12 columns.
print(type(titanic_groupby))
print(titanic_groupby.shape)
print(titanic_groupby.index)

<class 'pandas.core.frame.DataFrame'>
(3, 11)
Int64Index([1, 2, 3], dtype='int64', name='Pclass')


In [63]:
# Select columns on the grouped object before aggregating, so only those are counted.
titanic_groupby = titanic_df.groupby(by='Pclass')[['PassengerId', 'Survived']].count()
titanic_groupby

Unnamed: 0_level_0,PassengerId,Survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,216,216
2,184,184
3,491,491


In [64]:
# Same result the other way round: narrow the columns first (the grouping key
# Pclass must be included), then group and count.
titanic_df[['Pclass', 'PassengerId', 'Survived']].groupby('Pclass').count()

Unnamed: 0_level_0,PassengerId,Survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,216,216
2,184,184
3,491,491


In [65]:
# Both expressions count the rows in each Pclass. Only the last expression of
# a cell is rendered, so just the value_counts() result is displayed.
titanic_df.groupby('Pclass')['Pclass'].count()
titanic_df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

Pandas는 여러개의 aggregation function의 실행을 위해 agg()를 제공

In [66]:
# Several aggregations at once: the largest and smallest Age within each Pclass.
# The functions are named as strings so pandas uses its own groupby reductions;
# passing the Python builtins max/min triggers a FutureWarning in newer pandas
# versions. The resulting columns are still labelled 'max' and 'min'.
titanic_df.groupby('Pclass')['Age'].agg(['max', 'min'])

Unnamed: 0_level_0,max,min
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80.0,0.92
2,70.0,0.67
3,74.0,0.42


In [67]:
# A dict maps each column to the aggregation applied to it:
# max of Age, sum of SibSp, mean of Fare, all computed per Pclass.
agg_format = {'Age': 'max', 'SibSp': 'sum', 'Fare': 'mean'}
titanic_df.groupby('Pclass').agg(agg_format)

Unnamed: 0_level_0,Age,SibSp,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.0,90,84.154687
2,70.0,74,20.662183
3,74.0,302,13.67555


## Missing 데이터 처리하기
DataFrame의 isna()는 각 Column 값이 NaN인지 여부를 True/False 값으로 반환

In [68]:
# isna() returns a frame of the same shape holding True wherever the value is missing.
titanic_df.isna().head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False


isna() 반환 결과에 sum()을 호출하여 Column 별 NaN 건수 확인할 수 있음

In [69]:
# Summing the boolean frame counts the True entries (True == 1), which gives
# the number of NaN values in each column.
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### fillna()로 Missing 데이터 대체

In [70]:
# fillna() does not modify in place by default (inplace=False), so the filled
# Series is assigned back to the column.
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('C000')
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr....",male,22.0,1,0,A/5 21171,7.25,C000,S
1,2,1,1,"Cumings, Mr...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, ...",female,26.0,0,0,STON/O2. 31...,7.925,C000,S


In [71]:
# Replace missing ages with the mean age and missing ports of embarkation with 'S'.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')
# Confirm that no column has missing values left.
titanic_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Apply lambda 식으로 Data 가공
파이썬 lambda

In [72]:
def get_square(a):
    """Return ``a`` raised to the second power."""
    squared = a ** 2
    return squared


print('3의 제곱: ', get_square(3))

3의 제곱:  9


In [73]:
# The same squaring logic written as an anonymous function and bound to a name.
lambda_square = lambda x: x ** 2

print('3의 제곱: ', lambda_square(3))

3의 제곱:  9


In [74]:
# map() applies the lambda to each element lazily; list() then consumes the
# iterator and materialises the squared values for display.
a = [1, 2, 3]

squares = map(lambda x: x ** 2, a)
list(squares)

[1, 4, 9]

### Pandas에 apply lambda 적용

In [75]:
# apply() calls the lambda once per value of the Series: here, the length of each name.
titanic_df['Name_len'] = titanic_df['Name'].apply(lambda x : len(x))
titanic_df[['Name', 'Name_len']].head(3)

Unnamed: 0,Name,Name_len
0,"Braund, Mr....",23
1,"Cumings, Mr...",51
2,"Heikkinen, ...",22


In [76]:
# A conditional expression inside the lambda: ages up to and including 15 are
# 'Child', everything else is 'Adult'.
titanic_df['Child_Adult'] = titanic_df['Age'].apply(lambda x : 'Child' if x <= 15 else 'Adult')
titanic_df[['Age', 'Child_Adult']].head(10)

Unnamed: 0,Age,Child_Adult
0,22.0,Adult
1,38.0,Adult
2,26.0,Adult
3,35.0,Adult
4,35.0,Adult
5,29.699118,Adult
6,54.0,Adult
7,2.0,Child
8,27.0,Adult
9,14.0,Child


In [77]:
# Nested conditional expressions give three buckets:
# <= 15 -> 'Child', <= 60 -> 'Adult', otherwise 'Elderly'.
titanic_df['Age_cat'] = titanic_df['Age'].apply(lambda x : 'Child' if x <= 15 else ('Adult' if x <= 60 else 'Elderly'))
titanic_df['Age_cat'].value_counts()

Adult      786
Child       83
Elderly     22
Name: Age_cat, dtype: int64

In [78]:
def get_category(age):
    """Map a numeric age to its age-group label.

    The upper bounds are inclusive and are checked in ascending order, so the
    first bound that ``age`` does not exceed decides the label. Any age that
    is not ``<= 60`` falls through to 'Elderly'; this includes NaN, for which
    every comparison is False.
    """
    upper_bounds = (
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return 'Elderly'

# Classify every passenger's age; the function is handed to apply() directly,
# which calls it once per value.
titanic_df['Age_cat'] = titanic_df['Age'].apply(get_category)
titanic_df[['Age', 'Age_cat']].head()

Unnamed: 0,Age,Age_cat
0,22.0,Student
1,38.0,Adult
2,26.0,Young Adult
3,35.0,Young Adult
4,35.0,Young Adult
