# Pandas for Data Manipulation in Python

**아래 링크를 통해 이 노트북을 주피터 노트북 뷰어(nbviewer.jupyter.org)로 보거나 구글 코랩(colab.research.google.com)에서 실행할 수 있습니다.**

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://nbviewer.jupyter.org/github/nhkim55/bigdata_fintech_python/blob/main/code/ch7_pandas.ipynb"><img src="https://jupyter.org/assets/share.png" width="60" />주피터 노트북 뷰어로 보기</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/nhkim55/bigdata_fintech_python/blob/main/code/ch7_pandas.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩(Colab)에서 실행하기</a>
  </td>
</table>

## Read Data

### CSV 파일
data_frame = pd.read_csv(input_file)

input_file은 불러오려는 CSV 파일의 경로




### Excel 파일


data_frame = pd.read_excel(input_file, sheet_name=sheet_name, header=header)


input_file은 불러오려는 EXCEL 파일의 경로


In [None]:
# Import the library.
import pandas as pd

# Location of the Titanic CSV file.
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# Load the CSV into a DataFrame.
dataframe = pd.read_csv(url)

# Show the first five rows (five is the default for head()).
dataframe.head()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


## Create Data

In [None]:
# Import the library.
import pandas as pd

# Start from an empty DataFrame.
dataframe = pd.DataFrame()

# Add the columns one at a time, in the order they should appear.
new_columns = {
    'Name': ['Jacky Jackson', 'Steven Stevenson'],
    'Age': [38, 25],
    'Driver': [True, False],
}
for column_name, values in new_columns.items():
    dataframe[column_name] = values

# Display the DataFrame.
dataframe

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [None]:
# Build the new row as a Series keyed by column name.
new_person = pd.Series(['Molly Mooney', 40, True], index=['Name', 'Age', 'Driver'])

# DataFrame.append was removed in pandas 2.0. Turn the Series into a
# one-row frame and concatenate it instead. Like append, this returns a
# new frame and leaves `dataframe` itself unchanged.
pd.concat([dataframe, new_person.to_frame().T], ignore_index=True)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False
2,Molly Mooney,40,True


In [None]:
# Append a row in place by assigning to the next unused integer label.
next_label = len(dataframe)
dataframe.loc[next_label] = ["aa", 50, False]
print(dataframe)

               Name  Age  Driver
0     Jacky Jackson   38    True
1  Steven Stevenson   25   False
2                aa   50   False


In [None]:
# Create a DataFrame from a list of rows (lists or tuples).
data = [
    ['Jacky Jackson', 38, True],
    ['Steven Stevenson', 25, False],
]
column_names = ['Name', 'Age', 'Driver']
pd.DataFrame(data, columns=column_names)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [None]:
# Create a DataFrame from a NumPy ndarray.
import numpy as np

data = [['Jacky Jackson', 38, True], ['Steven Stevenson', 25, False]]

# dtype=object keeps each element's Python type. Without it NumPy picks
# one common dtype for the mixed rows and turns the ages and the driver
# flags into strings.
matrix = np.array(data, dtype=object)
pd.DataFrame(matrix, columns=['Name', 'Age', 'Driver'])

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [None]:
# 딕셔너리로부터 생성
data = {'Name': ['Jacky Jackson', 'Steven Stevenson'],
        'Age': [38, 25],
        'Driver': [True, False]}
pd.DataFrame(data)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [None]:
# 딕셔너리로 이루어진 리스트로부터 생성
data = [ {'Name': 'Jacky Jackson', 'Age': 38, 'Driver': True},
         {'Name': 'Steven Stevenson', 'Age': 25, 'Driver': False} ]
# index 지정도 가능
pd.DataFrame(data, index=['row1', 'row2'])

Unnamed: 0,Name,Age,Driver
row1,Jacky Jackson,38,True
row2,Steven Stevenson,25,False


## Write Data


data_frame.to_csv(output_file, sep=',', index=False)

output_file은 출력하려는 CSV 파일의 경로

In [None]:
# 구글 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Save the DataFrame to Google Drive as a comma-separated file without
# the index column.
# `data_frame1` is not defined anywhere in this notebook and raised
# NameError; the frame built in the cells above is named `dataframe`.
dataframe.to_csv('/content/drive/My Drive/python/data/df.csv', sep=',', index=False)

## Pandas Data Type-Series

In [None]:
import pandas as pd
dat = pd.Series([1,3,6,12])
dat


0     1
1     3
2     6
3    12
dtype: int64

In [None]:
dat.values

array([ 1,  3,  6, 12])

In [None]:
import numpy as np
dat2 = pd.Series(np.array([1,3,np.nan,12]))
dat3 = pd.Series(['aa', 'bb', 'c', 'd'])
dat4 = pd.Series([1, 'aa', 2.34, 'd'])
print(dat2)
print(dat3)
print(dat4)

0     1.0
1     3.0
2     NaN
3    12.0
dtype: float64
0    aa
1    bb
2     c
3     d
dtype: object
0       1
1      aa
2    2.34
3       d
dtype: object


In [None]:
dat5 = pd.Series([1,3,6,12], index=[1, 10, 20, 33])
dat6 = pd.Series([1,3,6,12], index=['a','b','c','a'])
dat7 = pd.Series({'a': 1, 'b':3, 'c':6, 'd':12})
print(dat5)
print(dat6)
print(dat7)

1      1
10     3
20     6
33    12
dtype: int64
a     1
b     3
c     6
a    12
dtype: int64
a     1
b     3
c     6
d    12
dtype: int64


In [None]:
dat2.index = ['un', 'due', 'trois', 'quatre']
dat2

un         1.0
due        3.0
trois      NaN
quatre    12.0
dtype: float64

In [None]:
df=pd.DataFrame(np.random.randn(6,2),
             index=[[1,1,2,2,3,3],[1,2,1,2,1,2]],columns=['item1','item2'])
print(df)

        item1     item2
1 1 -1.353775  1.674084
  2  0.502874 -0.531864
2 1  0.147058 -0.544811
  2 -0.163631  0.723905
3 1 -1.863569  1.462880
  2  0.007659 -1.541851


In [None]:
df.item1[1:2]

1  2   -1.704634
Name: item1, dtype: float64

In [None]:
df.loc[3,1]

item1   -1.863569
item2    1.462880
Name: (3, 1), dtype: float64

## Data Frame

### 데이터 설명하기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 두 개의 행을 확인합니다.
dataframe.head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
dataframe.tail(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1310,"Zenni, Mr Philip",3rd,22.0,male,0,0
1311,"Lievens, Mr Rene",3rd,24.0,male,0,0
1312,"Zimmerman, Leo",3rd,29.0,male,0,0


In [None]:
# 차원을 확인합니다.
dataframe.shape

(1313, 6)

In [None]:
# 통곗값을 확인합니다.
dataframe.describe()

Unnamed: 0,Age,Survived,SexCode
count,756.0,1313.0,1313.0
mean,30.397989,0.342727,0.351866
std,14.259049,0.474802,0.477734
min,0.17,0.0,0.0
25%,21.0,0.0,0.0
50%,28.0,0.0,0.0
75%,39.0,1.0,1.0
max,71.0,1.0,1.0


### 탐색하기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 첫 번째 행을 선택합니다.
dataframe.iloc[0]

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: 0, dtype: object

In [None]:
# 세 개의 행을 선택합니다.
dataframe.iloc[1:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [None]:
# 네 개의 행을 선택합니다.
dataframe.loc[1:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [None]:
# 네 개의 행을 선택합니다.
dataframe.iloc[:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [None]:
# 인덱스를 설정합니다.
dataframe = dataframe.set_index(dataframe['Name'])

# 행을 확인합니다.
dataframe.loc['Allen, Miss Elisabeth Walton']

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: Allen, Miss Elisabeth Walton, dtype: object

In [None]:
# Select the rows from the start through 'Allison, Miss Helen Loraine'
# and the columns 'Age' through 'Sex'. Label-based .loc slices include
# the end label, as the output below shows.
dataframe.loc[:'Allison, Miss Helen Loraine', 'Age':'Sex']

Unnamed: 0_level_0,Age,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Allen, Miss Elisabeth Walton",29.0,female
"Allison, Miss Helen Loraine",2.0,female


In [None]:
# dataframe[:2]와 동일합니다.
dataframe[:'Allison, Miss Helen Loraine']

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Allen, Miss Elisabeth Walton","Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
"Allison, Miss Helen Loraine","Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
dataframe['Name']

Name
Allen, Miss Elisabeth Walton                                      Allen, Miss Elisabeth Walton
Allison, Miss Helen Loraine                                        Allison, Miss Helen Loraine
Allison, Mr Hudson Joshua Creighton                        Allison, Mr Hudson Joshua Creighton
Allison, Mrs Hudson JC (Bessie Waldo Daniels)    Allison, Mrs Hudson JC (Bessie Waldo Daniels)
Allison, Master Hudson Trevor                                    Allison, Master Hudson Trevor
                                                                     ...                      
Zakarian, Mr Artun                                                          Zakarian, Mr Artun
Zakarian, Mr Maprieder                                                  Zakarian, Mr Maprieder
Zenni, Mr Philip                                                              Zenni, Mr Philip
Lievens, Mr Rene                                                              Lievens, Mr Rene
Zimmerman, Leo                               

In [None]:
dataframe[['Age', 'Sex']].head(2)

Unnamed: 0_level_0,Age,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Allen, Miss Elisabeth Walton",29.0,female
"Allison, Miss Helen Loraine",2.0,female


### 조건에 따라 행 선택

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# ‘sex’ 열이 ‘female’인 행 중 처음 두 개를 출력합니다.
dataframe[dataframe['Sex'] == 'female'].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# 행을 필터링합니다.
dataframe[(dataframe['Sex'] == 'female') & (dataframe['Age'] >= 65)]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
73,"Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


In [None]:
# Name 열에 Allison이 포함된 행만 찾기
dataframe['Name'].str.find('Allison')

0      -1
1       0
2       0
3       0
4       0
       ..
1308   -1
1309   -1
1310   -1
1311   -1
1312   -1
Name: Name, Length: 1313, dtype: int64

In [None]:
dataframe[dataframe['Name'].str.find('Allison')>-1].head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


### 값치환

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 값을 치환하고 두 개의 행을 출력합니다.
dataframe['Sex'].replace("female", "Woman").head(2)

0    Woman
1    Woman
Name: Sex, dtype: object

In [None]:
# Replace "female" with "Woman" and "male" with "Man".
dataframe['Sex'].replace(["female", "male"], ["Woman", "Man"]).head(5)

0    Woman
1    Woman
2      Man
3    Woman
4      Man
Name: Sex, dtype: object

In [None]:
# 값을 치환하고 두 개의 행을 출력합니다.
dataframe.replace(1, "One").head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29,female,One,One
1,"Allison, Miss Helen Loraine",1st,2,female,0,One


In [None]:
# 값을 치환하고 두 개의 행을 출력합니다.
dataframe.replace(r"1st", "First", regex=True).head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",First,29.0,female,1,1
1,"Allison, Miss Helen Loraine",First,2.0,female,0,1


In [None]:
# female과 male을 person으로 바꿉니다.
dataframe.replace(["female", "male"], "person").head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,person,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,person,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,person,0,0


In [None]:
# female을 1로 바꾸고 male을 0으로 바꿉니다.
dataframe.replace({"female": 1, "male": 0}).head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,1,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,1,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,0,0,0


### 열 이름 바꾸기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 열 이름을 바꾸고 두 개의 행을 출력합니다.
dataframe.rename(columns={'PClass': 'Passenger Class'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# 열 이름을 바꾸고 두 개의 행을 출력합니다.
dataframe.rename(columns={'PClass': 'Passenger Class', 'Sex': 'Gender'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# Rename every column at once by assigning a full list of new labels.
dataframe.columns = ['Name', 'Passenger Class', 'Age', 'Gender','Survived','SexCode']

# An assignment displays nothing on its own; evaluate the frame so the
# cell shows the renamed columns.
dataframe

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.00,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0,0
1310,"Zenni, Mr Philip",3rd,22.00,male,0,0
1311,"Lievens, Mr Rene",3rd,24.00,male,0,0


In [None]:
# 인덱스 0을 -1로 바꿉니다.
dataframe.rename(index={0:-1}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
-1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# 열 이름을 소문자로 바꿉니다.
dataframe.rename(str.lower, axis='columns').head(2)

Unnamed: 0,name,passenger class,age,gender,survived,sexcode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# Renumber the rows starting from 1. The stop value is derived from the
# frame's length rather than hard-coding the row count, so the cell keeps
# working if the data changes.
dataframe.index = range(1, len(dataframe) + 1)

# Evaluate the frame so the cell displays the new index.
dataframe

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
1,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,1
2,"Allison, Miss Helen Loraine",1st,2.00,female,0,1
3,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,1
5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
...,...,...,...,...,...,...
1309,"Zakarian, Mr Artun",3rd,27.00,male,0,0
1310,"Zakarian, Mr Maprieder",3rd,26.00,male,0,0
1311,"Zenni, Mr Philip",3rd,22.00,male,0,0
1312,"Lievens, Mr Rene",3rd,24.00,male,0,0


### 최솟값, 최댓값, 합, 평균 계산 및 개수 세기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 통곗값을 계산합니다.
print('최댓값:', dataframe['Age'].max())
print('최솟값:', dataframe['Age'].min())
print('평균:', dataframe['Age'].mean())
print('합:', dataframe['Age'].sum())
print('카운트:', dataframe['Age'].count())

최댓값: 71.0
최솟값: 0.17
평균: 30.397989417989415
합: 22980.88
카운트: 756


In [None]:
# 카운트를 출력합니다.
dataframe.count()

Name        1313
PClass      1313
Age          756
Sex         1313
Survived    1313
SexCode     1313
dtype: int64

In [None]:
# Compute the covariance of the numeric columns.
# numeric_only=True skips the string columns (Name, PClass, Sex).
# pandas 2.0+ no longer drops them silently and raises on them instead.
dataframe.cov(numeric_only=True)

Unnamed: 0,Age,Survived,SexCode
Age,203.32047,-0.430491,-0.382054
Survived,-0.430491,0.225437,0.11407
SexCode,-0.382054,0.11407,0.22823


In [None]:
# Compute the correlation coefficients of the numeric columns.
# numeric_only=True skips the string columns (Name, PClass, Sex).
# pandas 2.0+ no longer drops them silently and raises on them instead.
dataframe.corr(numeric_only=True)

Unnamed: 0,Age,Survived,SexCode
Age,1.0,-0.061254,-0.055138
Survived,-0.061254,1.0,0.502891
SexCode,-0.055138,0.502891,1.0


### 정렬하기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 정렬합니다. 
dataframe.sort_values(by='Age', ascending=False)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
505,"Mitchell, Mr Henry Michael",2nd,71.0,male,0,0
119,"Goldschmidt, Mr George B",1st,71.0,male,0,0
9,"Artagaveytia, Mr Ramon",1st,71.0,male,0,0
72,"Crosby, Captain Edward Gifford",1st,70.0,male,0,0
73,"Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1
...,...,...,...,...,...,...
1300,"Wiseman, Mr Phillippe",3rd,,male,0,0
1302,"Yalsevac, Mr Ivan",3rd,,male,1,0
1305,"Youssef, Mr Gerios",3rd,,male,0,0
1306,"Zabour, Miss Hileni",3rd,,female,0,1


### 고유한 값 찾기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 고유한 값을 찾습니다.
dataframe['Sex'].unique()

array(['female', 'male'], dtype=object)

In [None]:
# 카운트를 출력합니다.
dataframe['Sex'].value_counts()

male      851
female    462
Name: Sex, dtype: int64

In [None]:
# 카운트를 출력합니다.
dataframe['PClass'].value_counts()

3rd    711
1st    322
2nd    279
*        1
Name: PClass, dtype: int64

In [None]:
# 고유한 값의 개수를 출력합니다.
dataframe['PClass'].nunique()

4

In [None]:
dataframe.nunique()

Name        1310
PClass         4
Age           75
Sex            2
Survived       2
SexCode        2
dtype: int64

### 누락된 값 다루기

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

## 누락된 값을 선택하고 두 개의 행을 출력합니다.
dataframe[dataframe['Age'].isnull()].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0


In [None]:
# NaN으로 값을 바꾸려고 합니다.
dataframe['Sex'] = dataframe['Sex'].replace('male', NaN)

NameError: ignored

In [None]:
# 라이브러리를 임포트합니다.
import numpy as np

# NaN으로 값을 바꿉니다.
dataframe['Sex'] = dataframe['Sex'].replace('male', np.nan)

In [None]:
# 데이터를 적재하고 누락된 값을 설정합니다.
dataframe = pd.read_csv(url, na_values=[np.nan, 'NONE', -999])

In [None]:
# female만 NaN값으로 설정
dataframe = pd.read_csv(url, na_values=['female'], 
                        keep_default_na=False)
dataframe[12:14]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0


In [None]:
# NaN변환하지 않음
dataframe = pd.read_csv(url, na_filter=False)
dataframe[12:14]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0


In [None]:
print(dataframe.Age.count())
dataframe.Age.unique()

1313


array(['29.0', '2.0', '30.0', '25.0', '0.92', '47.0', '63.0', '39.0',
       '58.0', '71.0', '19.0', '', '50.0', '24.0', '36.0', '37.0', '26.0',
       '28.0', '45.0', '22.0', '41.0', '48.0', '44.0', '59.0', '60.0',
       '53.0', '33.0', '14.0', '11.0', '49.0', '46.0', '27.0', '31.0',
       '64.0', '55.0', '70.0', '69.0', '38.0', '17.0', '4.0', '23.0',
       '35.0', '54.0', '21.0', '52.0', '16.0', '51.0', '42.0', '40.0',
       '15.0', '65.0', '18.0', '56.0', '43.0', '61.0', '13.0', '34.0',
       '6.0', '57.0', '32.0', '62.0', '67.0', '20.0', '1.0', '12.0',
       '0.83', '8.0', '7.0', '3.0', '0.8', '9.0', '5.0', '0.33', '0.17',
       '10.0', '1.5'], dtype=object)

### 열 삭제

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 열을 삭제합니다.
dataframe.drop('Age', axis=1).head(2)

Unnamed: 0,Name,PClass,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,female,1,1
1,"Allison, Miss Helen Loraine",1st,female,0,1


In [None]:
# 여러개의 열을 삭제합니다.
dataframe.drop(['Age', 'Sex'], axis=1).head(2)

Unnamed: 0,Name,PClass,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,1,1
1,"Allison, Miss Helen Loraine",1st,0,1


In [None]:
# 열인덱스 지정하여 PClass 열을 삭제합니다.
dataframe.drop(dataframe.columns[1], axis=1).head(2)

Unnamed: 0,Name,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",29.0,female,1,1
1,"Allison, Miss Helen Loraine",2.0,female,0,1


### 행 삭제

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 행을 삭제하고 처음 두 개의 행을 출력합니다.
dataframe[dataframe['Sex'] != 'male'].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# 행을 삭제하고 처음 두 개의 행을 출력합니다.
dataframe[dataframe['Name'] != 'Allison, Miss Helen Loraine'].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


In [None]:
# 행을 삭제하고 처음 두 개의 행을 출력합니다.
dataframe[dataframe.index != 0].head(2)

### 중복된 행 삭제

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 중복 행을 삭제하고 처음 두 개의 행을 출력합니다.
dataframe.drop_duplicates().head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# 행의 개수를 출력합니다.
print("원본 데이터프레임 행의 수:", len(dataframe))
print("중복 삭제 후 행의 수:", len(dataframe.drop_duplicates()))

원본 데이터프레임 행의 수: 1313
중복 삭제 후 행의 수: 1313


In [None]:
# 중복된 행을 삭제합니다. 처음 나타난 것 유지
dataframe.drop_duplicates(subset=['Sex'])

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


In [None]:
# 중복된 행을 삭제합니다. 마지막에 나타난 것 유지
dataframe.drop_duplicates(subset=['Sex'], keep='last')

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1307,"Zabour, Miss Tamini",3rd,,female,0,1
1312,"Zimmerman, Leo",3rd,29.0,male,0,0


### 값에 따라 행 그룹핑

In [None]:
# Import the library.
import pandas as pd

# Data URL.
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# Load the data.
dataframe = pd.read_csv(url)

# Group the rows by the 'Sex' column and average each numeric column.
# numeric_only=True keeps the string columns (Name, PClass) out of the
# mean; pandas 2.0+ raises on them instead of dropping them silently.
dataframe.groupby('Sex').mean(numeric_only=True)

Unnamed: 0_level_0,Age,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,29.396424,0.666667,1.0
male,31.014338,0.166863,0.0


In [None]:
# 행을 그룹핑합니다.
dataframe.groupby('Sex')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7eff8c78b940>

In [None]:
# 행을 그룹핑하고 카운팅합니다.
dataframe.groupby('Survived')['Name'].count()

Survived
0    863
1    450
Name: Name, dtype: int64

In [None]:
# 행을 그룹핑한 다음 평균을 계산합니다.
dataframe.groupby(['Sex','Survived'])['Age'].mean()

Sex     Survived
female  0           24.901408
        1           30.867143
male    0           32.320780
        1           25.951875
Name: Age, dtype: float64

### 열 원소 순회

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 처음 두 이름을 대문자로 바꾸어 출력합니다.
for name in dataframe['Name'][0:2]:
    print(name.upper())

ALLEN, MISS ELISABETH WALTON
ALLISON, MISS HELEN LORAINE


In [None]:
# 처음 두 이름을 대문자로 바꾸어 출력합니다.
[name.upper() for name in dataframe['Name'][0:2]]

['ALLEN, MISS ELISABETH WALTON', 'ALLISON, MISS HELEN LORAINE']

### 모든 열 원소에 함수 적용

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 함수를 만듭니다.
def uppercase(x):
    """Return the result of calling ``upper()`` on ``x``."""
    upper_cased = x.upper()
    return upper_cased

# 함수를 적용하고 두 개의 행을 출력합니다.
dataframe['Name'].apply(uppercase)[0:2]

0    ALLEN, MISS ELISABETH WALTON
1     ALLISON, MISS HELEN LORAINE
Name: Name, dtype: object

In [None]:
# Survived 열의 1을 Live로, 0을 Dead로 바꿉니다.
dataframe['Survived'].map({1:'Live', 0:'Dead'})[:5]

0    Live
1    Dead
2    Dead
3    Dead
4    Live
Name: Survived, dtype: object

In [None]:
# 함수의 매개변수(age)를 apply 메서드를 호출할 때 전달할 수 있습니다.
dataframe['Age'].apply(lambda x, age: x < age, age=30)[:5]

0     True
1     True
2    False
3     True
4     True
Name: Age, dtype: bool

In [None]:
# 각 열에서 가장 큰 값을 뽑습니다.
dataframe.apply(lambda x: max(x))

Name        del Carlo, Mrs Sebastiano (Argenia Genovese)
PClass                                               3rd
Age                                                   71
Sex                                                 male
Survived                                               1
SexCode                                                1
dtype: object

In [None]:
def truncate_string(x, max_length=20):
    """Return ``x`` cut to at most ``max_length`` characters if it is a string.

    Args:
        x: Any value. Only strings (including ``str`` subclasses) are
            truncated.
        max_length: Maximum number of characters to keep. Defaults to 20.

    Returns:
        The first ``max_length`` characters of ``x`` when ``x`` is a
        string; otherwise ``x`` unchanged.
    """
    # isinstance also matches str subclasses, which ``type(x) == str``
    # silently skipped.
    if isinstance(x, str):
        return x[:max_length]
    return x

# Truncate every string cell to at most 20 characters and show the
# first five rows.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas (2.1+)
# in favor of DataFrame.map — confirm the target pandas version before
# switching.
dataframe.applymap(truncate_string)[:5]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabet",1st,29.0,female,1,1
1,"Allison, Miss Helen",1st,2.0,female,0,1
2,"Allison, Mr Hudson J",1st,30.0,male,0,0
3,"Allison, Mrs Hudson",1st,25.0,female,0,1
4,"Allison, Master Huds",1st,0.92,male,1,0


### 그룹에 함수 적용

In [None]:
# 라이브러리를 임포트합니다.
import pandas as pd

# 데이터 URL
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# 데이터를 적재합니다.
dataframe = pd.read_csv(url)

# 행을 그룹핑한 다음 함수를 적용합니다.
dataframe.groupby('Sex').apply(lambda x: x.count())

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,462,462,288,462,462,462
male,851,851,468,851,851,851


### 데이터프레임 연결

In [None]:
# Import the library.
import pandas as pd

# Both frames share the same column layout.
column_order = ['id', 'first', 'last']

# First frame: ids 1-3.
data_a = dict(
    id=['1', '2', '3'],
    first=['Alex', 'Amy', 'Allen'],
    last=['Anderson', 'Ackerman', 'Ali'],
)
dataframe_a = pd.DataFrame(data_a, columns=column_order)

# Second frame: ids 4-6.
data_b = dict(
    id=['4', '5', '6'],
    first=['Billy', 'Brian', 'Bran'],
    last=['Bonder', 'Black', 'Balwner'],
)
dataframe_b = pd.DataFrame(data_b, columns=column_order)

# Stack the two frames along the row axis. Each frame keeps its own
# index labels, so 0-2 appear twice in the result.
frames = [dataframe_a, dataframe_b]
pd.concat(frames, axis=0)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner


In [None]:
# 열 방향으로 데이터프레임을 연결합니다.
pd.concat([dataframe_a, dataframe_b], axis=1)

Unnamed: 0,id,first,last,id.1,first.1,last.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner


In [None]:
# Build the new row as a Series keyed by column name.
row = pd.Series([10, 'Chris', 'Chillon'], index=['id', 'first', 'last'])

# DataFrame.append was removed in pandas 2.0. Convert the Series to a
# one-row frame and concatenate it instead. ignore_index=True renumbers
# the rows 0..n-1, and `dataframe_a` itself is left unchanged.
pd.concat([dataframe_a, row.to_frame().T], ignore_index=True)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,10,Chris,Chillon


### 데이터프레임 병합

In [None]:
# Import the library.
import pandas as pd

# Employee frame: ids 1-4 with names.
employee_data = {
    'employee_id': ['1', '2', '3', '4'],
    'name': ['Amy Jones', 'Allen Keys', 'Alice Bees', 'Tim Horton'],
}
employee_columns = ['employee_id', 'name']
dataframe_employees = pd.DataFrame(employee_data, columns=employee_columns)

# Sales frame: ids 3-6 with sales totals.
sales_data = {
    'employee_id': ['3', '4', '5', '6'],
    'total_sales': [23456, 2512, 2345, 1455],
}
sales_columns = ['employee_id', 'total_sales']
dataframe_sales = pd.DataFrame(sales_data, columns=sales_columns)

# Merge on the shared key. No `how` is given, and the output below
# contains only the ids present in both frames (3 and 4).
pd.merge(dataframe_employees, dataframe_sales, on='employee_id')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


In [None]:
# 데이터프레임을 병합합니다.
pd.merge(dataframe_employees, dataframe_sales, on='employee_id', how='outer')

Unnamed: 0,employee_id,name,total_sales
0,1,Amy Jones,
1,2,Allen Keys,
2,3,Alice Bees,23456.0
3,4,Tim Horton,2512.0
4,5,,2345.0
5,6,,1455.0


In [None]:
# 데이터프레임을 병합합니다.
pd.merge(dataframe_employees, dataframe_sales, on='employee_id', how='left')

Unnamed: 0,employee_id,name,total_sales
0,1,Amy Jones,
1,2,Allen Keys,
2,3,Alice Bees,23456.0
3,4,Tim Horton,2512.0


In [None]:
# 데이터프레임을 병합합니다.
pd.merge(dataframe_employees, dataframe_sales, on='employee_id', how='right')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512
2,5,,2345
3,6,,1455


In [None]:
# 데이터프레임을 병합합니다.
pd.merge(dataframe_employees,
         dataframe_sales,
         left_on='employee_id',
         right_on='employee_id')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


In [None]:
pd.merge(dataframe_employees,
         dataframe_sales,
         right_index=True,
         left_index=True)

Unnamed: 0,employee_id_x,name,employee_id_y,total_sales
0,1,Amy Jones,3,23456
1,2,Allen Keys,4,2512
2,3,Alice Bees,5,2345
3,4,Tim Horton,6,1455


## More - Sync with Numpy

In [None]:
import numpy as np
import pandas as pd
#random한 3행 4열의 DataFrame 생성
data_frame = pd.DataFrame(np.random.randn(3, 4))
data_frame

Unnamed: 0,0,1,2,3
0,-0.381614,1.204266,-1.547892,-0.042493
1,1.0072,1.077901,0.655597,-0.386212
2,0.582988,-0.145973,0.84508,-0.407658


In [None]:
#data_frame의 모든 원소에 log 값을 취함
new_data_frame = np.log(data_frame)


  


In [None]:
new_data_frame

Unnamed: 0,0,1,2,3
0,,0.18587,,
1,0.007175,0.075015,-0.422209,
2,-0.539588,,-0.168323,


In [None]:
#data_frame을 ndarray로 변환
nparray = np.array(data_frame)

## More - treatment with NaN

In [None]:
data_frame.dropna()

Unnamed: 0,0,1,2,3
0,-0.381614,1.204266,-1.547892,-0.042493
1,1.0072,1.077901,0.655597,-0.386212
2,0.582988,-0.145973,0.84508,-0.407658


In [None]:
data_frame.dropna(axis=1)

Unnamed: 0,0,1,2,3
0,-0.381614,1.204266,-1.547892,-0.042493
1,1.0072,1.077901,0.655597,-0.386212
2,0.582988,-0.145973,0.84508,-0.407658


In [None]:

# Drop the rows that contain NaN in column 0 or column 1.
# The original used the placeholder names 'col1'/'col2' written with
# typographic quotes, which is a SyntaxError. data_frame's columns are
# labelled 0-3, so real labels are used here.
data_frame.dropna(subset=[0, 1])

In [None]:

# Drop the columns that contain NaN in row 0 or row 2.
# With axis=1, `subset` names row labels. data_frame only has rows 0-2,
# so the original labels 3 and 5 raised KeyError.
data_frame.dropna(axis=1, subset=[0, 2])

In [None]:

#모든 NaN을 0으로 치환한다
data_frame.fillna(0)

Unnamed: 0,0,1,2,3
0,-0.381614,1.204266,-1.547892,-0.042493
1,1.0072,1.077901,0.655597,-0.386212
2,0.582988,-0.145973,0.84508,-0.407658


In [None]:
data_frame.isnull()

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False


In [None]:
# Replace each NaN with the value from the row directly above it in the
# same column. fillna(method='ffill') is deprecated since pandas 2.1;
# ffill() is the direct equivalent.
data_frame.ffill()

Unnamed: 0,0,1,2,3
0,-0.381614,1.204266,-1.547892,-0.042493
1,1.0072,1.077901,0.655597,-0.386212
2,0.582988,-0.145973,0.84508,-0.407658


In [None]:
# Replace each NaN with the value from the row directly below it in the
# same column. fillna(method='bfill') is deprecated since pandas 2.1;
# bfill() is the direct equivalent.
data_frame.bfill()

Unnamed: 0,0,1,2,3
0,-0.381614,1.204266,-1.547892,-0.042493
1,1.0072,1.077901,0.655597,-0.386212
2,0.582988,-0.145973,0.84508,-0.407658
