In [1]:
import pandas as pd
import numpy as np

# Series 

## Series란
  - 1차원 데이터
  - 인덱스 지정 가능

In [16]:
# Build a Series of the first five perfect squares from a plain list;
# no index is given, so pandas assigns the default 0..4 integer index.
s = pd.Series(data=[k * k for k in range(1, 6)])
# Bare last expression so the notebook renders the Series.
s

0     1
1     4
2     9
3    16
4    25
dtype: int64

In [6]:
# Build a label-indexed Series: each value is paired with a string label,
# in the same order as the original dict's keys.
t = pd.Series(
    data=[1, 4, 9, 10],
    index=['one', 'four', 'nine', 'ten'],
)
# Bare last expression so the notebook renders the Series.
t

one      1
four     4
nine     9
ten     10
dtype: int64

## Series + Numpy
  - Numpy의 ndarray와 유사

In [8]:
# `s` has the default integer index, so s[1] is a label lookup that happens to
# coincide with position 1. `t` is indexed by strings, so an integer key must go
# through .iloc: plain t[2] relies on a deprecated positional fallback
# (FutureWarning since pandas 2.1; integer keys are treated as labels afterwards).
s[1], t.iloc[2]

(4, 9)

In [9]:
# Positional slice (end-exclusive, like an ndarray): rows at positions 1 and 2.
t.iloc[1:3]

four    4
nine    9
dtype: int64

In [11]:
# Boolean-mask selection: keep only the entries strictly greater than the median.
s.loc[s.gt(s.median())]

3    16
4    25
dtype: int64

In [13]:
# Select by a list of index labels; order and repeats in the list are preserved.
s.loc[[3, 2, 3]]

3    16
2     9
3    16
dtype: int64

In [18]:
# Apply the NumPy ufunc to the whole Series at once; pipe(f) simply calls f(s),
# so the result is element-wise exp with the index preserved.
s.pipe(np.exp)

0    2.718282e+00
1    5.459815e+01
2    8.103084e+03
3    8.886111e+06
4    7.200490e+10
dtype: float64

In [20]:
# Element dtype of the Series (`dtypes` is the Series alias of `dtype`).
s.dtypes

dtype('int64')

## Series + dict
  - dict와 유사

### 인덱싱, 슬라이싱 

In [22]:
# Dict-style lookup of a single value by its index label.
t.loc['one']

1

In [24]:
# Label-based slice: unlike positional slicing, the end label is included.
t.loc['one':'nine']

one     1
four    4
nine    9
dtype: int64

### 데이터 추가 

In [25]:
# Assigning to a label that is not yet in the index appends a new entry.
t.loc['six'] = 6
# Show the enlarged Series.
t

one      1
four     4
nine     9
ten     10
six      6
dtype: int64

### Series.get()

In [27]:
# Membership on a Series tests the index labels (like dict keys), not the values.
'six' in t.index

True

In [34]:
# Looking up a label that is not in the index raises KeyError. Catch it and
# print the error so the failure is still shown without aborting a
# top-to-bottom "Restart & Run All".
try:
    t['seven']
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: 'seven'

In [35]:
# Safe alternatives to t['seven']: test membership first, or use get() with a
# fallback value that is returned when the label is absent.
('seven' in t.index, t.get('seven', default=0))

(False, 0)

# DataFrame

## DataFrame이란
  - 2차원 데이터 테이블
  - 인덱스 지정 가능

## 생성
  - `pd.DataFrame()`
  - 참고
    - 데이터 타입 조회 : `DataFrame.dtypes`

### dict to DataFrame

In [2]:
# Column-oriented input: each key becomes a column name and each list holds
# that column's values.
dic = dict(
    height=[1, 2, 3, 4],
    weight=[30, 40, 50, 60],
)
# No index is passed, so the rows get the default integer index.
df = pd.DataFrame(data=dic)
# Bare last expression so the notebook renders the table.
df

Unnamed: 0,height,weight
0,1,30
1,2,40
2,3,50
3,4,60


### csv to DataFrame
  - `pd.read_csv()`

In [4]:
# Read the CSV (path is relative to the current working directory) into a DataFrame.
csvDF = pd.read_csv(filepath_or_buffer='./country_wise_latest.csv')
# Bare last expression so the notebook renders the (truncated) table.
csvDF

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.50,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.00,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.60,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,West Bank and Gaza,10621,78,3752,6791,152,2,0,0.73,35.33,2.08,8916,1705,19.12,Eastern Mediterranean
183,Western Sahara,10,1,8,1,0,0,0,10.00,80.00,12.50,10,0,0.00,Africa
184,Yemen,1691,483,833,375,10,4,36,28.56,49.26,57.98,1619,72,4.45,Eastern Mediterranean
185,Zambia,4552,140,2815,1597,71,1,465,3.08,61.84,4.97,3326,1226,36.86,Africa


In [5]:
# Per-column dtypes of the frame built from the dict above (one entry per column).
df.dtypes

height    int64
weight    int64
dtype: object

In [7]:
# Per-column dtypes that read_csv assigned to the loaded CSV columns.
csvDF.dtypes

Country/Region             object
Confirmed                   int64
Deaths                      int64
Recovered                   int64
Active                      int64
New cases                   int64
New deaths                  int64
New recovered               int64
Deaths / 100 Cases        float64
Recovered / 100 Cases     float64
Deaths / 100 Recovered    float64
Confirmed last week         int64
1 week change               int64
1 week % increase         float64
WHO Region                 object
dtype: object

## 활용 1. 일부분 조회

### head
  - 데이터프레임에서 위부터 지정한 개수만큼 조회 (기본값은 5)
  - `DataFrame.head(count)`

In [11]:
# First rows of the frame; n=5 spells out head()'s default.
csvDF.head(n=5)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


### tail
  - 데이터프레임에서 아래부터 지정한 개수만큼 조회 (기본값은 5)
  - `DataFrame.tail(count)`

In [12]:
# Last seven rows of the frame.
csvDF.tail(n=7)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
180,Venezuela,15988,146,9959,5883,525,4,213,0.91,62.29,1.47,12334,3654,29.63,Americas
181,Vietnam,431,0,365,66,11,0,0,0.0,84.69,0.0,384,47,12.24,Western Pacific
182,West Bank and Gaza,10621,78,3752,6791,152,2,0,0.73,35.33,2.08,8916,1705,19.12,Eastern Mediterranean
183,Western Sahara,10,1,8,1,0,0,0,10.0,80.0,12.5,10,0,0.0,Africa
184,Yemen,1691,483,833,375,10,4,36,28.56,49.26,57.98,1619,72,4.45,Eastern Mediterranean
185,Zambia,4552,140,2815,1597,71,1,465,3.08,61.84,4.97,3326,1226,36.86,Africa
186,Zimbabwe,2704,36,542,2126,192,2,24,1.33,20.04,6.64,1713,991,57.85,Africa


## 활용 2. 데이터 접근
  - `DataFrame['column_name']`
  - `DataFrame.column_name`
    - 컬럼명에 공백 등이 포함되어 파이썬 식별자로 쓸 수 없는 경우에는 사용할 수 없음

In [19]:
# Select one column by name (all rows); works for any column name, spaces included.
csvDF.loc[:, 'WHO Region']

0      Eastern Mediterranean
1                     Europe
2                     Africa
3                     Europe
4                     Africa
               ...          
182    Eastern Mediterranean
183                   Africa
184    Eastern Mediterranean
185                   Africa
186                   Africa
Name: WHO Region, Length: 187, dtype: object

In [17]:
csvDF.WHO Region  # intentionally invalid: attribute access cannot express a column name containing a space (SyntaxError); use csvDF['WHO Region']

SyntaxError: unexpected character after line continuation character (<ipython-input-17-2626263dfbde>, line 1)

In [18]:
# Attribute-style column access; possible here because "Active" is a valid Python identifier.
csvDF.Active

0      9796
1      1991
2      7973
3        52
4       667
       ... 
182    6791
183       1
184     375
185    1597
186    2126
Name: Active, Length: 187, dtype: int64

## DataFrame과 Series의 관계
  - DataFrame의 각 column은 Series로 구성

In [21]:
# A single column pulled out of a DataFrame is a Series.
type(csvDF.loc[:, 'WHO Region'])

pandas.core.series.Series

## 활용 3. 조건을 통한 데이터 접근 
  - `DataFrame[조건]`
  - 참고
    - column의 값을 중복을 제외하고 조회 : `DataFrame['column_name'].unique()`

### 신규 확진자가 100명이 넘는 나라 

In [25]:
# Keep the rows whose 'New cases' value is strictly above 100, then preview the first five.
csvDF.loc[csvDF['New cases'].gt(100)].head(n=5)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
6,Argentina,167416,3059,72575,91782,4890,120,2057,1.83,43.35,4.21,130774,36642,28.02,Americas
8,Australia,15303,167,9311,5825,368,6,137,1.09,60.84,1.79,12428,2875,23.13,Western Pacific


### WHO 지역이 동남아시아인 나라 

In [26]:
# Distinct WHO Region values, in order of first appearance.
pd.unique(csvDF['WHO Region'])

array(['Eastern Mediterranean', 'Europe', 'Africa', 'Americas',
       'Western Pacific', 'South-East Asia'], dtype=object)

In [27]:
# Keep only the rows whose WHO Region equals 'South-East Asia'.
csvDF.loc[csvDF['WHO Region'].eq('South-East Asia')]

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
13,Bangladesh,226225,2965,125683,97577,2772,37,1801,1.31,55.56,2.36,207453,18772,9.05,South-East Asia
19,Bhutan,99,0,86,13,4,0,1,0.0,86.87,0.0,90,9,10.0,South-East Asia
27,Burma,350,6,292,52,0,0,2,1.71,83.43,2.05,341,9,2.64,South-East Asia
79,India,1480073,33408,951166,495499,44457,637,33598,2.26,64.26,3.51,1155338,324735,28.11,South-East Asia
80,Indonesia,100303,4838,58173,37292,1525,57,1518,4.82,58.0,8.32,88214,12089,13.7,South-East Asia
106,Maldives,3369,15,2547,807,67,0,19,0.45,75.6,0.59,2999,370,12.34,South-East Asia
119,Nepal,18752,48,13754,4950,139,3,626,0.26,73.35,0.35,17844,908,5.09,South-East Asia
158,Sri Lanka,2805,11,2121,673,23,0,15,0.39,75.61,0.52,2730,75,2.75,South-East Asia
167,Thailand,3297,58,3111,128,6,0,2,1.76,94.36,1.86,3250,47,1.45,South-East Asia
168,Timor-Leste,24,0,0,24,0,0,0,0.0,0.0,0.0,24,0,0.0,South-East Asia


## 활용 4. 행을 기준으로 데이터 접근
  - 인덱스 활용 : `DataFrame.loc[row, col]`
  - 숫자 인덱스 활용 : `DataFrame.iloc[rowidx, colidx]`
    - Numpy 슬라이싱 활용가능

In [28]:
# Column-oriented data for three books.
books_dict = {
    'Available': [True] * 3,
    'Location': [132, 486, 8086],
    'Genre': ['Programming', 'Physics', 'Math'],
}
# Use the book titles as explicit row labels instead of the default integer index.
booksDF = pd.DataFrame(
    data=books_dict,
    index=['버그란 무엇인가', '두근세근 물리학', '나혼자 적분'],
)
# Bare last expression so the notebook renders the table.
booksDF

Unnamed: 0,Available,Location,Genre
버그란 무엇인가,True,132,Programming
두근세근 물리학,True,486,Physics
나혼자 적분,True,8086,Math


### 인덱스 활용

In [32]:
# Selecting a single row by its label yields a Series whose index is the column names.
print(type(booksDF.loc['버그란 무엇인가', :]))
# Display that row.
booksDF.loc['버그란 무엇인가', :]

<class 'pandas.core.series.Series'>


Available           True
Location             132
Genre        Programming
Name: 버그란 무엇인가, dtype: object

In [33]:
# Single cell addressed by (row label, column label).
booksDF.at['나혼자 적분', 'Available']

True

### 숫자 인덱스 활용 

In [35]:
# Single cell addressed by (row position, column position): first row, second column.
booksDF.iat[0, 1]

132

In [36]:
# NumPy-style positional slicing (end-exclusive): rows 0-1 and columns 1-2.
booksDF.iloc[:2, 1:3]

Unnamed: 0,Location,Genre
버그란 무엇인가,132,Programming
두근세근 물리학,486,Physics


## 활용 5. groupby()
  - Split : 특정 기준을 바탕으로 DF를 분할
  - Apply : 통계함수 - sum(), mean(), median(), ... 등을 통해 각 데이터를 압축
  - `DataFrame.groupby('column_name')`

### WHO Region별 확진자 수
  1. DF에서 원하는 column을 추출
  2. 추출한 데이터에 대해 묶으려는 column을 기준으로 그룹화

In [37]:
# Take the 'Confirmed' column first, then group its values by each row's WHO Region.
# The result is a lazy groupby object; nothing is aggregated until sum()/mean() is called.
covid_region = csvDF['Confirmed'].groupby(csvDF['WHO Region'])

In [39]:
# Total confirmed cases per WHO Region.
covid_region.agg('sum')

WHO Region
Africa                    723207
Americas                 8839286
Eastern Mediterranean    1490744
Europe                   3299523
South-East Asia          1835297
Western Pacific           292428
Name: Confirmed, dtype: int64

### WHO Region별 확진자 수 평균
  - 한 국가당 확진자 수

In [40]:
# Mean confirmed cases per country within each WHO Region.
covid_region.agg('mean')

WHO Region
Africa                    15066.812500
Americas                 252551.028571
Eastern Mediterranean     67761.090909
Europe                    58920.053571
South-East Asia          183529.700000
Western Pacific           18276.750000
Name: Confirmed, dtype: float64