In [1]:
import pandas as pd

# Sample table: one column per country, indexed by year ('년도').
# The index is passed straight to the constructor instead of being
# promoted from a regular column afterwards.
df = pd.DataFrame(
    {
        "미국": [2.1, 2.2, 2.3],
        "한국": [0.4, 0.5, 0.45],
        "중국": [10, 13, 15],
    },
    index=pd.Index([2000, 2010, 2020], name="년도"),
)

# A bare last expression renders the frame as the cell output.
df

Unnamed: 0_level_0,미국,한국,중국
년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,2.1,0.4,10
2010,2.2,0.5,13
2020,2.3,0.45,15


# DataFrame

### 특정 열 가져오기
- 특정 열은 Series

In [2]:
# A single column comes back as a Series (this value itself is not printed).
usa = df['미국']

# Three routes to the same cell: row 2000, column '미국'.
print(usa[2000])              # column first, then index label
print(df.loc[2000, '미국'])    # label-based row/column lookup in one step
print(df.iloc[0]['미국'])      # first row by position, then column label

2.1
2.1
2.1


### 열 추가하기

In [3]:
# Assigning a list to a column label that does not exist yet adds the column
# '일본'; the list supplies one value per existing row.
df['일본'] = [1, 2, 3]

### 열 삭제하기
- del df['컬럼명']

In [4]:
# Remove the '일본' column from df in place.
del df['일본']

### 특정 행 가져오기
1. loc - 인덱스를 통해 값을 찾음
2. iloc - 인덱스 번호를 통해 값을 찾음

In [5]:
# Label-based lookup of the row whose index label is 2000. Its result is not
# rendered: a cell only displays the value of its last expression.
df.loc[2000]
# Position-based lookup of the first row -- the same row, and the one shown.
df.iloc[0]

미국     2.1
한국     0.4
중국    10.0
Name: 2000, dtype: float64

### 행 추가
- df.loc[new index] = list

In [6]:
# Assigning to an index label that does not exist yet appends a new row (2021);
# the list supplies one value per column, in column order.
df.loc[2021] = [1, 2, 3]
# Display the frame to confirm the new row.
df

Unnamed: 0_level_0,미국,한국,중국
년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,2.1,0.4,10
2010,2.2,0.5,13
2020,2.3,0.45,15
2021,1.0,2.0,3


### 행 삭제
- df.drop([index name])

In [7]:
# drop() returns a new frame without row 2021. The result is only displayed,
# not assigned back, so `df` itself still contains 2021 in the cells below.
df.drop([2021])

Unnamed: 0_level_0,미국,한국,중국
년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,2.1,0.4,10
2010,2.2,0.5,13
2020,2.3,0.45,15


### 컬럼 선택
- 새로운 데이터 저장소가 만들어짐

#### copy()
> - 같은 데이터를 참조하지 않고 사용하기 위함
> - 원본 데이터는 놔두고, 복사하여 데이터를 처리하는 경우가 많음.
> - 보통 원본 데이터는 온전히 보전하고, 필요한 데이터 분석시 필요한 컬럼만 가져올 때 사용

In [8]:
# Pick the two columns of interest and detach them from `df` with copy(),
# so later changes to copy_df never reach the original frame.
selected_columns = ['중국', '한국']
copy_df = df[selected_columns].copy()

# Show the copied subset.
copy_df

Unnamed: 0_level_0,중국,한국
년도,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,10,0.4
2010,13,0.5
2020,15,0.45
2021,3,2.0


### 컬럼명 변경하기

In [9]:
# Replace all column labels at once. The list is matched by position, so it
# must contain exactly one new label per existing column.
copy_df.columns = ['브라질', '아르헨티나']

### DF에서 Series 추출하기
- 하나의 컬럼만 선택하면 됨

In [10]:
# Selecting a single column label returns that column as a Series.
korea = df['한국']
# Display the Series (index, values, name and dtype).
korea

년도
2000    0.40
2010    0.50
2020    0.45
2021    2.00
Name: 한국, dtype: float64

In [12]:
# COVID hands-on exercise
import pandas as pd

# Relative directory that holds the CSSE daily report CSV files.
PATH = "COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/"

# Full path of the daily report to load.
report_file = PATH + "04-01-2020.csv"
doc = pd.read_csv(report_file, encoding='utf-8-sig')

In [14]:
# Pull the 'Country_Region' column out as a Series for closer inspection.
countries = doc['Country_Region']
# Preview the first five entries.
countries.head()

0    US
1    US
2    US
3    US
4    US
Name: Country_Region, dtype: object

### Series로 feature를 상세히 탐색
- size: 사이즈 반환
- count(): 데이터가 없는 경우를 뺀 사이즈 반환
- unique(): 유일 값 반환
- value_counts(): 데이터가 없는 경우를 제외하고 각 값의 개수를 반환

In [18]:
# size counts every entry, count() only the non-missing ones.
total_entries = countries.size
non_missing_entries = countries.count()
print(total_entries, non_missing_entries)

# Distinct values, computed once, followed by how many there are.
unique_countries = countries.unique()
print(unique_countries, len(unique_countries))

2483 2483
['US' 'Canada' 'United Kingdom' 'China' 'Netherlands' 'Australia'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma' 'Burundi' 'Cabo Verde'
 'Cambodia' 'Cameroon' 'Central African Republic' 'Chad' 'Chile'
 'Colombia' 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini'
 'Ethiopia' 'Fiji' 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana'
 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti'
 'Holy See' 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran'
 'Iraq' 'Ireland' 'I

In [19]:
# Count the rows per distinct country value; the displayed result is ordered
# from the most frequent value to the least frequent.
countries.value_counts()

US                  2228
China                 33
Canada                15
United Kingdom        10
France                10
                    ... 
Diamond Princess       1
Costa Rica             1
Lebanon                1
Bangladesh             1
Croatia                1
Name: Country_Region, Length: 180, dtype: int64

### 필요한 컬럼만 선택하기
---
- 여러 컬럼을 선택하면 별도의 데이터프레임이 됨

In [23]:
# Keep only the case-count columns. Selecting several columns yields a separate
# DataFrame; the explicit .copy() matches the copy() practice used for copy_df
# earlier in this notebook and ensures that later edits to covid_stat neither
# raise SettingWithCopyWarning nor get mistaken for edits to `doc`.
covid_stat = doc[['Confirmed', 'Deaths', 'Recovered']].copy()

# Preview the first five rows.
covid_stat.head()

Unnamed: 0,Confirmed,Deaths,Recovered
0,4,0,0
1,47,1,0
2,7,0,0
3,195,3,0
4,1,0,0


In [25]:
# Load the 04-01-2020 daily report again, using the same file and arguments as
# the earlier read into `doc`.
doc = pd.read_csv(PATH + "04-01-2020.csv", encoding='utf-8-sig')
# Display the frame; the rendered output abbreviates the middle rows with an
# ellipsis row.
doc

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"
...,...,...,...,...,...,...,...,...,...,...,...,...
2478,,,,Venezuela,2020-04-01 21:58:34,6.423800,-66.589700,143,3,41,99,Venezuela
2479,,,,Vietnam,2020-04-01 21:58:34,14.058324,108.277199,218,0,63,155,Vietnam
2480,,,,West Bank and Gaza,2020-04-01 21:58:34,31.952200,35.233200,134,1,18,115,West Bank and Gaza
2481,,,,Zambia,2020-04-01 21:58:34,-13.133897,27.849332,36,0,0,36,Zambia


### 특정 조건 row 검색
- doc[doc['컬럼명'] == '값']

In [None]:
# Boolean mask: True for every row whose Country_Region is exactly 'US'.
is_us = doc['Country_Region'] == 'US'
# Keep only the rows selected by the mask.
doc_us = doc[is_us]

### NaN 데이터 처리하기
- 결측치 유무 확인
- isnull(): 없는 데이터가 있는지 확인 (RETURN: True or False)
- sum(): 컬럼별로 없는 데이터(결측치)의 개수 확인
- Usage: isnull().sum()

In [28]:
# isnull() flags every missing cell; sum() then adds the flags up per column,
# giving the number of missing values in each column.
doc.isnull().sum()

FIPS              312
Admin2            262
Province_State    176
Country_Region      0
Last_Update         0
Lat                 1
Long_               1
Confirmed           0
Deaths              0
Recovered           0
Active              0
Combined_Key        0
dtype: int64

### 결측치 제거하기
---
- dropna(): 결측치를 가진 행 모두 삭제