# 갭마인더 데이터 집합 불러오기

In [1]:
import pandas as pd
# Read the tab-separated Gapminder file into a DataFrame.
df = pd.read_csv(
    'pandas_data/gapminder.tsv',
    sep='\t',
)
# Bare expression: the notebook renders the loaded frame as the cell output.
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


# 불러온 데이터 집합 살펴보기

In [4]:
# Inspect the loaded frame: type, shape, info, ...
# NOTE(review): df.shape is evaluated but its value is not shown, because it
# is not the last expression of the cell; wrap it in print() to see it.
df.shape
# info() prints the index range, per-column non-null counts, dtypes and
# memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


# 열 단위로 데이터 추출하기

In [10]:
# Select each column in turn, passing the column name inside a list.
# NOTE(review): only the value of the last expression (gdpPercap) is rendered
# as the cell output; the five selections above it are evaluated and discarded.
df[['country']]
df[['continent']]
df[['year']]
df[['lifeExp']]
df[['pop']]
df[['gdpPercap']]

Unnamed: 0,gdpPercap
0,779.445314
1,820.853030
2,853.100710
3,836.197138
4,739.981106
...,...
1699,706.157306
1700,693.420786
1701,792.449960
1702,672.038623


# loc, iloc 속성으로 모든 행에서 원하는 열만 추출하기

In [13]:
# Label-based selection: every row (:), only the 'year' and 'pop' columns.
df.loc[:, ['year', 'pop']]

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460
...,...,...
1699,1987,9216418
1700,1992,10704340
1701,1997,11404948
1702,2002,11926563


In [15]:
# Position-based selection: every row (:), columns at positions 2 and 4
# (shown as 'year' and 'pop' in the output).
df.iloc[:, [2, 4]]

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460
...,...,...
1699,1987,9216418
1700,1992,10704340
1701,1997,11404948
1702,2002,11926563


# tail과 loc는 조금 달라요!

In [50]:
# Check the type of the object returned by tail(); the output reports a DataFrame.
type(df.tail())

pandas.core.frame.DataFrame

In [53]:
# Check the type of a single row selected by label with loc; the output
# reports a Series.
type(df.loc[1])

pandas.core.series.Series

# iloc 속성으로 행 단위 데이터 추출하기

In [40]:
# First 15 rows (positions 0-14) of the column at position 0 ('country').
# A scalar column position yields a Series, as the output shows.
df.iloc[:15, 0]

0     Afghanistan
1     Afghanistan
2     Afghanistan
3     Afghanistan
4     Afghanistan
5     Afghanistan
6     Afghanistan
7     Afghanistan
8     Afghanistan
9     Afghanistan
10    Afghanistan
11    Afghanistan
12        Albania
13        Albania
14        Albania
Name: country, dtype: object

# 파이썬 슬라이싱 구문을 조합하여 원하는 데이터 추출하기

In [47]:
# Rows at positions 1-3 (the slice end 4 is excluded) of the column at
# position 3 ('lifeExp'); the column position is given as a one-element list.
df.iloc[1:4, [3]]

Unnamed: 0,lifeExp
1,30.332
2,31.997
3,34.02


# iloc 속성과 range 함수로 원하는 데이터 추출하기

In [18]:
# Every row, columns at positions 0, 1 and 2 generated with range(3).
df.iloc[:, list(range(3))]

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972
...,...,...,...
1699,Zimbabwe,Africa,1987
1700,Zimbabwe,Africa,1992
1701,Zimbabwe,Africa,1997
1702,Zimbabwe,Africa,2002


# 열 지정값에 파이썬 슬라이싱을 사용하여 원하는 데이터 추출하기

In [46]:
# Every row, columns at positions 1-3 chosen with a slice (end 4 excluded).
df.iloc[:, 1:4]

Unnamed: 0,continent,year,lifeExp
0,Asia,1952,28.801
1,Asia,1957,30.332
2,Asia,1962,31.997
3,Asia,1967,34.020
4,Asia,1972,36.088
...,...,...,...
1699,Africa,1987,62.351
1700,Africa,1992,60.377
1701,Africa,1997,46.809
1702,Africa,2002,39.989


# loc, iloc 자유자재로 사용하기

In [36]:
# Rows at positions 1, 11 and 22; columns at positions 2 and 5
# ('year' and 'gdpPercap' in the output).
df.iloc[[1, 11, 22], [2, 5]]

Unnamed: 0,year,gdpPercap
1,1957,820.85303
11,2007,974.580338
22,2002,4604.211737


# 그룹화한 데이터의 평균 구하기

In [23]:
# Group by country, by continent and by year in turn, then average each group.
# numeric_only=True limits the mean to the numeric columns. Since pandas 2.0
# the default is False, and mean() raises TypeError when a group still holds
# string columns such as 'country' or 'continent'.
# NOTE(review): only the last expression (grouped by year) is rendered as the
# cell output; the first two results are evaluated and discarded.
df.groupby('country').mean(numeric_only=True)
df.groupby('continent').mean(numeric_only=True)
df.groupby('year').mean(numeric_only=True)

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,16950400.0,3725.276046
1957,51.507401,18763410.0,4299.408345
1962,53.609249,20421010.0,4725.812342
1967,55.67829,22658300.0,5483.653047
1972,57.647386,25189980.0,6770.082815
1977,59.570157,27676380.0,7313.166421
1982,61.533197,30207300.0,7518.901673
1987,63.212613,33038570.0,7900.920218
1992,64.160338,35990920.0,8158.608521
1997,65.014676,38839470.0,9090.175363


# 그룹화한 데이터의 개수 세어보기

In [24]:
# Count the non-null values of every remaining column within each year group.
df.groupby('year').count()

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,142,142,142,142,142
1957,142,142,142,142,142
1962,142,142,142,142,142
1967,142,142,142,142,142
1972,142,142,142,142,142
1977,142,142,142,142,142
1982,142,142,142,142,142
1987,142,142,142,142,142
1992,142,142,142,142,142
1997,142,142,142,142,142


In [25]:
# Number of rows in each year group, returned as a single int64 Series.
df.groupby('year').size()

year
1952    142
1957    142
1962    142
1967    142
1972    142
1977    142
1982    142
1987    142
1992    142
1997    142
2002    142
2007    142
dtype: int64