In [1]:
# Series
import pandas as pd
import numpy as np

data = pd.Series([1,2,3])
data

0    1
1    2
2    3
dtype: int64

In [2]:
data = pd.Series([1,2,3,4],index=['a','b','c','d'])

data['b']

2

In [4]:
data = pd.Series([1,2,3,4],index=['a','b','c','d'], name='Title')
data['c'] = 5
data

a    1
b    2
c    5
d    4
Name: Title, dtype: int64

In [7]:
population_dict = {
    'korea': 5180,
    'japan': 12718,
    'china': 141500,
    'usa': 32676
}
population = pd.Series(population_dict)
population

korea      5180
japan     12718
china    141500
usa       32676
dtype: int64

In [9]:
# 데이터 프레임
gdp_dict = {
    'korea': 169320000,
    'japan': 516700000,
    'china': 1409250000,
    'usa': 2041280000
}

gdp = pd.Series(gdp_dict)
country = pd.DataFrame({
    'population':population,
    'gdp' : gdp
})

country

Unnamed: 0,population,gdp
korea,5180,169320000
japan,12718,516700000
china,141500,1409250000
usa,32676,2041280000


In [14]:
# 딕셔너리로 변환
print(country.index)
print(country.columns)

print(country['gdp'])
type(country['gdp'])

Index(['korea', 'japan', 'china', 'usa'], dtype='object')
Index(['population', 'gdp'], dtype='object')
korea     169320000
japan     516700000
china    1409250000
usa      2041280000
Name: gdp, dtype: int64


pandas.core.series.Series

In [18]:
# Series도 array처럼 연산자 활용
gdp_per_capita = country['gdp'] / country['population']
country['gdp per capita'] = gdp_per_capita
print(gdp_per_capita)
country


korea    32687.258687
japan    40627.457147
china     9959.363958
usa      62470.314604
dtype: float64


Unnamed: 0,population,gdp,gdp per capita
korea,5180,169320000,32687.258687
japan,12718,516700000,40627.457147
china,141500,1409250000,9959.363958
usa,32676,2041280000,62470.314604


In [19]:
# 저장하기
country.to_csv('./country.csv')
# csv (comma separated value)

In [20]:
# 불러오기
pd.read_csv('./country.csv')

Unnamed: 0.1,Unnamed: 0,population,gdp,gdp per capita
0,korea,5180,169320000,32687.258687
1,japan,12718,516700000,40627.457147
2,china,141500,1409250000,9959.363958
3,usa,32676,2041280000,62470.314604


In [22]:
# indexing / slicing    
# .loc[행, 열] , iloc[행,열]
country.loc['china']



population        1.415000e+05
gdp               1.409250e+09
gdp per capita    9.959364e+03
Name: china, dtype: float64

In [25]:
country.loc['japan':'china', :'population']

Unnamed: 0,population
japan,12718
china,141500


In [24]:
country.iloc[0]
country.iloc[1:3, :2]

Unnamed: 0,population,gdp
japan,12718,516700000
china,141500,1409250000


In [29]:
# 데이터 프레임 새 데이터 추가 / 수정

dataframe = pd.DataFrame(columns=['이름'
,
'나이'
,
'주소'])
dataframe.loc[0] = ['임원균'
, '26',
'서울']
dataframe.loc[1] = {'이름':'철수'
,
'나이':'25',
'주소':'인천'}
dataframe


Unnamed: 0,이름,나이,주소
0,임원균,26,서울
1,철수,25,인천


In [30]:
dataframe.loc[1,
'이름'] = '영희'

In [31]:
dataframe

Unnamed: 0,이름,나이,주소
0,임원균,26,서울
1,영희,25,인천


In [32]:
# 새로운 컬럼 추가
dataframe['전화번호'] = np.nan
dataframe.loc[0, '전화번호'] = '01012341234'


  dataframe.loc[0, '전화번호'] = '01012341234'


In [33]:
len(dataframe)

2

In [34]:
dataframe

Unnamed: 0,이름,나이,주소,전화번호
0,임원균,26,서울,1012341234.0
1,영희,25,인천,


In [35]:
# 컬럼 선택하기
dataframe['이름']

0    임원균
1     영희
Name: 이름, dtype: object

In [37]:
dataframe[['이름','주소','나이']]

Unnamed: 0,이름,주소,나이
0,임원균,서울,26
1,영희,인천,25


In [38]:
# 누락된 데이터 체크
dataframe.isnull()

Unnamed: 0,이름,나이,주소,전화번호
0,False,False,False,False
1,False,False,False,True


In [40]:
dataframe.notnull()

Unnamed: 0,이름,나이,주소,전화번호
0,True,True,True,True
1,True,True,True,False


In [41]:
dataframe.dropna()

Unnamed: 0,이름,나이,주소,전화번호
0,임원균,26,서울,1012341234


In [43]:
dataframe['전화번호'] = dataframe['전화번호'].fillna('전화번호 없음')
dataframe

Unnamed: 0,이름,나이,주소,전화번호
0,임원균,26,서울,01012341234
1,영희,25,인천,전화번호 없음


In [49]:
# Series 연산
A = pd.Series([2,4,6],index=[0,1,2])
B = pd.Series([1,3,5],index=[1,2,3])

print(A)
print(B)


0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


In [50]:
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [51]:
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [56]:
A = pd.DataFrame(np.random.randint(0, 10, (2, 2)), columns=list("AB"))
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)), columns=list("BAC"))

print(A)
print()
print(B)

   A  B
0  8  4
1  6  9

   B  A  C
0  5  5  8
1  6  5  2
2  6  0  7


In [53]:
A + B

Unnamed: 0,A,B,C
0,11.0,10.0,
1,10.0,7.0,
2,,,


In [54]:
A.add(B, fill_value=0)

Unnamed: 0,A,B,C
0,11.0,10.0,9.0
1,10.0,7.0,0.0
2,6.0,4.0,6.0


In [58]:
# 집계함수
data = {
    'A':[i+5 for i in range(3)],
    'B':[i**2 for i in range(3)]
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,5,0
1,6,1
2,7,4


In [59]:
df['A'].sum()

18

In [60]:
df.sum()

A    18
B     5
dtype: int64

In [62]:
df.mean()

A    6.000000
B    1.666667
dtype: float64

In [63]:
# 값으로 정렬하기
df = pd.DataFrame({
'col1' : [2, 1, 9, 8, 7, 4],
'col2' : ['A', 'A', 'B', np.nan, 'D', 'C'],
'col3': [0, 1, 9, 4, 2, 3],
})

In [67]:
df.sort_values('col1')

Unnamed: 0,col1,col2,col3
1,1,A,1
0,2,A,0
5,4,C,3
4,7,D,2
3,8,,4
2,9,B,9


In [68]:
df.sort_values('col1',ascending=False)

Unnamed: 0,col1,col2,col3
2,9,B,9
3,8,,4
4,7,D,2
5,4,C,3
0,2,A,0
1,1,A,1


In [69]:
df.sort_values(['col2','col1'])

Unnamed: 0,col1,col2,col3
1,1,A,1
0,2,A,0
2,9,B,9
5,4,C,3
4,7,D,2
3,8,,4
