In [48]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
obj1 = Series([1, 'A', True])
obj1

0       1
1       A
2    True
dtype: object

In [3]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
sdata = dict(Utah=35000, Oregon=71000, Texas=16000, Ohio=5000)
obj3 = Series(data=sdata)
obj3

Utah      35000
Oregon    71000
Texas     16000
Ohio       5000
dtype: int64

In [4]:
# Explicit string labels replace the default integer index.
obj4 = Series(data=[4, 7, -5, 3], index=list('abcd'))
obj4

a    4
b    7
c   -5
d    3
dtype: int64

In [8]:
# Print the underlying array, the index object and the element dtype in turn.
for attribute in (obj4.values, obj4.index, obj4.dtypes):
    print(attribute)

[ 4  7 -5  3]
Index(['a', 'b', 'c', 'd'], dtype='object')
int64


같은 길이의 리스트를 값으로 갖는 딕셔너리로 데이터프레임을 생성할 수 있음. 리스트들의 길이가 서로 다르면 오류가 발생.

In [9]:
# Column-oriented records: each key is a column name and each list holds
# that column's five values.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}

In [10]:
frame = DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [11]:
DataFrame(data, columns=['year', 'pop', 'state'])

Unnamed: 0,year,pop,state
0,2000,1.5,Ohio
1,2001,1.7,Ohio
2,2002,3.6,Ohio
3,2001,2.4,Nevada
4,2002,2.9,Nevada


In [12]:
frame2 = DataFrame(data, index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,state,year,pop
one,Ohio,2000,1.5
two,Ohio,2001,1.7
three,Ohio,2002,3.6
four,Nevada,2001,2.4
five,Nevada,2002,2.9


중첩된 딕셔너리로도 데이터프레임을 생성할 수 있음. 바깥쪽 키는 열(column) 이름이 되고 안쪽 키는 행 인덱스가 되며, 한쪽 열에만 있는 인덱스의 값은 NaN으로 채워짐.

In [13]:
pop = {'Nevada':{2001:2.4, 2002:2.9}, 'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [14]:
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


Numpy 배열 객체로도 데이터프레임 생성이 가능.

In [16]:
# Wrap a 3x3 NumPy array, labelling the rows and columns explicitly.
grid = np.arange(9).reshape(3, 3)
num = DataFrame(grid, index=['a', 'b', 'd'], columns=['x', 'y', 'z'])
num

Unnamed: 0,x,y,z
a,0,1,2
b,3,4,5
d,6,7,8


색인 객체(index object)를 따로 저장할 수 있음.

In [17]:
obj = Series(range(3), index=['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [18]:
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [21]:
index[2] 

'c'

In [22]:
index[1:]

Index(['b', 'c'], dtype='object')

In [23]:
'a' in index

True

In [25]:
# An Index does not allow its individual elements to be changed: item
# assignment raises TypeError. Catch and print the error so this
# demonstration does not abort a "Restart & Run All".
try:
    index[0] = 'A'
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Index does not support mutable operations

In [26]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [27]:
'Ohio' in frame3.columns

True

In [28]:
2003 in frame3.index

False

Numpy의 배열과 유사하게, Series와 DataFrame도 인덱싱과 슬라이싱이 가능.

In [3]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [4]:
obj['a']

0.0

In [5]:
# `obj` has string labels, so a bare integer key relies on the deprecated
# positional fallback of Series.__getitem__. Use .iloc for position-based
# access; the result (the last element) is unchanged.
obj.iloc[-1]

3.0

In [6]:
obj[['a', 'c']]

a    0.0
c    2.0
dtype: float64

In [7]:
# Select by integer position explicitly: on a string-labelled Series a list
# of integers passed to [] depends on the deprecated positional fallback.
obj.iloc[[0, 2]]

a    0.0
c    2.0
dtype: float64

정수 위치로 슬라이싱할 때는 파이썬 문법과 동일하게 끝점이 제외되지만, 인덱스 라벨로 슬라이싱하면 끝점이 포함됨.

In [8]:
obj['a':'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [9]:
# 4x4 frame holding 0..15 with word row labels and place-name columns.
data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['one', 'two', 'three', 'four'],
    columns=['Ohio', 'Texas', 'Seattle', 'Portland'],
)
data

Unnamed: 0,Ohio,Texas,Seattle,Portland
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
four,12,13,14,15


In [10]:
data['Seattle']

one       2
two       6
three    10
four     14
Name: Seattle, dtype: int64

In [11]:
data.Seattle

one       2
two       6
three    10
four     14
Name: Seattle, dtype: int64

In [12]:
data[['Ohio', 'Seattle']]

Unnamed: 0,Ohio,Seattle
one,0,2
two,4,6
three,8,10
four,12,14


In [13]:
data[:2]

Unnamed: 0,Ohio,Texas,Seattle,Portland
one,0,1,2,3
two,4,5,6,7


In [14]:
data['one':'two']

Unnamed: 0,Ohio,Texas,Seattle,Portland
one,0,1,2,3
two,4,5,6,7


In [15]:
data < 5

Unnamed: 0,Ohio,Texas,Seattle,Portland
one,True,True,True,True
two,True,False,False,False
three,False,False,False,False
four,False,False,False,False


In [16]:
data[data<5]

Unnamed: 0,Ohio,Texas,Seattle,Portland
one,0.0,1.0,2.0,3.0
two,4.0,,,
three,,,,
four,,,,


In [17]:
# Overwrite every element smaller than 5 with 0; this modifies `data` itself.
below_five = data < 5
data[below_five] = 0
data

Unnamed: 0,Ohio,Texas,Seattle,Portland
one,0,0,0,0
two,0,5,6,7
three,8,9,10,11
four,12,13,14,15


In [18]:
data.iat[2,3]

11

In [20]:
data.loc['three', 'Portland']

11

In [21]:
data.loc[:, ['Ohio', 'Portland']]

Unnamed: 0,Ohio,Portland
one,0,0
two,0,7
three,8,11
four,12,15


In [22]:
data.iloc[2,3]

11

In [23]:
data.iloc[0:2, 0:2]

Unnamed: 0,Ohio,Texas
one,0,0
two,0,5


재색인(reindex)도 가능.

In [24]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [26]:
obj2 = obj.reindex(['a', 'b', 'c', 'd'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
dtype: float64

In [27]:
obj3 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj3

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [28]:
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'Seattle'])
frame

Unnamed: 0,Ohio,Texas,Seattle
a,0,1,2
c,3,4,5
d,6,7,8


In [29]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,Seattle
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [30]:
frame3 = frame2.reindex(columns = ['Texas', 'Seattle', 'Ohio'])
frame3

Unnamed: 0,Texas,Seattle,Ohio
a,1.0,2.0,0.0
b,,,
c,4.0,5.0,3.0
d,7.0,8.0,6.0


행과 열 제거(drop)가 가능.

In [31]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [32]:
obj.drop('d')

b    7.2
a   -5.3
c    3.6
dtype: float64

In [33]:
obj.drop(['d', 'b'])

a   -5.3
c    3.6
dtype: float64

In [34]:
frame2

Unnamed: 0,Ohio,Texas,Seattle
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [36]:
frame2.drop('b')

Unnamed: 0,Ohio,Texas,Seattle
a,0.0,1.0,2.0
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [37]:
frame2.drop(['b','a'])

Unnamed: 0,Ohio,Texas,Seattle
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [38]:
frame2.drop(['Texas', 'Ohio'], axis=1)

Unnamed: 0,Seattle
a,2.0
b,
c,5.0
d,8.0


In [45]:
# Keep the raw dict under its own name so that `data` only ever refers to the
# DataFrame (the original rebound `data` from a dict to a frame).
city_records = {'city': ['Seoul', 'Busan', 'Gwangju', 'Daegu'],
                'year': [2000, 2001, 2002, 2002],
                'pop': [4000, 2000, 1000, 1000]}

# Rows are labelled 'One'..'Four' instead of the default integer index.
data = DataFrame(city_records, index=['One', 'Two', 'Three', 'Four'])
data

Unnamed: 0,city,year,pop
One,Seoul,2000,4000
Two,Busan,2001,2000
Three,Gwangju,2002,1000
Four,Daegu,2002,1000


In [46]:
data.loc['Five'] = ['Incheon', 2002, 1000]
data

Unnamed: 0,city,year,pop
One,Seoul,2000,4000
Two,Busan,2001,2000
Three,Gwangju,2002,1000
Four,Daegu,2002,1000
Five,Incheon,2002,1000


In [47]:
# Append a row labelled 'Six' by assigning to a label that does not exist yet.
# NOTE: in the output recorded below, the integer columns `year` and `pop`
# are shown as floats after this assignment, whereas the previous cell's
# `data.loc['Five'] = [...]` (no column slice) left them as integers.
data.loc['Six',:] = ['Gyoenggi', 2002, 1000]
data

Unnamed: 0,city,year,pop
One,Seoul,2000.0,4000.0
Two,Busan,2001.0,2000.0
Three,Gwangju,2002.0,1000.0
Four,Daegu,2002.0,1000.0
Five,Incheon,2002.0,1000.0
Six,Gyoenggi,2002.0,1000.0


In [48]:
data["debt"] = [10, 20, 30, 40, 50, 60]
data

Unnamed: 0,city,year,pop,debt
One,Seoul,2000.0,4000.0,10
Two,Busan,2001.0,2000.0,20
Three,Gwangju,2002.0,1000.0,30
Four,Daegu,2002.0,1000.0,40
Five,Incheon,2002.0,1000.0,50
Six,Gyoenggi,2002.0,1000.0,60


In [50]:
data['Sum'] = data['pop'] + data['debt'] * 1.06
data

Unnamed: 0,city,year,pop,debt,Sum
One,Seoul,2000.0,4000.0,10,4010.6
Two,Busan,2001.0,2000.0,20,2021.2
Three,Gwangju,2002.0,1000.0,30,1031.8
Four,Daegu,2002.0,1000.0,40,1042.4
Five,Incheon,2002.0,1000.0,50,1053.0
Six,Gyoenggi,2002.0,1000.0,60,1063.6


Numpy에서 가능한 연산들은 Series, DataFrame을 이용한 연산에도 적용이 가능.
이제 산술연산(arithmetic operation)에 대해 알아볼 것.

In [2]:
# `a` uses the default integer index; `b` carries explicit string labels.
a = Series([1, 2, 3, 4])
b = Series([30, 20, 10, 40], index=list('cbad'))
print(a)
print(b)

0    1
1    2
2    3
3    4
dtype: int64
c    30
b    20
a    10
d    40
dtype: int64


In [3]:
print(a//2)

0    0
1    1
2    1
3    2
dtype: int64


In [6]:
# Two Series whose labels only partly overlap ('a', 'c' and 'e' are shared).
s1 = Series(np.arange(5), index=list('abcde'))
s2 = Series(np.arange(5) * 10, index=list('acefg'))
s2

a     0
c    10
e    20
f    30
g    40
dtype: int64

In [7]:
s1 + s2

a     0.0
b     NaN
c    12.0
d     NaN
e    24.0
f     NaN
g     NaN
dtype: float64

In [8]:
x = DataFrame(np.arange(12).reshape(3,4), columns=list('abcd'))
x

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [9]:
print(x**2)

    a   b    c    d
0   0   1    4    9
1  16  25   36   49
2  64  81  100  121


In [10]:
y = DataFrame(np.arange(20).reshape(4,5), columns=list('abcde'))
y

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [11]:
x+y

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [13]:
# A Series labelled a-d and a 4x4 frame whose column names are the same
# labels.
s3 = Series(np.array([10, 20, 10, 20]), index=list('abcd'))
df3 = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Seattle', 'Portland', 'Texas'],
    columns=list('abcd'),
)




In [14]:
s3

a    10
b    20
c    10
d    20
dtype: int64

In [15]:
df3

Unnamed: 0,a,b,c,d
Ohio,0,1,2,3
Seattle,4,5,6,7
Portland,8,9,10,11
Texas,12,13,14,15


In [16]:
s3 + df3

Unnamed: 0,a,b,c,d
Ohio,10,21,12,23
Seattle,14,25,16,27
Portland,18,29,20,31
Texas,22,33,24,35


In [17]:
print(x)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [18]:
np.exp(x)

Unnamed: 0,a,b,c,d
0,1.0,2.718282,7.389056,20.085537
1,54.59815,148.413159,403.428793,1096.633158
2,2980.957987,8103.083928,22026.465795,59874.141715


정렬(sort)과 순위(rank) 계산이 가능. 정렬에는 sort_index와 sort_values를, 순위 계산에는 rank를 이용.

In [4]:
obj = Series(np.array([7,2,1,3]), index=['d', 'a', 'b', 'c'])
obj

d    7
a    2
b    1
c    3
dtype: int64

In [20]:
obj.sort_index()

a    2
b    1
c    3
d    7
dtype: int64

In [22]:
obj.sort_index(ascending=False)

d    7
c    3
b    1
a    2
dtype: int64

In [23]:
obj.sort_values()

b    1
a    2
c    3
d    7
dtype: int64

In [24]:
obj.sort_values(ascending=False)

d    7
c    3
a    2
b    1
dtype: int64

In [8]:
frame = DataFrame(np.arange(8).reshape((2,4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [26]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [27]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [28]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [30]:
frame.sort_values(by='a', ascending=False)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [31]:
frame.sort_values(by=['d'])

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [34]:
# Seed the global NumPy RNG so the random values are repeatable, then build
# a 6x4 frame indexed by six consecutive days starting 2023-01-12.
np.random.seed(100)
noise = np.random.randn(6, 4)
dates = pd.date_range("20230112", periods=6)
data2 = DataFrame(noise, columns=['a', 'b', 'c', 'd'], index=dates)
data2

Unnamed: 0,a,b,c,d
2023-01-12,-1.749765,0.34268,1.153036,-0.252436
2023-01-13,0.981321,0.514219,0.22118,-1.070043
2023-01-14,-0.189496,0.255001,-0.458027,0.435163
2023-01-15,-0.583595,0.816847,0.672721,-0.104411
2023-01-16,-0.53128,1.029733,-0.438136,-1.118318
2023-01-17,1.618982,1.541605,-0.251879,-0.842436


In [6]:
obj

d    7
a    2
b    1
c    3
dtype: int64

In [7]:
obj.rank()

d    4.0
a    2.0
b    1.0
c    3.0
dtype: float64

In [9]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [10]:
frame.rank()

Unnamed: 0,d,a,b,c
three,1.0,1.0,1.0,1.0
one,2.0,2.0,2.0,2.0


In [11]:
frame.rank(axis=1)

Unnamed: 0,d,a,b,c
three,1.0,2.0,3.0,4.0
one,1.0,2.0,3.0,4.0


In [12]:
# Keep the raw nested list under its own name so that `data` only ever refers
# to the DataFrame (the original rebound `data` from a list to a frame).
rows = [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]]
data = DataFrame(rows, columns=['one', 'two'], index=['a', 'b', 'c', 'd'])
data

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [13]:
data.sum()

one    9.25
two   -5.80
dtype: float64

In [14]:
data.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [15]:
data.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [16]:
data.sum(axis=0, skipna=False)

one   NaN
two   NaN
dtype: float64

In [17]:
data['one'].sum()

9.25

In [18]:
# 4x2 frame holding 0..7, rows labelled a-d.
df = DataFrame(
    np.arange(8).reshape(4, 2),
    index=list('abcd'),
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5
d,6,7


In [19]:
df.min()

one    0
two    1
dtype: int64

In [20]:
df.min(axis=1)

a    0
b    2
c    4
d    6
dtype: int64

In [21]:
df.idxmin()

one    a
two    a
dtype: object

In [22]:
df.idxmin(axis=1)

a    one
b    one
c    one
d    one
dtype: object

In [23]:
df.quantile()

one    3.0
two    4.0
Name: 0.5, dtype: float64

In [24]:
df.quantile(0.75)

one    4.5
two    5.5
Name: 0.75, dtype: float64

In [25]:
df.quantile([0, 0.25, 0.5, 0.75])

Unnamed: 0,one,two
0.0,0.0,1.0
0.25,1.5,2.5
0.5,3.0,4.0
0.75,4.5,5.5


In [26]:
df.describe()

Unnamed: 0,one,two
count,4.0,4.0
mean,3.0,4.0
std,2.581989,2.581989
min,0.0,1.0
25%,1.5,2.5
50%,3.0,4.0
75%,4.5,5.5
max,6.0,7.0


In [27]:
df.cumsum()

Unnamed: 0,one,two
a,0,1
b,2,4
c,6,9
d,12,16


In [28]:
df

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5
d,6,7


In [29]:
# Keep only the values greater than 1 (the rest become NaN), then sum each column.
df.where(df > 1).sum()

one    12.0
two    15.0
dtype: float64

In [30]:
data

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [31]:
data.isnull().sum()

one    1
two    2
dtype: int64

In [32]:
data.isnull().all()

one    False
two    False
dtype: bool

In [33]:
# For each row, report whether every value in it is missing.
data.isnull().all(axis=1)

a    False
b    False
c     True
d    False
dtype: bool

In [34]:
s1 = Series([1,2,3])
s2 = Series([2,4,6])
s3 = Series([-1,-2,-3])

In [35]:
s1.corr(s2)

1.0

In [36]:
print(s1.cov(s2))
print(s2.cov(s3))

2.0
-2.0


In [38]:
df = pd.concat([s1, s2, s3], axis=1)
df

Unnamed: 0,0,1,2
0,1,2,-1
1,2,4,-2
2,3,6,-3


In [39]:
df.cov()

Unnamed: 0,0,1,2
0,1.0,2.0,-1.0
1,2.0,4.0,-2.0
2,-1.0,-2.0,1.0


In [40]:
df.corr()

Unnamed: 0,0,1,2
0,1.0,1.0,-1.0
1,1.0,1.0,-1.0
2,-1.0,-1.0,1.0


In [41]:
df[0].corr(df[1])

1.0

Series와 데이터프레임은 결합이 가능하다는 장점이 있음. pd.concat을 이용하면 행 방향(기본값) 또는 열 방향(axis=1)으로 이어 붙일 수 있음.

In [42]:
# Three equally long Series (two numeric, one of strings) to concatenate.
s1 = Series(range(3))
s2 = Series(range(3, 6))
s3 = Series([f'S{number}' for number in (1, 2, 3)])
for series in (s1, s2, s3):
    print(series)

0    0
1    1
2    2
dtype: int64
0    3
1    4
2    5
dtype: int64
0    S1
1    S2
2    S3
dtype: object


In [43]:
pd.concat([s1, s2, s3])

0     0
1     1
2     2
0     3
1     4
2     5
0    S1
1    S2
2    S3
dtype: object

In [44]:
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,0,1,2
0,0,3,S1
1,1,4,S2
2,2,5,S3


In [45]:
pd.concat([s1, s2, s3], axis=1, keys=['c0', 'c1', 'c2'])

Unnamed: 0,c0,c1,c2
0,0,3,S1
1,1,4,S2
2,2,5,S3


In [46]:
# Each cell is the lower-cased column letter followed by its row number
# (rows 0-2, columns A-D).
df_1 = DataFrame(
    {col: [f'{col.lower()}{row}' for row in range(3)] for col in 'ABCD'},
    index=[0, 1, 2],
)
df_1

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2


In [47]:
# Same columns A-D, but for rows 3-5.
df_2 = DataFrame(
    {col: [f'{col.lower()}{row}' for row in range(3, 6)] for col in 'ABCD'},
    index=[3, 4, 5],
)
df_2

Unnamed: 0,A,B,C,D
3,a3,b3,c3,d3
4,a4,b4,c4,d4
5,a5,b5,c5,d5


In [48]:
# Rows 0-2 with a different set of columns, E-H.
df_3 = DataFrame(
    {col: [f'{col.lower()}{row}' for row in range(3)] for col in 'EFGH'},
    index=[0, 1, 2],
)
df_3

Unnamed: 0,E,F,G,H
0,e0,f0,g0,h0
1,e1,f1,g1,h1
2,e2,f2,g2,h2


In [50]:
pd.concat([df_1, df_2])

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3
4,a4,b4,c4,d4
5,a5,b5,c5,d5


In [51]:
pd.concat([df_1, df_3], axis=1)

Unnamed: 0,A,B,C,D,E,F,G,H
0,a0,b0,c0,d0,e0,f0,g0,h0
1,a1,b1,c1,d1,e1,f1,g1,h1
2,a2,b2,c2,d2,e2,f2,g2,h2


In [53]:
# Rows 0-2 with columns A, B, C and E (no D column).
df_4 = pd.DataFrame(
    {col: [f'{col.lower()}{row}' for row in range(3)] for col in 'ABCE'},
    index=[0, 1, 2],
)
df_4

Unnamed: 0,A,B,C,E
0,a0,b0,c0,e0
1,a1,b1,c1,e1
2,a2,b2,c2,e2


In [54]:
df_1

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2


In [55]:
pd.concat([df_1, df_4], join='outer', sort=True)

Unnamed: 0,A,B,C,D,E
0,a0,b0,c0,d0,
1,a1,b1,c1,d1,
2,a2,b2,c2,d2,
0,a0,b0,c0,,e0
1,a1,b1,c1,,e1
2,a2,b2,c2,,e2


In [56]:
pd.concat([df_1, df_4], join='outer', axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,E
0,a0,b0,c0,d0,a0,b0,c0,e0
1,a1,b1,c1,d1,a1,b1,c1,e1
2,a2,b2,c2,d2,a2,b2,c2,e2


In [59]:
pd.concat([df_1, df_4], join='inner')

Unnamed: 0,A,B,C
0,a0,b0,c0
1,a1,b1,c1
2,a2,b2,c2
0,a0,b0,c0
1,a1,b1,c1
2,a2,b2,c2


In [60]:
# Columns A-D for rows 0-2, with string row labels 'r0'..'r2'.
df_5 = pd.DataFrame(
    {col: [f'{col.lower()}{row}' for row in range(3)] for col in 'ABCD'},
    index=[f'r{row}' for row in range(3)],
)
df_5

Unnamed: 0,A,B,C,D
r0,a0,b0,c0,d0
r1,a1,b1,c1,d1
r2,a2,b2,c2,d2


In [61]:
# Columns A-D for rows 3-5, with string row labels 'r3'..'r5'.
df_6 = pd.DataFrame(
    {col: [f'{col.lower()}{row}' for row in range(3, 6)] for col in 'ABCD'},
    index=[f'r{row}' for row in range(3, 6)],
)
df_6

Unnamed: 0,A,B,C,D
r3,a3,b3,c3,d3
r4,a4,b4,c4,d4
r5,a5,b5,c5,d5


In [62]:
pd.concat([df_5, df_6])

Unnamed: 0,A,B,C,D
r0,a0,b0,c0,d0
r1,a1,b1,c1,d1
r2,a2,b2,c2,d2
r3,a3,b3,c3,d3
r4,a4,b4,c4,d4
r5,a5,b5,c5,d5


In [64]:
pd.concat([df_5, df_6], ignore_index=True)

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3
4,a4,b4,c4,d4
5,a5,b5,c5,d5


In [65]:
df_56 = pd.concat([df_5, df_6], keys=('df_5', 'df_6'))
df_56

Unnamed: 0,Unnamed: 1,A,B,C,D
df_5,r0,a0,b0,c0,d0
df_5,r1,a1,b1,c1,d1
df_5,r2,a2,b2,c2,d2
df_6,r3,a3,b3,c3,d3
df_6,r4,a4,b4,c4,d4
df_6,r5,a5,b5,c5,d5


In [66]:
df_56.loc['df_5']

Unnamed: 0,A,B,C,D
r0,a0,b0,c0,d0
r1,a1,b1,c1,d1
r2,a2,b2,c2,d2


In [67]:
# Take the 'df_5' group from the outer index level, then its first two rows
# by position.
df_56.loc['df_5'].iloc[:2]

Unnamed: 0,A,B,C,D
r0,a0,b0,c0,d0
r1,a1,b1,c1,d1


In [68]:
pd.concat([df_5, df_6], keys=['df_5', 'df_6'], names=['df_name', 'row_number'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
df_name,row_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df_5,r0,a0,b0,c0,d0
df_5,r1,a1,b1,c1,d1
df_5,r2,a2,b2,c2,d2
df_6,r3,a3,b3,c3,d3
df_6,r4,a4,b4,c4,d4
df_6,r5,a5,b5,c5,d5


In [69]:
df_1

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2


In [70]:
s1 = Series(['s0', 's1', 's2'], name='S')
s1

0    s0
1    s1
2    s2
Name: S, dtype: object

In [71]:
pd.concat([df_1, s1], axis=1)

Unnamed: 0,A,B,C,D,S
0,a0,b0,c0,d0,s0
1,a1,b1,c1,d1,s1
2,a2,b2,c2,d2,s2


In [72]:
pd.concat([df_1, s1], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4
0,a0,b0,c0,d0,s0
1,a1,b1,c1,d1,s1
2,a2,b2,c2,d2,s2


merge 함수를 이용한 결합도 가능.

In [4]:
# Left table: keys k0-k3 with value columns A and B.
df_left = DataFrame({
    'Key': [f'k{n}' for n in range(4)],
    'A': [f'a{n}' for n in range(4)],
    'B': [f'b{n}' for n in range(4)],
})
df_left

Unnamed: 0,Key,A,B
0,k0,a0,b0
1,k1,a1,b1
2,k2,a2,b2
3,k3,a3,b3


In [5]:
# Right table: keys k2-k5 with value columns C and D.
df_right = DataFrame({
    'Key': [f'k{n}' for n in range(2, 6)],
    'C': [f'c{n}' for n in range(2, 6)],
    'D': [f'd{n}' for n in range(2, 6)],
})
df_right

Unnamed: 0,Key,C,D
0,k2,c2,d2
1,k3,c3,d3
2,k4,c4,d4
3,k5,c5,d5


In [6]:
pd.merge(df_left, df_right)

Unnamed: 0,Key,A,B,C,D
0,k2,a2,b2,c2,d2
1,k3,a3,b3,c3,d3


In [7]:
pd.merge(df_left, df_right, on='Key')

Unnamed: 0,Key,A,B,C,D
0,k2,a2,b2,c2,d2
1,k3,a3,b3,c3,d3


In [8]:
# Same values as the right table, but the key column is named 'Id'.
df_new = DataFrame({
    'Id': [f'k{n}' for n in range(2, 6)],
    'C': [f'c{n}' for n in range(2, 6)],
    'D': [f'd{n}' for n in range(2, 6)],
})
df_new

Unnamed: 0,Id,C,D
0,k2,c2,d2
1,k3,c3,d3
2,k4,c4,d4
3,k5,c5,d5


In [9]:
pd.merge(df_left, df_new, left_on='Key', right_on = 'Id')

Unnamed: 0,Key,A,B,Id,C,D
0,k2,a2,b2,k2,c2,d2
1,k3,a3,b3,k3,c3,d3


In [10]:
pd.merge(df_left, df_new, how='outer', left_on='Key', right_on='Id')

Unnamed: 0,Key,A,B,Id,C,D
0,k0,a0,b0,,,
1,k1,a1,b1,,,
2,k2,a2,b2,k2,c2,d2
3,k3,a3,b3,k3,c3,d3
4,,,,k4,c4,d4
5,,,,k5,c5,d5


In [11]:
pd.merge(df_left, df_right, how='outer', on='Key')

Unnamed: 0,Key,A,B,C,D
0,k0,a0,b0,,
1,k1,a1,b1,,
2,k2,a2,b2,c2,d2
3,k3,a3,b3,c3,d3
4,k4,,,c4,d4
5,k5,,,c5,d5


In [12]:
pd.merge(df_left, df_right, how='left', on='Key')

Unnamed: 0,Key,A,B,C,D
0,k0,a0,b0,,
1,k1,a1,b1,,
2,k2,a2,b2,c2,d2
3,k3,a3,b3,c3,d3


In [13]:
pd.merge(df_left, df_right, how='right', on='Key')

Unnamed: 0,Key,A,B,C,D
0,k2,a2,b2,c2,d2
1,k3,a3,b3,c3,d3
2,k4,,,c4,d4
3,k5,,,c5,d5


In [14]:
# df5 has a repeating 'Key' column; df6 holds one value per key in its index.
df5 = DataFrame({'Key': list('abaabc'), 'Value': range(6)})
df6 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
print(df5)
print(df6)

  Key  Value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0


In [15]:
pd.merge(df5, df6, left_on='Key', right_index=True)

Unnamed: 0,Key,Value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


join 메서드를 이용해서 인덱스를 기준으로 결합할 수도 있음. 기본값은 호출한 쪽의 인덱스를 유지하는 left join.

In [17]:
# Two frames with different columns and partly overlapping row labels.
df7 = DataFrame(
    {'Ohio': [1, 3, 5], 'Seattle': [2, 4, 6]},
    index=list('ace'),
)
df8 = DataFrame(
    {'Portland': [7, 9, 11, 13], 'Texas': [8, 10, 12, 14]},
    index=list('bcde'),
)
print(df7)
print(df8)

   Ohio  Seattle
a     1        2
c     3        4
e     5        6
   Portland  Texas
b         7      8
c         9     10
d        11     12
e        13     14


In [18]:
# Left merge that matches rows on the index of both frames.
df7.merge(df8, how='left', left_index=True, right_index=True)

Unnamed: 0,Ohio,Seattle,Portland,Texas
a,1,2,,
c,3,4,9.0,10.0
e,5,6,13.0,14.0


In [19]:
df7.join(df8)

Unnamed: 0,Ohio,Seattle,Portland,Texas
a,1,2,,
c,3,4,9.0,10.0
e,5,6,13.0,14.0


In [20]:
df8.join(df7)

Unnamed: 0,Portland,Texas,Ohio,Seattle
b,7,8,,
c,9,10,3.0,4.0
d,11,12,,
e,13,14,5.0,6.0


In [21]:
df7.join(df8, how='outer')

Unnamed: 0,Ohio,Seattle,Portland,Texas
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


Pandas의 가장 큰 장점은 결측치(missing value)를 쉽게 처리할 수 있다는 점.

In [22]:
data = Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [23]:
data.isnull()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [24]:
data.notnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [25]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [26]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [29]:
# Replace missing values with the average of the non-missing ones.
data.fillna(np.average(data.dropna()))

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [30]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [32]:
# 6x3 frame of draws from np.random.randn, with default integer row and
# column labels.
# NOTE(review): no seed is set in this cell, so the values shown below are
# presumably not reproducible unless the global NumPy RNG was seeded earlier
# in the same session - confirm before relying on these exact numbers.
df = DataFrame(np.random.randn(6,3))
df

Unnamed: 0,0,1,2
0,0.367011,-1.919673,0.721472
1,0.686194,1.029872,1.923681
2,-0.045077,-0.082327,0.93662
3,1.511959,-0.52192,-0.431133
4,-0.91202,-1.595922,0.833053
5,0.310227,1.371582,-1.786521


In [33]:
# Blank out, by position: column 1 from row 2 down, column 0 from row 4
# down, and the single cell at row 5 / column 2.
df.iloc[2:, 1] = np.nan
df.iloc[4:, 0] = np.nan
df.iloc[5, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.367011,-1.919673,0.721472
1,0.686194,1.029872,1.923681
2,-0.045077,,0.93662
3,1.511959,,-0.431133
4,,,0.833053
5,,,


In [34]:
df.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,False
2,False,True,False
3,False,True,False
4,True,True,False
5,True,True,True


In [36]:
df.dropna()

Unnamed: 0,0,1,2
0,0.367011,-1.919673,0.721472
1,0.686194,1.029872,1.923681


In [37]:
df.dropna(how = 'all')

Unnamed: 0,0,1,2
0,0.367011,-1.919673,0.721472
1,0.686194,1.029872,1.923681
2,-0.045077,,0.93662
3,1.511959,,-0.431133
4,,,0.833053


In [45]:
# Fill each column's missing values with that column's own mean, passing the
# replacements as a {column: value} mapping.
df.fillna({col: df[col].mean() for col in df.columns})

Unnamed: 0,0,1,2
0,0.367011,-1.919673,0.721472
1,0.686194,1.029872,1.923681
2,-0.045077,-0.4449,0.93662
3,1.511959,-0.4449,-0.431133
4,0.630022,-0.4449,0.833053
5,0.630022,-0.4449,0.796738


In [46]:
df.mean(axis=0)

0    0.630022
1   -0.444900
2    0.796738
dtype: float64

In [47]:
df.fillna(df.mean(axis=0))

Unnamed: 0,0,1,2
0,0.367011,-1.919673,0.721472
1,0.686194,1.029872,1.923681
2,-0.045077,-0.4449,0.93662
3,1.511959,-0.4449,-0.431133
4,0.630022,-0.4449,0.833053
5,0.630022,-0.4449,0.796738


이번에는 파일을 불러오고 내보내는 방법에 대해 알아볼 것.

In [49]:
# Five complete records, then reindex with an extra label 'f' so the frame
# gains one all-missing row.
data = dict(
    ID=['a1', 'a2', 'a3', 'a4', 'a5'],
    X1=[1, 2, 3, 4, 5],
    X2=[3.0, 4.5, 3.2, 4.0, 3.5],
)
data_df = DataFrame(data, index=list('abcde'))
data_df_2 = data_df.reindex(list('abcdef'))
print(data_df_2)

    ID   X1   X2
a   a1  1.0  3.0
b   a2  2.0  4.5
c   a3  3.0  3.2
d   a4  4.0  4.0
e   a5  5.0  3.5
f  NaN  NaN  NaN


In [50]:
# Write the frame, including its index, as comma-separated text; missing
# values are written as the literal string 'NaN'.
# NOTE(review): the target is an absolute, machine-specific path that the
# later read_csv cells also use - consider a configurable data directory.
data_df_2.to_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2.csv",
                sep=',', na_rep='NaN')

In [51]:
data_df_2.to_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2_noind.csv",
                sep=',', index=False)

In [52]:
data_df_2.to_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2_noind.txt",
                sep=' ', index=False)

In [53]:
data_df_2.to_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_3.txt",
                sep=' ', index=False, header=False)

In [54]:
x_save2 = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2_noind.csv")
x_save2

Unnamed: 0,ID,X1,X2
0,a1,1.0,3.0
1,a2,2.0,4.5
2,a3,3.0,3.2
3,a4,4.0,4.0
4,a5,5.0,3.5
5,,,


In [55]:
x_save3 = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2_noind.txt", sep=' ')
x_save3

Unnamed: 0,ID,X1,X2
0,a1,1.0,3.0
1,a2,2.0,4.5
2,a3,3.0,3.2
3,a4,4.0,4.0
4,a5,5.0,3.5
5,,,


In [56]:
x_save1 = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2.csv")
x_save1

Unnamed: 0.1,Unnamed: 0,ID,X1,X2
0,a,a1,1.0,3.0
1,b,a2,2.0,4.5
2,c,a3,3.0,3.2
3,d,a4,4.0,4.0
4,e,a5,5.0,3.5
5,f,,,


In [57]:
x_save4 = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2.csv", index_col=0)
x_save4

Unnamed: 0,ID,X1,X2
a,a1,1.0,3.0
b,a2,2.0,4.5
c,a3,3.0,3.2
d,a4,4.0,4.0
e,a5,5.0,3.5
f,,,


In [59]:
x_save5 = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2_noind.txt",
                     sep=' ', index_col='ID')
x_save5

Unnamed: 0_level_0,X1,X2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
a1,1.0,3.0
a2,2.0,4.5
a3,3.0,3.2
a4,4.0,4.0
a5,5.0,3.5
,,


In [60]:
pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_3.txt", sep=' ')

Unnamed: 0,a1,1.0,3.0
0,a2,2.0,4.5
1,a3,3.0,3.2
2,a4,4.0,4.0
3,a5,5.0,3.5
4,,,


In [61]:
pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_3.txt", sep=' ', header=None)

Unnamed: 0,0,1,2
0,a1,1.0,3.0
1,a2,2.0,4.5
2,a3,3.0,3.2
3,a4,4.0,4.0
4,a5,5.0,3.5
5,,,


In [62]:
pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_3.txt", 
            sep=' ', names=['ID', 'A', 'B'], header=None)

Unnamed: 0,ID,A,B
0,a1,1.0,3.0
1,a2,2.0,4.5
2,a3,3.0,3.2
3,a4,4.0,4.0
4,a5,5.0,3.5
5,,,


In [63]:
pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_3.txt",
           sep=' ', names=['ID', 'A', 'B'], header=None, skiprows=[1,2])

Unnamed: 0,ID,A,B
0,a1,1.0,3.0
1,a4,4.0,4.0
2,a5,5.0,3.5
3,,,


In [64]:
pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_3.txt",
           sep=' ', names=['ID', 'A', 'B'], header=None, nrows=3)

Unnamed: 0,ID,A,B
0,a1,1.0,3.0
1,a2,2.0,4.5
2,a3,3.0,3.2


In [65]:
f = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2.csv",
               na_values=['?', '??', 'N/A', 'nan', 'NaN'], index_col=0)

In [66]:
f

Unnamed: 0,ID,X1,X2
a,a1,1.0,3.0
b,a2,2.0,4.5
c,a3,3.0,3.2
d,a4,4.0,4.0
e,a5,5.0,3.5
f,,,


In [67]:
f.isnull()

Unnamed: 0,ID,X1,X2
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False
e,False,False,False
f,True,True,True


In [68]:
df1 = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2.csv",
                 index_col = 0, nrows=5)
df1

Unnamed: 0,ID,X1,X2
a,a1,1.0,3.0
b,a2,2.0,4.5
c,a3,3.0,3.2
d,a4,4.0,4.0
e,a5,5.0,3.5


In [69]:
df1.ID.values

array(['a1', 'a2', 'a3', 'a4', 'a5'], dtype=object)

In [70]:
df1.X1.values.dtype

dtype('float64')

In [71]:
df1.X2.values.dtype

dtype('float64')

In [72]:
# Re-read the first five rows with explicit column dtypes, using the first
# column as the index.
# NOTE: `nrows=5` stops before the sixth row (label 'f'), which is entirely
# missing in the earlier full read of this file; pandas cannot parse a column
# containing missing values as plain `int`, so the 'X1': int request relies
# on that row being excluded.
df2 = pd.read_csv("/Users/user/Desktop/Yonsei/Junior/R & Python Programming/data_df_2.csv",
                 index_col = 0, nrows=5,
                 dtype = {'ID' : str, 'X1': int, 'X2': float})
df2

Unnamed: 0,ID,X1,X2
a,a1,1,3.0
b,a2,2,4.5
c,a3,3,3.2
d,a4,4,4.0
e,a5,5,3.5


In [73]:
df2.X1.values.dtype

dtype('int64')

In [74]:
df2.X2.values.dtype

dtype('float64')

In [75]:
x_save5

Unnamed: 0_level_0,X1,X2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
a1,1.0,3.0
a2,2.0,4.5
a3,3.0,3.2
a4,4.0,4.0
a5,5.0,3.5
,,


In [76]:
x_save5.head()

Unnamed: 0_level_0,X1,X2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
a1,1.0,3.0
a2,2.0,4.5
a3,3.0,3.2
a4,4.0,4.0
a5,5.0,3.5


In [77]:
x_save5.tail()

Unnamed: 0_level_0,X1,X2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
a2,2.0,4.5
a3,3.0,3.2
a4,4.0,4.0
a5,5.0,3.5
,,
