#### DataFrame 생성하기
 - 일반적으로 분석을 위한 데이터는 다른 데이터 소스(database, 외부 파일)를 통해 DataFrame을 생성
 - 여기서는 실습을 통해, dummy 데이터를 생성하는 방법을 다룰 예정

In [1]:
import numpy as np
import pandas as pd

#### dictionary로부터 생성하기
 - dict의 key -> column

In [2]:
# Dict of scalars: every key becomes a column and each scalar is repeated
# for all row labels passed through `index`.
data = dict(a=100, b=200, c=300)
pd.DataFrame(data, index=list('xyz'))

Unnamed: 0,a,b,c
x,100,200,300
y,100,200,300
z,100,200,300


In [3]:
# Dict of equal-length lists: keys become columns, list items become rows.
data = {
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [10, 11, 12],
}
pd.DataFrame(data, index=list(range(3)))

Unnamed: 0,a,b,c
0,1,4,10
1,2,5,11
2,3,6,12


#### Series로부터 생성하기
 - 각 Series의 인덱스 -> column

In [4]:
# Each Series becomes one row; the union of the Series indexes becomes the
# columns, so labels a row does not have show up as NaN.
a = pd.Series([100, 200, 300], index=['a', 'b', 'c'])
b = pd.Series([101, 201, 301], index=['d', 'e', 'f'])
c = pd.Series([102, 202, 302], index=['g', 'h', 'i'])
row_labels = [100, 101, 102]
pd.DataFrame([a, b, c], index=row_labels)

Unnamed: 0,a,b,c,d,e,f,g,h,i
100,100.0,200.0,300.0,,,,,,
101,,,,101.0,201.0,301.0,,,
102,,,,,,,102.0,202.0,302.0


In [5]:
# The dict keys 'state', 'year' and 'pop' become the column names.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


### column의 순서를 원하는 대로 지정 가능
#### e.g.) [state -> year -> pop] -> [year -> state -> pop]

In [6]:
# Rebuild the frame with an explicit column order.
# Note: this rebinds `data` from the source dict to the resulting DataFrame.
data = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop'],
)
data

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [7]:
# Build the frame with an explicit column order and string row labels
# instead of the default integer index.
data1 = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame1 = pd.DataFrame(
    data1,
    columns=['year', 'state', 'pop'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame1

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [8]:
# Fetch a single column as a Series (same result as attribute access).
frame1['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [9]:
# loc selects a row by its index label.
frame1.loc['one', :]

year     2000
state    Ohio
pop       1.5
Name: one, dtype: object

In [10]:
# iloc selects a row by integer position; a single position gives one row.
frame1.iloc[0, :]

year     2000
state    Ohio
pop       1.5
Name: one, dtype: object

In [11]:
frame1['year'] = 1000 # same effect as frame1.year = 1000; creating a NEW column only works with the frame1['name'] form
frame1

Unnamed: 0,year,state,pop
one,1000,Ohio,1.5
two,1000,Ohio,1.7
three,1000,Ohio,3.6
four,1000,Nevada,2.4
five,1000,Nevada,2.9
six,1000,Nevada,3.2


In [12]:
# Add a boolean column that is True where the state is 'Ohio'.
is_ohio = frame1['state'].eq('Ohio')
frame1['eastern'] = is_ohio
frame1

Unnamed: 0,year,state,pop,eastern
one,1000,Ohio,1.5,True
two,1000,Ohio,1.7,True
three,1000,Ohio,3.6,True
four,1000,Nevada,2.4,False
five,1000,Nevada,2.9,False
six,1000,Nevada,3.2,False


In [13]:
# Add a new column filled with missing values.
# Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0 and raises
# AttributeError there.
frame1['dept'] = np.nan
frame1

Unnamed: 0,year,state,pop,eastern,dept
one,1000,Ohio,1.5,True,
two,1000,Ohio,1.7,True,
three,1000,Ohio,3.6,True,
four,1000,Nevada,2.4,False,
five,1000,Nevada,2.9,False,
six,1000,Nevada,3.2,False,


In [15]:
# Assigning a Series aligns on the index: rows whose label is absent from
# `val` ('one', 'three', 'six') are filled with NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame1['dept'] = val
# Also create the 'debt' column. The recorded output shows both 'dept' and
# 'debt', and the following cell deletes 'debt'; without this assignment a
# clean top-to-bottom run would fail there with KeyError.
frame1['debt'] = val
frame1

Unnamed: 0,year,state,pop,eastern,dept,debt
one,1000,Ohio,1.5,True,,
two,1000,Ohio,1.7,True,-1.2,-1.2
three,1000,Ohio,3.6,True,,
four,1000,Nevada,2.4,False,-1.5,-1.5
five,1000,Nevada,2.9,False,-1.7,-1.7
six,1000,Nevada,3.2,False,,


In [16]:
# Remove the 'debt' column from the frame in place.
frame1.drop(columns='debt', inplace=True)
frame1

Unnamed: 0,year,state,pop,eastern,dept
one,1000,Ohio,1.5,True,
two,1000,Ohio,1.7,True,-1.2
three,1000,Ohio,3.6,True,
four,1000,Nevada,2.4,False,-1.5
five,1000,Nevada,2.9,False,-1.7
six,1000,Nevada,3.2,False,


In [24]:
# Nested dict: outer keys become columns, inner keys become the row index.
# A year missing from one state's inner dict becomes NaN in that column.
nevada_pop = {2001: 2.4, 2002: 2.9}
ohio_pop = {2000: 1.5, 2001: 1.7, 2002: 3.6}
pop = {'Nevada': nevada_pop, 'Ohio': ohio_pop}
frame2 = pd.DataFrame(pop)
frame2

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [25]:
# Same nested dict, but with the row order fixed through `index`.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
years = [2000, 2001, 2002]
frame2 = pd.DataFrame(pop, index=years)
frame2

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [26]:
# The requested row labels ('Nevada', 'Ohio') do not match any inner-dict
# key (the years), so every cell of the result is NaN.
frame3 = pd.DataFrame(
    data=pop,
    index=['Nevada', 'Ohio'],
)
frame3

Unnamed: 0,Nevada,Ohio
Nevada,,
Ohio,,


In [27]:
# Transpose: swap rows and columns.
frame2.transpose()

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [28]:
# Transpose again; `frame2` itself is not modified by transposing.
frame2.transpose()

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [33]:
# Dict of Series: each Series becomes a column and rows are aligned on the
# Series index. Both slices are positional.
ohio_part = frame2['Ohio'].iloc[:-1]      # everything except the last row
nevada_part = frame2['Nevada'].iloc[:2]   # first two rows
pframe2 = {'Ohio': ohio_part, 'Nevada': nevada_part}
pd.DataFrame(pframe2)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [70]:
# Label-based scalar lookup (row 2000, column 'Nevada');
# the same cell as frame2.iloc[0, 0].
frame2.at[2000, 'Nevada']

nan

In [85]:
# Select the row labelled 0 (default integer index) as a Series.
# `frame.loc[]` with empty brackets is a SyntaxError; the recorded output
# (Name: 0) shows that row 0 was intended.
frame.loc[0]

state    Ohio
year     2000
pop       1.5
Name: 0, dtype: object

In [86]:
# Display `frame` again before slicing it below.
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [89]:
# Take the first row, then keep only the column at position 1 ('year').
# The result is a one-element Series indexed by column name.
first_row = frame.iloc[0]
frame_series = first_row.iloc[1:2]
frame_series

year    2000
Name: 0, dtype: object

In [90]:
# DataFrame - Series aligns the Series index with the frame's columns.
# Columns absent from `frame_series` ('state', 'pop') come out as NaN.
frame.sub(frame_series, axis='columns')

Unnamed: 0,pop,state,year
0,,,0
1,,,1
2,,,2
3,,,1
4,,,2
5,,,3
