In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

**Pandas version 0.25.1 (`pip install pandas==0.25.1`)**

# `Series` Data type

- NumPy의 ndarray에 index label을 붙인 자료형 — index에는 숫자가 아닌 다른 type(예: 문자열)도 사용할 수 있음

In [3]:
import pandas as pd

In [4]:
# Check the installed pandas version (this notebook targets 0.25.1).
pd.__version__

'0.25.1'

In [6]:
# Build a Series from a plain list; with no index given, the labels default to 0, 1, 2, 3.
a = pd.Series(data=[1, 2, 3, 4])
a

0    1
1    2
2    3
3    4
dtype: int64

In [7]:
# Method 1: pass the values and the index labels as two separate sequences.
s2 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s2

a    1
b    2
c    3
d    4
dtype: int64

In [8]:
# head(n) shows the first n rows; here the first two.
s2.head(2) 

a    1
b    2
dtype: int64

In [9]:
# Method 2: build the Series from a dict -- keys become the index labels, values the data.
s2 = pd.Series(dict(a=1, b=2, c=3, d=4, e=5))
s2.head()

a    1
b    2
c    3
d    4
e    5
dtype: int64


- 한가지 data type만 가지고 있을 수 있음 

## `nan`과 관련된 함수

In [8]:
import numpy as np

In [9]:
# NumPy's NaN value; it is used below as the missing entry of a Series.
np.nan

nan

In [10]:
# Nine numbers plus one NaN; the resulting Series is float64 (see the dtype in the output).
s = pd.Series(
    data=[10, 0, 1, 1, 2, 3, 4, 5, 6, np.nan],
)
s

0    10.0
1     0.0
2     1.0
3     1.0
4     2.0
5     3.0
6     4.0
7     5.0
8     6.0
9     NaN
dtype: float64

In [11]:
len(s)       # total number of elements, NaN included
s.shape      # the same length, as a 1-tuple
s.count()    # number of non-NaN elements only

10

(10,)

9

In [12]:
# unique() returns the distinct values; NaN is kept as one of them.
s.unique()

# Not covered in the lecture: nunique() returns the number of distinct values.
# s.nunique()

array([10.,  0.,  1.,  2.,  3.,  4.,  5.,  6., nan])

In [13]:
# Occurrences of each distinct value, most frequent first; NaN does not appear in the result.
s.value_counts()

1.0     2
6.0     1
5.0     1
4.0     1
3.0     1
2.0     1
0.0     1
10.0    1
dtype: int64

- 이 외의 함수들에 대해서는 이후 수업에서 하나씩 다룰 예정!

## index label을 기준으로 Series간에 operation이 일어남

- Data의 '순서'가 아니라 index label을 기준으로 값이 자동으로 짝지어진(aligned) 뒤 연산이 진행됨!

In [14]:
# Same four labels in both Series, but s4 lists them in the opposite order ('d' down to 'a').
s3 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s4 = pd.Series(data=[4, 3, 2, 1], index=list('dcba'))

In [15]:
# Values are paired by index label, not by position, before being added.
s3 + s4

a    2
b    4
c    6
d    8
dtype: int64

# `DataFrame` Data type

- 다수의 Series를 하나의 변수로 관리할 수 있도록 만든 자료형
    - Series의 dict 형태라고 보면됨
        - `{'컬럼명1': Series1, '컬럼명2': Series2}`
        - 각 Series는 DataFrame의 column을 이룸
        - 당연히 DataFrame을 이루는 Series간의 index는 서로 다 같음! => 동일 index 사용

## DataFrame을 만드는 다양한 방법들

In [16]:
# Two equal-length integer arrays: 1..5 and 6..10 (the step defaults to 1).
s1 = np.arange(1, 6)
s2 = np.arange(6, 11)
s1
s2

array([1, 2, 3, 4, 5])

array([ 6,  7,  8,  9, 10])

In [17]:
# Dict keys become the column names; each array supplies one column's values.
df = pd.DataFrame({'c1': s1, 'c2': s2})
df

Unnamed: 0,c1,c2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [18]:
# Method 1: pass 2-D data only (default integer index and column labels are assigned)
pd.DataFrame(
    [
        [10,11],
        [10,12]
    ]
)
pd.DataFrame(
    np.array(
        [
            [10, 11],
            [20, 21]
        ]
    )
) 

Unnamed: 0,0,1
0,10,11
1,10,12


Unnamed: 0,0,1
0,10,11
1,20,21


In [19]:
# Method 2 (rarely used): a list of row-like objects, one per row
pd.DataFrame(
    [
        pd.Series(np.arange(10, 15)),   # a Series is not required; any list-like (iterable) object works
        pd.Series(np.arange(15, 20)),   # a Series is not required; any list-like (iterable) object works
    ]
)

pd.DataFrame(
    [
        np.arange(10, 15),
        np.arange(15, 20),
    ]
)

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [20]:
# Method 3: 2-D data plus explicit column names and index (row) labels
pd.DataFrame(
    np.array(
        [
            [10, 11],
            [20, 21]
        ]
    ), 
    columns=['a', 'b'],
    index=['r1', 'r2']
)

    

Unnamed: 0,a,b
r1,10,11
r2,20,21


In [21]:
# Method 4: a dict that maps each column name to that column's values.
# The values do not have to be Series: any list-like (list, np.array, Series, ...) works.
# (The s1/s2 Series that used to be built here were never passed to the DataFrame,
# so those dead assignments have been removed.)
pd.DataFrame(
    {
        'c1': [1, 2, 3],
        'c2': [4, 5, 6]
    }
)

Unnamed: 0,c1,c2
0,1,4
1,2,5
2,3,6


In [22]:
# Note: even for a one-row frame, each dict value must be an iterable type (e.g. list, np.array, Series)
pd.DataFrame({'c1': [0], 'c2': [1]})

Unnamed: 0,c1,c2
0,0,1


In [23]:
# Two Series whose labels only partly overlap: 'b', 'c', 'd' are shared.
s1 = pd.Series(np.arange(1, 6), index=list('abcde'))
s2 = pd.Series(np.arange(6, 11), index=list('bcdfg'))
# Each Series becomes one column of the frame.
df = pd.DataFrame({'c1': s1, 'c2': s2})

## DataFrame 생성시, Series간에 Index 기준으로 자동정렬!

In [24]:
# s1 and s2 use the default index 0..4 ...
s1 = pd.Series(np.arange(1, 6))
s2 = pd.Series(np.arange(6, 11))
# ... whereas s3 carries explicit labels, one of which (10) appears in neither s1 nor s2.
s3 = pd.Series(np.arange(12, 15), index=[1, 2, 10])
s1
s2
s3

0    1
1    2
2    3
3    4
4    5
dtype: int32

0     6
1     7
2     8
3     9
4    10
dtype: int32

1     12
2     13
10    14
dtype: int32

In [25]:
# Rows are matched on index label across the three Series; a label missing from a Series shows up as NaN.
df = pd.DataFrame(
    {
        'c1': s1,
        'c2': s2,
        'c3': s3,
    }
)
df

Unnamed: 0,c1,c2,c3
0,1.0,6.0,
1,2.0,7.0,12.0
2,3.0,8.0,13.0
3,4.0,9.0,
4,5.0,10.0,
10,,,14.0


## DataFrame에 새로운 column 추가하기

In [26]:
# my_dict['a'] = 1

NameError: name 'my_dict' is not defined

In [27]:
# The new column is matched to df's existing index labels: rows 3 and 4 have no entry in this Series and get NaN.
df['c4'] = pd.Series([1,2,3,4], index=[0, 1, 2, 10])

In [28]:
df

Unnamed: 0,c1,c2,c3,c4
0,1.0,6.0,,1.0
1,2.0,7.0,12.0,2.0
2,3.0,8.0,13.0,3.0
3,4.0,9.0,,
4,5.0,10.0,,
10,,,14.0,4.0


## Reindexing

- 새로운 index label을 기반으로 기존의 "index-value" mapping은 유지한채 재배열하는 것


### 참고: index 자체를 바꾸는 것("index-value" mapping이 깨짐)

In [29]:
# A five-element Series (values 1..5) with the default integer index.
s = pd.Series(list(range(1, 6)))
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [30]:
# Assigning to .index replaces the labels in place: the values stay where they are and simply get new labels.
s.index = ['a', 'b', 'c', 'd', 'e']
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

### 참고: `set_index()` — 특정 column을 index로 만듦

In [31]:
# This is the DataFrame defined above, in the section on automatic index alignment when creating a DataFrame.
df

Unnamed: 0,c1,c2,c3,c4
0,1.0,6.0,,1.0
1,2.0,7.0,12.0,2.0
2,3.0,8.0,13.0,3.0
3,4.0,9.0,,
4,5.0,10.0,,
10,,,14.0,4.0


In [32]:
# This Series has an entry for every row label of df, so the new column contains no NaN.
df['c5'] = pd.Series([1,2,3,4,5,6], index=[0,1,2,3,4,10])
df

Unnamed: 0,c1,c2,c3,c4,c5
0,1.0,6.0,,1.0,1
1,2.0,7.0,12.0,2.0,2
2,3.0,8.0,13.0,3.0,3
3,4.0,9.0,,,4
4,5.0,10.0,,,5
10,,,14.0,4.0,6


In [34]:
# Use column c5 as the index. The result is only displayed, not assigned back to df.
df.set_index("c5")

Unnamed: 0_level_0,c1,c2,c3,c4
c5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,6.0,,1.0
2,2.0,7.0,12.0,2.0
3,3.0,8.0,13.0,3.0
4,4.0,9.0,,
5,5.0,10.0,,
6,,,14.0,4.0


### Reindex

In [35]:
# Pick rows by label in the requested order; 'g' does not exist in s, so it comes back as NaN
# (which is why the result is float64).
s2 = s.reindex(['a', 'c', 'e', 'g'])
s2

a    1.0
c    3.0
e    5.0
g    NaN
dtype: float64

In [36]:
# s2 is a copy produced by reindex, so writing into it does not change s (checked in the next cell).
s2['a'] = 0
s2

a    0.0
c    3.0
e    5.0
g    NaN
dtype: float64

In [37]:
# s is left untouched even though a value in s2 was changed.
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [38]:
# [X] Do not do this: s1 is labelled with integers, s2 with the strings '0', '1', '2'.
s1 = pd.Series(data=[0, 1, 2], index=[0, 1, 2])
s2 = pd.Series(data=[3, 4, 5], index=list('012'))
s1
s2

0    0
1    1
2    2
dtype: int64

0    3
1    4
2    5
dtype: int64

In [39]:
# No label of s1 equals a label of s2 (0 is not '0'), so every entry of the result is NaN.
s1 + s2

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [40]:
# s1's labels are integers.
s1.index

Int64Index([0, 1, 2], dtype='int64')

In [41]:
# Reindexing with s1's integer labels does not help: none of them exists in s2's string index, so every value becomes NaN.
s2 = s2.reindex(s1.index)
s2

0   NaN
1   NaN
2   NaN
dtype: float64

In [42]:
# Fix 1: start again from the two mismatched Series (integer labels vs. string labels).
s1 = pd.Series(data=[0, 1, 2], index=[0, 1, 2])
s2 = pd.Series(data=[3, 4, 5], index=list('012'))

In [43]:
# Convert s2's string labels to integers so that they can match s1's labels.
s2.index = s2.index.astype(int)

In [44]:
s2

0    3
1    4
2    5
dtype: int64

In [45]:
# The labels of s2 are now integers as well.
s2.index

Int64Index([0, 1, 2], dtype='int64')

In [46]:
# With matching integer labels, the values are added label by label.
s1 + s2

0    3
1    5
2    7
dtype: int64

In [47]:
# Fix 2: start again from the two mismatched Series (integer labels vs. string labels).
s1 = pd.Series(data=[0, 1, 2], index=[0, 1, 2])
s2 = pd.Series(data=[3, 4, 5], index=list('012'))

In [48]:
# Overwrite both indexes with the same new labels so that the two Series line up.
s1.index = ['a', 'b', 'c']
s2.index = ['a', 'b', 'c']

In [49]:
# Both Series now share the labels 'a', 'b', 'c', so the values are added label by label.
s1 + s2

a    3
b    5
c    7
dtype: int64

#### `reindex()`의 유용한 Arguments

- `fill_value`

In [50]:
# Work on a copy of s for the reindex examples below.
s2 = s.copy()
s2

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [51]:
# 'f' is not a label of s2, so it comes back as NaN and the result is float64.
s2.reindex(['a', 'f'])

a    1.0
f    NaN
dtype: float64

In [53]:
s2.reindex(['a', 'f'], fill_value=0)  # fill missing labels with 0 instead of NaN

a    1
f    0
dtype: int64

- `method`

In [54]:
# Three colours stored at the non-consecutive integer labels 0, 3 and 5.
s3 = pd.Series({0: 'red', 3: 'green', 5: 'blue'})
s3

0      red
3    green
5     blue
dtype: object

In [55]:
# Reindex onto the labels 0..6; labels that s3 does not have (1, 2, 4, 6) become NaN.
s3.reindex(np.arange(0,7))

0      red
1      NaN
2      NaN
3    green
4      NaN
5     blue
6      NaN
dtype: object

In [56]:
# method='ffill' forward-fills: each new label takes the value of the closest preceding label that exists in s3.
s3.reindex(np.arange(0,7), method='ffill')

0      red
1      red
2      red
3    green
4    green
5     blue
6     blue
dtype: object

#### 예제

In [68]:
# 맨 첫 강의에서 라이브러리를 설치할 때 requirements.txt를 이용해서 설치를 했으면, 건너뛰셔도 됩니다. 
!pip install finance_datareader



In [69]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
Collecting soupsieve>1.2; python_version >= "3.0"
  Downloading soupsieve-2.1-py3-none-any.whl (32 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1277 sha256=00d369b9a6dbaf8fc9567dd513e67bb0d6d0290a7f12f73c6df2be728d31ce78
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\19\f5\6d\a97dd4f22376d4472d5f4c76c7646876052ff3166b3cf71050
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.3 bs4-0.0.1 soupsieve-2.1


In [70]:
import FinanceDataReader as fdr

In [76]:
# Samsung Electronics (code 005930)
df1 = fdr.DataReader("005930", '2021-01-02', '2021-01-06')

# KODEX 200 (ETF, code 069500)
df2 = fdr.DataReader("069500", '2021-01-02', '2021-01-06')

In [77]:
# First two and last two rows of df1.
df1.head(2)
df1.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-04,81000,84400,80200,83000,38655276,0.024691
2021-01-05,81600,83900,81600,83900,35335669,0.010843


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-05,81600,83900,81600,83900,35335669,0.010843
2021-01-06,83300,84500,82100,82200,41182187,-0.020262


In [78]:
# First two and last two rows of df2.
df2.head(2)
df2.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-04,39585,40615,39405,40540,12185235,0.026329
2021-01-05,40325,41150,40200,41150,9126676,0.015047


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-05,40325,41150,40200,41150,9126676,0.015047
2021-01-06,41270,41680,40595,40750,12958340,-0.009721


In [79]:
# Samsung Electronics (code 005930), now over a longer date range
df1 = fdr.DataReader("005930", '2018-01-02', '2018-10-30')

# KODEX 200 (ETF, code 069500), same date range
df2 = fdr.DataReader("069500", '2018-01-02', '2018-10-30')

In [80]:
# Compare the (rows, columns) shape of the two frames.
df1.shape
df2.shape

(202, 6)

(202, 6)

In [81]:
# Remove the 2018-01-03 row from df2 so that df2 and df1 no longer cover the same dates.
# errors="ignore" keeps this cell re-runnable: df2 is overwritten with its own result, so on
# a second run the label is already gone and drop() would otherwise raise KeyError.
df2 = df2.drop(pd.to_datetime("2018-01-03"), errors="ignore")
df2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,30914,31005,30816,30934,4611286,0.004383
2018-01-04,31243,31258,30811,30821,8194471,-0.007854
2018-01-05,30901,31234,30901,31239,7465878,0.013562
2018-01-08,31366,31496,31192,31450,7375511,0.006754
2018-01-09,31357,31580,31187,31324,7326903,-0.004006


In [82]:
# df1 still contains the 2018-01-03 row.
df1.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,51380,51400,50780,51020,169485,0.001177
2018-01-03,52540,52560,51420,51620,200270,0.01176
2018-01-04,52120,52180,50640,51080,233909,-0.010461
2018-01-05,51300,52120,51200,52120,189623,0.02036
2018-01-08,52400,52520,51500,52020,167673,-0.001919


In [83]:
# Reindex df2 onto df1's dates: 2018-01-03, which df2 no longer has, comes back as a row of NaN.
new_df2 = df2.reindex(df1.index)
new_df2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,30914.0,31005.0,30816.0,30934.0,4611286.0,0.004383
2018-01-03,,,,,,
2018-01-04,31243.0,31258.0,30811.0,30821.0,8194471.0,-0.007854
2018-01-05,30901.0,31234.0,30901.0,31239.0,7465878.0,0.013562
2018-01-08,31366.0,31496.0,31192.0,31450.0,7375511.0,0.006754


In [84]:
# After reindexing, new_df2 has the same shape as df1.
df1.shape
new_df2.shape

(202, 6)

(202, 6)

In [85]:
# Forward-fill: each NaN is replaced by the last valid value above it in the same column
# (the 2018-01-03 row takes the values of 2018-01-02). The result is only displayed;
# new_df2 itself is not reassigned. ffill() is equivalent to fillna(method="ffill") and,
# unlike that spelling, is not deprecated in later pandas releases.
new_df2.ffill()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,30914.0,31005.0,30816.0,30934.0,4611286.0,0.004383
2018-01-03,30914.0,31005.0,30816.0,30934.0,4611286.0,0.004383
2018-01-04,31243.0,31258.0,30811.0,30821.0,8194471.0,-0.007854
2018-01-05,30901.0,31234.0,30901.0,31239.0,7465878.0,0.013562
2018-01-08,31366.0,31496.0,31192.0,31450.0,7375511.0,0.006754
...,...,...,...,...,...,...
2018-10-24,26292.0,26314.0,25977.0,26023.0,11305852.0,-0.005123
2018-10-25,25481.0,25619.0,25236.0,25608.0,10915732.0,-0.015947
2018-10-26,25634.0,25639.0,24975.0,25202.0,7692140.0,-0.015854
2018-10-29,25289.0,25457.0,24951.0,24997.0,4954632.0,-0.008134
