In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Pandas version 0.25.1 (`pip install pandas==0.25.1`)**

# `Series` Data type

-  Numpy's ndarray + 숫자가 아닌 다른 type의 index (E.g. 문자열)

In [5]:
import pandas as pd

In [6]:
a = pd.Series([1,2,3,4])
a

0    1
1    2
2    3
3    4
dtype: int64

In [7]:
# 첫번째 방법
s2 = pd.Series(
    [1, 2, 3, 4],
    index=['a', 'b', 'c', 'd']
)
s2

a    1
b    2
c    3
d    4
dtype: int64

In [8]:
s2.head(2)

a    1
b    2
dtype: int64

In [9]:
# 두번째방법
s2 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5})
s2.head()

a    1
b    2
c    3
d    4
e    5
dtype: int64


- 한가지 data type만 가지고 있을 수 있음 

## `nan`과 관련된 함수

In [10]:
import numpy as np

In [11]:
np.nan

nan

In [12]:
s = pd.Series([10, 0, 1, 1, 2, 3, 4, 5, 6, np.nan])
s

0    10.0
1     0.0
2     1.0
3     1.0
4     2.0
5     3.0
6     4.0
7     5.0
8     6.0
9     NaN
dtype: float64

In [13]:
len(s)

10

In [14]:
s.shape

(10,)

In [15]:
s.count()    # not count `nan`

np.int64(9)

In [16]:
s.unique()

# 수업에서는 다루지 않았지만, nunique()는 unique한 값들의 총 갯수를 알려주는 함수입니다.
# s.nunique()

array([10.,  0.,  1.,  2.,  3.,  4.,  5.,  6., nan])

In [17]:
s.value_counts()

1.0     2
10.0    1
0.0     1
2.0     1
3.0     1
4.0     1
5.0     1
6.0     1
Name: count, dtype: int64

- 이 외의 함수들에 대해서는 이후 수업에서 하나씩 다룰 예정!

## index label을 기준으로 Series간에 operation이 일어남

- Data의 '순서'가 아니라 index label이 자동으로 정렬되어 연산이 진행됨!

In [18]:
s3 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s4 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a']) 

In [19]:
s3 + s4

a    2
b    4
c    6
d    8
dtype: int64

# `DataFrame` Data type

- 다수의 Series를 하나의 변수로 관리할 수 있도록 만든 자료형
    - Series의 dict 형태라고 보면됨
        - `{'컬럼명1': Series1, '컬럼명2': Series2}`
        - 각 Series는 DataFrame의 column을 이룸
        - 당연히 DataFrame을 이루는 Series간의 index는 서로 다 같음! => 동일 index 사용

## DataFrame을 만드는 다양한 방법들

In [20]:
s1 = np.arange(1, 6, 1)
s2 = np.arange(6, 11, 1)
s1
s2

array([ 6,  7,  8,  9, 10])

In [21]:
df = pd.DataFrame(
    {
        'c1': s1,
        'c2': s2
    }
)
df

Unnamed: 0,c1,c2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [22]:
# 1번째 방법  (Default index and columns would be set)
pd.DataFrame(
    [
        [10,11],
        [10,12]
    ]
)
pd.DataFrame(
    np.array(
        [
            [10, 11],
            [20, 21]
        ]
    )
) 

Unnamed: 0,0,1
0,10,11
1,20,21


In [23]:
# 2번째 방법 (많이 안쓰임)
pd.DataFrame(
    [
        pd.Series(np.arange(10, 15)),   # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
        pd.Series(np.arange(15, 20)),   # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
    ]
)

pd.DataFrame(
    [
        np.arange(10, 15),
        np.arange(15, 20),
    ]
)

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [24]:
# 3번째 방법 (with column & index names)
pd.DataFrame(
    np.array(
        [
            [10, 11],
            [20, 21]
        ]
    ), 
    columns=['a', 'b'],
    index=['r1', 'r2']
)

    

Unnamed: 0,a,b
r1,10,11
r2,20,21


In [25]:
# 4번째 방법
s1 = pd.Series(np.arange(1, 6, 1))    # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
s2 = pd.Series(np.arange(6, 11, 1))   # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
pd.DataFrame(
    {
        'c1': [1,2,3],    # list, np.array, Series 전부 다 올 수 있음!
        'c2': [4,5,6]
    }
)

Unnamed: 0,c1,c2
0,1,4
1,2,5
2,3,6


In [26]:
# 참고: 1줄짜리 만들 때도 dictionary의 value에 해당하는 값들은 iterable한 data type(e.g. list, np.array, Series 등)으로 설정해줘야함
pd.DataFrame({'c1': [0], 'c2': [1]})

Unnamed: 0,c1,c2
0,0,1


In [28]:
s1 = pd.Series(np.arange(1, 6, 1), index=['a', 'b', 'c', 'd', 'e'])
s2 = pd.Series(np.arange(6, 11, 1), index=['b', 'c', 'd', 'f', 'g'])
df = pd.DataFrame(
    {
        'c1': s1,
        'c2': s2
    }
)
df

Unnamed: 0,c1,c2
a,1.0,
b,2.0,6.0
c,3.0,7.0
d,4.0,8.0
e,5.0,
f,,9.0
g,,10.0


## DataFrame 생성시, Series간에 Index 기준으로 자동정렬!

In [30]:
s1 = pd.Series(np.arange(1, 6, 1))
s2 = pd.Series(np.arange(6, 11, 1))
s3 = pd.Series(np.arange(12, 15), index=[1, 2, 10])  # this one has index values unlike s1, s2
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [31]:
s2

0     6
1     7
2     8
3     9
4    10
dtype: int64

In [32]:
s3

1     12
2     13
10    14
dtype: int64

In [33]:
df = pd.DataFrame({'c1': s1, 'c2': s2, 'c3': s3}) 
df

Unnamed: 0,c1,c2,c3
0,1.0,6.0,
1,2.0,7.0,12.0
2,3.0,8.0,13.0
3,4.0,9.0,
4,5.0,10.0,
10,,,14.0


## DataFrame에 새로운 column 추가하기

In [34]:
my_dict['a'] = 1

NameError: name 'my_dict' is not defined

In [35]:
df['c4'] = pd.Series([1,2,3,4], index=[0, 1, 2, 10])

In [36]:
df

Unnamed: 0,c1,c2,c3,c4
0,1.0,6.0,,1.0
1,2.0,7.0,12.0,2.0
2,3.0,8.0,13.0,3.0
3,4.0,9.0,,
4,5.0,10.0,,
10,,,14.0,4.0


## Reindexing

- 새로운 index label을 기반으로 기존의 "index-value" mapping은 유지한채 재배열하는 것


### 참고: index 자체를 바꾸는 것("index-value" mapping이 깨짐)

In [37]:
s = pd.Series([1,2,3,4,5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [38]:
s.index = ['a', 'b', 'c', 'd', 'e']
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

### 참고 :  `set_index()` : 특정 column을 index로 만듦

In [39]:
# 위의 'DataFrame 생성시, Series간에 Index 기준으로 자동정렬!' 챕터에서 정의한 dataframe입니다
df

Unnamed: 0,c1,c2,c3,c4
0,1.0,6.0,,1.0
1,2.0,7.0,12.0,2.0
2,3.0,8.0,13.0,3.0
3,4.0,9.0,,
4,5.0,10.0,,
10,,,14.0,4.0


In [42]:
df['c5'] = pd.Series([1,2,3,4,5,6], index=[0,1,2,3,4,10])
df

Unnamed: 0,c1,c2,c3,c4,c5
0,1.0,6.0,,1.0,1
1,2.0,7.0,12.0,2.0,2
2,3.0,8.0,13.0,3.0,3
3,4.0,9.0,,,4
4,5.0,10.0,,,5
10,,,14.0,4.0,6


In [43]:
df.set_index("c5")

Unnamed: 0_level_0,c1,c2,c3,c4
c5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,6.0,,1.0
2,2.0,7.0,12.0,2.0
3,3.0,8.0,13.0,3.0
4,4.0,9.0,,
5,5.0,10.0,,
6,,,14.0,4.0


### Reindex

In [44]:
s2 = s.reindex(
    ['a', 'c', 'e', 'g']
)
s2

a    1.0
c    3.0
e    5.0
g    NaN
dtype: float64

In [45]:
# Copied
s2['a'] = 0
s2

a    0.0
c    3.0
e    5.0
g    NaN
dtype: float64

In [46]:
# s는 s2의 값을 바꿔도 안 건드려짐
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [47]:
# [X] 이렇게 하면 안됨
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])
s1
s2

0    3
1    4
2    5
dtype: int64

In [48]:
s1 + s2

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [49]:
s1.index

Index([0, 1, 2], dtype='int64')

In [50]:
s2 = s2.reindex(s1.index)
s2

0   NaN
1   NaN
2   NaN
dtype: float64

In [51]:
# 첫번째 방법
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])

In [52]:
s2.index = s2.index.astype(int)

In [53]:
s2

0    3
1    4
2    5
dtype: int64

In [54]:
s2.index

Index([0, 1, 2], dtype='int64')

In [55]:
s1 + s2

0    3
1    5
2    7
dtype: int64

In [56]:
# 두번째 방법
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])

In [57]:
s1.index = ['a', 'b', 'c']
s2.index = ['a', 'b', 'c']

In [58]:
s1 + s2

a    3
b    5
c    7
dtype: int64

#### `reindex()`의 유용한 Arguments

- `fill_value`

In [59]:
s2 = s.copy()
s2

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [60]:
s2.reindex(['a', 'f'])

a    1.0
f    NaN
dtype: float64

In [61]:
s2.reindex(['a', 'f'], fill_value=0)  # fill 0 insteand of Nan

a    1
f    0
dtype: int64

- `method`

In [62]:
s3 = pd.Series(['red', 'green', 'blue'], index=[0, 3, 5])
s3

0      red
3    green
5     blue
dtype: str

In [63]:
s3.reindex(np.arange(0,7))

0      red
1      NaN
2      NaN
3    green
4      NaN
5     blue
6      NaN
dtype: str

In [64]:
s3.reindex(np.arange(0,7), method='ffill')

0      red
1      red
2      red
3    green
4    green
5     blue
6     blue
dtype: str

#### 예제

In [65]:
# 맨 첫 강의에서 라이브러리를 설치할 때 requirements.txt를 이용해서 설치를 했으면, 건너뛰셔도 됩니다. 
!pip install finance_datareader == 0.9.1

zsh:1: = not found


In [76]:
import numpy as np
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Pandas DataFrame의 사이즈가 큰 경우, 어떻게 화면에 출력을 할지를 세팅하는 코드
pd.set_option("display.float_format", lambda x: f"{x:.3f}")
pd.set_option("display.max_columns", None)

In [66]:
import FinanceDataReader as fdr

In [71]:
# 삼성전자
df1 = fdr.DataReader("005930", '2025-01-02', '2026-10-30')

# KODEX 200 (ETF)
df2 = fdr.DataReader("069500", '2025-01-03', '2026-10-30')

In [72]:
df1.head(2)
df1.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2026-02-05,162000,164000,158500,159300,38435228,-0.057954
2026-02-06,154100,160300,151600,158600,36252250,-0.004394


In [73]:
df2.head(2)
df2.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2026-02-05,77150,78170,75625,76010,23642888,-0.042514
2026-02-06,73650,75655,72105,75195,17873907,-0.010722


In [78]:
# 삼성전자
df1 = fdr.DataReader("005930", '2025-01-02', '2026-10-30')

# KODEX 200 (ETF)
df2 = fdr.DataReader("069500", '2025-01-02', '2026-10-30')

In [79]:
df1.shape
df2.shape

(268, 6)

(268, 6)

In [80]:
df2 = df2.drop(pd.to_datetime("2025-01-03"))
df2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-02,31193,31355,31008,31252,5870312,-0.002
2025-01-06,31975,32639,31926,32562,7800165,0.022
2025-01-07,32869,33079,32562,32571,7868239,0.0
2025-01-08,32298,33060,32298,32884,8342990,0.01
2025-01-09,32869,33197,32829,32908,9190295,0.001


In [81]:
df1.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-02,52700,53600,52300,53400,16630538,0.004
2025-01-03,52800,55100,52800,54400,19318046,0.019
2025-01-06,54400,56200,54300,55900,19034284,0.028
2025-01-07,56800,57300,55400,55400,17030235,-0.009
2025-01-08,54800,57500,54700,57300,26593553,0.034


In [82]:
new_df2 = df2.reindex(df1.index)
new_df2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-02,31193.0,31355.0,31008.0,31252.0,5870312.0,-0.002
2025-01-03,,,,,,
2025-01-06,31975.0,32639.0,31926.0,32562.0,7800165.0,0.022
2025-01-07,32869.0,33079.0,32562.0,32571.0,7868239.0,0.0
2025-01-08,32298.0,33060.0,32298.0,32884.0,8342990.0,0.01


In [83]:
df1.shape
new_df2.shape

(268, 6)

(268, 6)

In [85]:
new_df2.ffill()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-02,31193.000,31355.000,31008.000,31252.000,5870312.000,-0.002
2025-01-03,31193.000,31355.000,31008.000,31252.000,5870312.000,-0.002
2025-01-06,31975.000,32639.000,31926.000,32562.000,7800165.000,0.022
2025-01-07,32869.000,33079.000,32562.000,32571.000,7868239.000,0.000
2025-01-08,32298.000,33060.000,32298.000,32884.000,8342990.000,0.010
...,...,...,...,...,...,...
2026-02-02,75680.000,76705.000,72590.000,72700.000,27331588.000,-0.060
2026-02-03,75450.000,78290.000,75280.000,78290.000,15709267.000,0.077
2026-02-04,77635.000,79550.000,77265.000,79385.000,19180981.000,0.014
2026-02-05,77150.000,78170.000,75625.000,76010.000,23642888.000,-0.043
