# Pandas Basics III

In [1]:
# Pandas Basics
from IPython.display import Image 

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

### 3. Summarizing and computing descriptive statistics
> 기술통계 계산과 요약

- pandas 객체는 일반적인 수학 메소드와 통계 메소드를 가지고 있다.
- 대부분은 Series나 DataFrame의 칼럼 또는 로우 하나에서 단일 값(합이나 평균 같은)을 구하는 축소(reduction) 혹은 요약통계 범주에 속한다.
- 순수 NumPy 배열에서 제공하는 동일한 메소드와 비교하여 pandas의 메소드는 처음부터 누락된 데이터를 제외하도록 설계되었다.

In [2]:
# Demo frame with deliberately missing entries (NaN) in both columns,
# built column-by-column; rows are labelled 'a'..'d'.
df = DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# Column totals (reduce down the rows); NaN entries are skipped.
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [5]:
# Row totals; NaN is ignored, so the all-NaN row 'c' sums to 0.0.
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [6]:
# skipna controls whether missing values are excluded.
# With skipna=False, any NaN in a row makes that row's mean NaN.
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [7]:
# With skipna=True, NaNs are dropped before averaging;
# a row that is entirely NaN (row 'c') still yields NaN.
df.mean(axis='columns', skipna=True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [9]:
# For each column, the index label of the row holding that column's maximum
# (not the maximum of the index itself).
df.idxmax(axis=0)

one    b
two    d
dtype: object

In [11]:
# Running total down each column; positions that were NaN stay NaN.
df.cumsum(axis=0)

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [12]:
# Several summary statistics per column in one call;
# `count` reflects only the non-NaN entries of each column.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [13]:
# Non-numeric Series: the pattern a, a, b, c repeated four times (16 values).
obj = Series(list('aabc') * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [14]:
obj.describe()   # top: the most frequent value   /   freq: how many times that value occurs

count     16
unique     3
top        a
freq       8
dtype: object

### Correlation and covariance
>상관관계와 공분산

- 상관관계와 공분산 같은 요약통계 계산은 인자가 두 벌 필요하다.
- Yahoo 금융사이트에서 구한 주식가격과 거래량을 담고 있는 DataFrame에 대해 생각해보자

In [24]:
# %pip list

In [25]:
# One-time setup (kept commented out): install the data-reader package into
# the kernel's own environment.
# %pip install pandas-datareader

In [26]:
# Download daily price history for each ticker from Yahoo Finance.
# Requires network access and the third-party `pandas-datareader` package
# (the successor of the removed `pandas.io.data` module).
import pandas_datareader as web

# Pin the date window explicitly. When start/end are omitted the reader falls
# back to a default window that depends on the run date, so the outputs
# recorded in this notebook (2017-05-30 .. 2022-05-27) could not be reproduced.
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker,
                                          start='2017-05-30',
                                          end='2022-05-27')

In [27]:
# Each entry of the dict is a pandas DataFrame (one per ticker).
type(all_data['AAPL'])

pandas.core.frame.DataFrame

In [28]:
# Peek at the first ten rows of the downloaded (real) AAPL data.
all_data['AAPL'].head(10)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-05-30,38.607498,38.3325,38.355,38.4175,80507600.0,36.576328
2017-05-31,38.5425,38.095001,38.4925,38.189999,97804800.0,36.359734
2017-06-01,38.3325,38.055,38.2925,38.294998,65616400.0,36.459702
2017-06-02,38.862499,38.2225,38.395,38.862499,111082800.0,36.999996
2017-06-05,38.612499,38.365002,38.584999,38.482498,101326800.0,36.63821
2017-06-06,38.952499,38.445,38.474998,38.612499,106499600.0,36.761978
2017-06-07,38.994999,38.619999,38.755001,38.842499,84278400.0,36.980961
2017-06-08,38.884998,38.599998,38.8125,38.747501,85003200.0,36.890507
2017-06-09,38.797501,36.505001,38.797501,37.244999,259530800.0,35.460018
2017-06-12,36.522499,35.627499,36.435001,36.355,289229200.0,34.612671


In [38]:
# Optional example (kept commented out): a Korea Exchange ticker uses the '.KS' suffix.
# jb = web.get_data_yahoo('{code}.KS'.format(code = '175330')) # JB Financial Group
# jb[-20:] # most recent 20 business days

In [41]:
# One column per ticker, holding that ticker's adjusted closing price.
price = DataFrame({symbol: all_data[symbol]['Adj Close'] for symbol in all_data})
price.head()  # first rows; use .tail() for the last ones

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-30,36.576328,114.918365,66.488335,975.880005
2017-05-31,36.359734,115.600021,65.95005,964.859985
2017-06-01,36.459702,115.630325,66.19558,966.950012
2017-06-02,36.999996,115.160767,67.763138,975.599976
2017-06-05,36.63821,115.433395,68.25415,983.679993


In [42]:
# One column per ticker, holding that ticker's daily trading volume.
volume = DataFrame({symbol: all_data[symbol]['Volume'] for symbol in all_data})
volume.head(10)

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-30,80507600.0,3834636.0,17072800.0,1466700
2017-05-31,97804800.0,3706396.0,30436400.0,2448100
2017-06-01,65616400.0,3052333.0,21603600.0,1410500
2017-06-02,111082800.0,3750642.0,34770300.0,1751000
2017-06-05,101326800.0,4157955.0,33316800.0,1252100
2017-06-06,106499600.0,3971871.0,31511100.0,1814600
2017-06-07,84278400.0,5089522.0,22301800.0,1453900
2017-06-08,85003200.0,3879614.0,24588300.0,1481900
2017-06-09,259530800.0,4562129.0,49187400.0,3309400
2017-06-12,289229200.0,6769189.0,47761700.0,3763500


In [44]:
# Day-over-day fractional change of each ticker's price.
returns = price.pct_change(periods=1)
returns.tail(5)  # last rows; use .head() for the first ones

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-05-23,0.040119,0.020937,0.032032,0.02153
2022-05-24,-0.019216,0.02005,-0.003952,-0.051408
2022-05-25,0.00114,0.00441,0.01117,-0.000817
2022-05-26,0.0232,0.018603,0.012875,0.02321
2022-05-27,0.040757,0.017386,0.027604,0.041581


In [45]:
# Correlation between the MSFT and IBM daily returns (a single number).
returns['MSFT'].corr(returns['IBM'])

0.4782894828523976

In [46]:
# Covariance between the MSFT and IBM daily returns (a single number).
returns['MSFT'].cov(returns['IBM'])

0.00014940137142860618

In [47]:
# Full pairwise correlation matrix across all ticker columns.
returns.corr(method='pearson')

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.431877,0.754194,0.679525
IBM,0.431877,1.0,0.478289,0.45095
MSFT,0.754194,0.478289,1.0,0.784601
GOOG,0.679525,0.45095,0.784601,1.0


In [48]:
# Full pairwise covariance matrix across all ticker columns.
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000401,0.000147,0.000277,0.000247
IBM,0.000147,0.00029,0.000149,0.000139
MSFT,0.000277,0.000149,0.000337,0.000261
GOOG,0.000247,0.000139,0.000261,0.000329


In [49]:
# Correlate every column of `returns` with the single IBM returns Series.
returns.corrwith(returns['IBM'])

AAPL    0.431877
IBM     1.000000
MSFT    0.478289
GOOG    0.450950
dtype: float64

In [50]:
# Given another DataFrame, corrwith pairs up columns by name:
# each ticker's returns are correlated with that same ticker's volume.
returns.corrwith(volume, axis=0)

AAPL   -0.083214
IBM    -0.102895
MSFT   -0.073719
GOOG   -0.102146
dtype: float64

### Unique values, value counts, and membership
> 유일 값, 값 세기, 멤버십

In [51]:
# Sample Series of single-letter labels with repeats.
obj = Series(list('cadaabbcc'))

In [52]:
# Distinct values of the Series, returned as an array in order of first
# appearance (not sorted).
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [54]:
obj.value_counts()   # counts how often each value occurs, most frequent first -- very handy

c    3
a    3
b    2
d    1
dtype: int64

In [56]:
# Frequency of each value without ordering the result by count.
# Uses the Series method: the top-level pd.value_counts function is deprecated
# in recent pandas releases and emits a FutureWarning.
obj.value_counts(sort=False)

c    3
a    3
d    1
b    2
dtype: int64

In [57]:
# Frequency of each value, ordered from most to least frequent.
# Uses the Series method: the top-level pd.value_counts function is deprecated
# in recent pandas releases and emits a FutureWarning.
obj.value_counts(sort=True)

c    3
a    3
b    2
d    1
dtype: int64

In [58]:
# Boolean membership mask: True where the value is 'b' or 'c'.
mask = obj.isin(list('bc'))
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [59]:
# Keep only the entries flagged True by the mask (original index labels are kept).
obj.loc[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [60]:
# Small integer table with three columns Qu1..Qu3, written row by row.
data = DataFrame(
    [
        [1, 2, 1],
        [3, 3, 5],
        [4, 1, 2],
        [3, 2, 4],
        [4, 3, 4],
    ],
    columns=['Qu1', 'Qu2', 'Qu3'],
)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [62]:
# Combine two functions covered earlier:
# 1) apply(): run a function on every column -- here a per-column value count
# 2) fillna(): replace NaN with a given value -- a value that never occurs in
#    a column has no count there, so it becomes 0
# pd.Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
