# 5.3 기술 통계 계산과 요약

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a 4x2 demo frame (rows a-d, columns one/two) that contains missing
# values, so the NA handling of the reductions in this section is visible.
df = pd.DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list("abcd"),
    columns=["one", "two"],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# With no arguments, sum() returns one total per column; NA entries are skipped.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [6]:
# axis="index" reduces over the row labels, giving the same per-column totals as the default call.
df.sum(axis = "index")

one    9.25
two   -5.80
dtype: float64

In [4]:
# axis="columns" sums across the columns, giving one total per row; the all-NA row "c" yields 0.0.
df.sum(axis="columns")

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [5]:
# To make a row's total NaN whenever that row contains an NA value,
# turn off the default NA skipping with skipna=False.
df.sum(axis="columns", skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [7]:
# Some aggregations, such as the mean, need at least one non-NA value to
# produce a result, so the all-NA row "c" stays NaN.
df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [8]:
# idxmin / idxmax are indirect statistics: they return the index label at which
# the minimum / maximum occurs rather than the value itself.
df.idxmax()

one    b
two    d
dtype: object

In [11]:
# Row label at which each column reaches its smallest value.
df.idxmin()

one    d
two    b
dtype: object

In [9]:
# For each row, the name of the column that holds the largest value.
df.idxmax(axis="columns")

a    one
b    one
c    NaN
d    one
dtype: object

In [10]:
# For each row, the name of the column that holds the smallest value.
df.idxmin(axis="columns")

a    one
b    two
c    NaN
d    two
dtype: object

In [12]:
# Accumulation: cumsum() returns running totals.
# By default it accumulates down each column; pass axis=1 to accumulate across each row instead.
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [13]:
# Running total across the columns of each row.
df.cumsum(axis="columns")

Unnamed: 0,one,two
a,1.4,
b,7.1,2.6
c,,
d,0.75,-0.55


In [14]:
# describe() produces several summary statistics in a single call.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [17]:
# For non-numeric data, describe() reports a different set of summary
# statistics: count, unique, top and freq.
obj = pd.Series(['a','a','b','c']*4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## 상관관계와 공분산

상관관계나 공분산 같은 일부 요약 통계는 한 쌍의 인수(두 개의 Series 또는 열)로부터 계산된다.

In [19]:
# Load the pickled price and volume tables.
# Raw strings keep the Windows backslashes literal: in a normal string "\P" and
# "\y" are invalid escape sequences, which newer Python versions warn about.
# NOTE(review): read_pickle can execute arbitrary code while loading, so only
# point it at files from a trusted source.
price = pd.read_pickle(r"C:\Python for Data Analysis\yahoo_price.pkl")
volume = pd.read_pickle(r"C:\Python for Data Analysis\yahoo_volume.pkl")

In [30]:
# Show the last rows of the row-to-row percent change in volume.
volume.pct_change().tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.337351,0.280108,0.351567,-0.130371
2016-10-18,0.039306,0.831666,1.168036,-0.196412
2016-10-19,-0.184043,-0.941571,-0.637221,0.194726
2016-10-20,0.204207,13.87307,-0.131624,1.161672
2016-10-21,-0.072163,-0.273152,0.094156,0.617091


In [21]:
# Returns: percent change of price from one row to the next; show the last rows.
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [22]:
# Series.corr computes the correlation of two Series over their index-aligned, non-NA values.
# Series.cov computes the covariance.
returns["MSFT"].corr(returns["IBM"])

0.4997636114415114

In [23]:
# Covariance between the MSFT and IBM return series.
returns["MSFT"].cov(returns["IBM"])

8.870655479703546e-05

DataFrame의 corr과 cov 메서드는 각각 모든 열 쌍에 대한 상관관계 행렬과 공분산 행렬을 DataFrame으로 반환한다.

In [24]:
# On a DataFrame, corr() returns the full column-by-column correlation matrix.
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [25]:
# On a DataFrame, cov() returns the full column-by-column covariance matrix.
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [26]:
# To compute correlations between a DataFrame and a Series,
# use the corrwith method.
returns.corrwith(returns["IBM"])
# The result is a Series holding the correlation computed for each column.

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [27]:
# Passing a DataFrame to corrwith computes the correlations
# between the columns whose names match.
# Here: correlation of the percent changes in price (returns) with volume.
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

In [31]:
# Compute the correlation row by row instead of column by column.
returns.corrwith(volume, axis="columns")

Date
2010-01-04         NaN
2010-01-05    0.737298
2010-01-06    0.017069
2010-01-07    0.507614
2010-01-08   -0.779646
                ...   
2016-10-17   -0.881606
2016-10-18   -0.303369
2016-10-19   -0.970723
2016-10-20   -0.304414
2016-10-21    0.927824
Length: 1714, dtype: float64

## 유일값, 값 세기, 멤버십