# 산포통계

In [1]:
import numpy as np
from scipy import stats
import pandas as pd

## 분산계산

In [6]:
# Variance of the same data computed three ways, to contrast the ddof setting.
x = [1, 2, 3, 4, 5]
divider = "-" * 20

sample_var = np.var(x, ddof=1)              # ddof=1: divides by n - 1
array_var = np.array(x).var()               # no ddof passed: the default is used
series_var = pd.Series(x).var(ddof=0)       # ddof=0 requested explicitly

# One value per line, separated by a dashed divider.
print(sample_var, divider, array_var, divider, series_var, sep="\n")

2.5
--------------------
2.0
--------------------
2.0


## 표준편차 계산

In [10]:
# Standard deviation of the same data via NumPy (function and method) and pandas.
x = [1, 2, 3, 4, 5]

std_results = (
    np.std(x, ddof=1),           # ddof=1: divides by n - 1
    np.array(x).std(ddof=0),     # ddof=0: divides by n
    pd.Series(x).std(ddof=1),    # ddof=1 again, this time through pandas
)

# Print each result on its own line with a dashed divider in between.
print(*std_results, sep="\n" + "-" * 20 + "\n")

1.5811388300841898
--------------------
1.4142135623730951
--------------------
1.5811388300841898


## 변동계수의 필요성

In [13]:
# Variance and standard deviation are affected by the scale of the data,
# so they are unsuitable for comparing relative dispersion.

x1 = np.array([1, 2, 3, 4, 5])
x2 = 10 * x1  # same values, ten times the scale

# The standard deviation (ddof=1) grows with the scale of the data.
for sample in (x1, x2):
    print(np.std(sample, ddof=1))

1.5811388300841898
15.811388300841896


In [17]:
# Coefficient of variation = standard deviation / mean

for sample in (x1, x2):
    print(stats.variation(sample))

# The coefficient of variation is the same for both groups
# (the printed values agree up to floating-point rounding).

0.47140452079103173
0.4714045207910317


In [20]:
# Coefficient of variation computed by hand with ddof=1, i.e. treating the
# values as a sample drawn from a larger population.
for sample in (x1, x2):
    print(np.std(sample, ddof=1) / np.mean(sample))

0.5270462766947299
0.5270462766947299


## 스케일링
- 둘 이상의 변수의 값을 상대적으로 비교할 때 사용

In [23]:
# Two variables holding the same pattern at different scales.
x1 = np.array([1, 2, 3, 4, 5])
x2 = 10 * x1

# Show both arrays, one per line.
print(x1, x2, sep="\n")

[1 2 3 4 5]
[10 20 30 40 50]


In [25]:
# Standard scaling: each element's deviation from the mean divided by the
# standard deviation (ndarray.std() is called without ddof here).
z1, z2 = ((v - v.mean()) / v.std() for v in (x1, x2))

for scaled_values in (z1, z2):
    print(scaled_values)

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


In [26]:
# Min-max scaling: shift by the minimum and divide by the range (max - min),
# which maps the smallest value to 0 and the largest to 1.
z1, z2 = ((v - v.min()) / (v.max() - v.min()) for v in (x1, x2))

for scaled_values in (z1, z2):
    print(scaled_values)

[0.   0.25 0.5  0.75 1.  ]
[0.   0.25 0.5  0.75 1.  ]


## 데이터 표준화 하기

### 데이터 프레임 만들기

In [8]:
import pandas as pd

In [38]:
# Two-column frame: x2 holds the x1 values at ten times the scale.
X = pd.DataFrame(
    data={
        "x1": [1, 2, 3, 4, 5],
        "x2": [10, 20, 30, 40, 50],
    },
)
     

In [39]:
# Bare expression on the last line of the cell: the notebook renders X as the cell output.
X

Unnamed: 0,x1,x2
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


### scikit learn을 활용한 데이터 표준화하기

In [9]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp310-cp310-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   ------------ --------------------------- 2.9/8.9 MB 16.8 MB/s eta 0:00:01
   ------------------ --------------------- 4.2/8.9 MB 10.1 MB/s eta 0:00:01
   ------------------------------------ --- 8.1/8.9 MB 12.6 MB/s eta 0:00:01
   ---------------------------------------- 8.9/8.9 MB 12.8 MB/s  0:00:00
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn

   ------------- -------------------------- 1/3 [joblib]


In [10]:
# !dir
# ! 는 노트북 셀에서 터미널(셸) 명령어를 실행할 때 명령어 앞에 붙이는 접두어.

 C 드라이브의 볼륨에는 이름이 없습니다.
 볼륨 일련 번호: 1C12-191A

 C:\google_drive\KPMG_7th_lab\statistics_ex 디렉터리

2025-10-16  오전 09:34    <DIR>          .
2025-10-16  오전 09:34    <DIR>          ..
2025-10-14  오후 04:12             4,895 .gitignore
2025-10-15  오전 11:47    <DIR>          .ipynb_checkpoints
2025-10-14  오후 04:12                39 README.md
2025-10-15  오전 09:26             1,074 test.ipynb
2025-10-15  오전 11:48            11,056 [실습1-1]통계_대표통계.ipynb
2025-10-16  오전 09:34            11,464 [실습1-2]통계_산포통계.ipynb
               5개 파일              28,528 바이트
               3개 디렉터리  419,254,734,848 바이트 남음


In [12]:
# MinMaxScaler를 메모리에 로딩
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Create the MinMaxScaler, fit it on X, then rescale X with the fitted scaler.
# fit() followed by transform() is the two-step form of fit_transform().
scaler = MinMaxScaler()
scaler.fit(X)
scaled = scaler.transform(X)

In [16]:
# Bare expression on the last line of the cell: the notebook renders the scaled array as the cell output.
scaled

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])

In [17]:
# docstring 불러오기 : shift + tab
#자동완성 : tab

In [19]:
# Wrap the scaled array in a DataFrame so the columns carry their names again.
pd.DataFrame(data=scaled, columns=["x1", "x2"])

Unnamed: 0,x1,x2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


## 표준화(z-score): 평균 0, 표준편차 1인 data로 변환

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
# Standardize X column by column.
# fit() learns the statistics of each column of X, and transform() applies
# them to X; fit_transform(X) performs these same two steps in one call.
st_scaler = StandardScaler()
st_scaler.fit(X)
st_scaled = st_scaler.transform(X)

In [22]:
# Bare expression on the last line of the cell: the notebook renders the standardized array as the cell output.
st_scaled

array([[-1.41421356, -1.41421356],
       [-0.70710678, -0.70710678],
       [ 0.        ,  0.        ],
       [ 0.70710678,  0.70710678],
       [ 1.41421356,  1.41421356]])

In [23]:
# Wrap the standardized array in a DataFrame so the columns carry their names again.
pd.DataFrame(data=st_scaled, columns=["x1", "x2"])

Unnamed: 0,x1,x2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


## 범위와 사분위 범위 계산하기

In [24]:
# numpy를 사용하여 정규분포를 따르는 랜덤한 데이터 만들기

import numpy as np

In [25]:
# Draw 1000 random values from a normal distribution with mean 100 and standard deviation 20.
y = np.random.normal(loc=100, scale=20, size=1000)

In [26]:
# 소문자(x) : 칼럼이 1개일 경우
# 대문자(X) : 칼럼이 2개 이상일 경우

In [28]:
# Bare expression on the last line of the cell: the notebook renders the sampled array as the cell output.
y

array([100.56890318, 141.5368697 ,  92.8244399 , 121.49153546,
        89.80525226,  92.81694898,  88.3717851 , 112.311679  ,
        77.33845833, 101.21075532,  89.82842445,  89.62166581,
        92.7286809 ,  94.52172933,  84.1482591 , 116.35282998,
        77.14688696,  94.44955353,  94.12831804, 114.92622742,
        90.97695224, 105.4074812 , 109.30732733, 116.90781275,
        93.6538421 ,  75.63741227, 112.22828319, 107.07604645,
       113.87532949, 110.19433647, 108.32848253,  68.0912474 ,
       100.06929283,  58.94734228, 105.14810858,  82.07107012,
       144.12256938,  61.51795625,  87.93715305,  91.76951584,
       105.8797572 , 117.48633988, 100.27071227, 111.48734658,
       108.92331112,  89.74814639, 126.4358778 ,  62.44661446,
       132.35370829,  91.71539399,  79.05012059,  63.45004404,
       103.99508902,  99.9426575 , 148.11706733, 112.27020035,
       149.81186815, 104.9787121 ,  88.12013202, 115.05533751,
       101.48275295, 116.96103922, 146.09943309, 124.42

- m : 마크다운(문자) 셀로 변환 (명령 모드 단축키)
- y : 코드 셀로 변환 (명령 모드 단축키)

In [31]:
# Total range of y (max - min), computed two equivalent ways.
range_by_ptp = np.ptp(y)
range_by_hand = np.max(y) - np.min(y)

print(range_by_ptp)
print(range_by_hand)

128.37547502198038
128.37547502198038


In [33]:
# Median of y, which is also its 0.5 quantile.
median_value = np.median(y)
half_quantile = np.quantile(y, 0.5)

print(median_value)
print(half_quantile)

98.62213282135426
98.62213282135426


In [34]:
# IQR = third quartile (0.75 quantile) - first quartile (0.25 quantile)
q1 = np.quantile(y, 0.25)
q3 = np.quantile(y, 0.75)
print(q3 - q1)

28.13293668110397


In [35]:
from scipy import stats
#import scipy.stats 

In [37]:
# The same interquartile range, this time through scipy.stats.iqr.
iqr_value = stats.iqr(y)
print(iqr_value)

28.13293668110397
