In [2]:
import pandas as pd
import numpy as np

# 데이터 시각화 패키지
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# 가설검정 및 통계 분석 패키지
from scipy import stats

# 선형모델 formula ( y ~ x1 + x2 + ...)
import statsmodels.formula.api as smf
import statsmodels.api as sm

from statsmodels.stats.proportion import proportions_ztest
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

%matplotlib inline

matplotlib.rc('font', family = 'Malgun Gothic')
plt.rc('font', family = 'Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

In [3]:
mu  = 115
sigma = 20
x = 135
prob = stats.norm.cdf(x, mu, sigma) # cdf: 누적확률분포
print("{0}분 이상 걸리는 A/S작업 비율: {1:.1f}%".format(x, (1-prob)*100))

135분 이상 걸리는 A/S작업 비율: 15.9%


## 실습

In [53]:
mu = 12
sigma = 3
x1 = 15

prob1 = stats.norm.cdf(x1, mu, sigma)
print("{0}km 이상 {1}km 이하를 달릴 확률: {2:.1f}%".format(mu, x1,(prob1-0.5)*100))

12km 이상 15km 이하를 달릴 확률: 34.1%


## t분포

In [33]:
t = 1.53
df = 5

prob = stats.t.cdf(t, df) # t분포, t값과 자유도
print("P(T<=F): {:.3f}".format(prob))

P(T<=F): 0.907


In [34]:
t = 2.0
df = 100

prob = stats.t.cdf(t, df) # t분포, t값과 자유도
print("P(T<=F): {:.3f}".format(prob))

P(T<=F): 0.976


## 카이제곱 분포

In [35]:
chisq = 10
df = 30 # df = degree of freedom

prob = stats.chi2.cdf(chisq, df)
print("P(X<=x2): {:.4f}".format(prob))

P(X<=x2): 0.0002


## F분포

In [39]:
f = 2.0
dfnum = 15
dfnum2 = 15
prob = stats.f.cdf(x=f, dfn = dfnum, dfd=dfnum2)
print("P(X<=F):{:.3f}".format(prob))

P(X<=F):0.904


## 와이블 분포

In [44]:
x = 1500
alpha = 2.2
beta = 1200

prob = stats.weibull_min.cdf(x, alpha, scale = beta)
print("P(X>=x): {:.3f}".format((1-prob)*100))

P(X>=x): 19.518


## 이항분포

In [45]:
n = 3
for i in range(n+1):
    prob = stats.binom.pmf(k = i, n = n, p = 0.4)
    print("P(X={0}) = {1:.3f}".format(i, prob))

P(X=0) = 0.216
P(X=1) = 0.432
P(X=2) = 0.288
P(X=3) = 0.064


In [1]:
# 두 대 이하일 확률은 cdf를 이용
n = 3
p = 0.4
prob = stats.binom.cdf(2, n, p)
print("P(X<={0}) = {1:.3f}".format(2, prob))

NameError: name 'stats' is not defined

## 포아송 분포

In [51]:
mu = 2
prob = stats.poisson.pmf(3, mu)
cdf_prob = stats.poisson.cdf(2, mu)

print("1분당 {}번의 전화가 걸려올 확률: {:.1f}%".format(3, prob*100))
print("1분당 최대 {}번의 전화가 걸려올 확률: {:.1f}%".format(2, cdf_prob*100))

1분당 3번의 전화가 걸려올 확률: 18.0%
1분당 최대 2번의 전화가 걸려올 확률: 67.7%


## 정규성 검정

In [54]:
from scipy.stats import shapiro

In [57]:
ds_camshaft = pd.read_csv("./Data/camshaft.csv",engine = "python")
ds_camshaft.head()

Unnamed: 0,Length,Supp1,Supp2
0,601.4,598.0,601.6
1,601.6,599.8,600.4
2,598.0,600.0,598.4
3,601.4,599.8,600.0
4,599.4,600.0,596.8


In [61]:
statistic, p = stats.shapiro(ds_camshaft['Supp1'])
print("Shapiro-Wilk Test: statistic = {}, p-value={}".format(statistic, p))

Shapiro-Wilk Test: statistic = 0.9788156747817993, p-value=0.10744144767522812


### 실습(정규성 검정)

In [64]:
statistic, p = stats.shapiro(ds_mat['Mat_A'])
print("Shapiro-Wilk Test: statistic = {}, p-value={}".format(statistic, p))

# 데이터 셋의 Nan 데이터가 있기 때문에 나올 수 없는 값이 나온다.

Shapiro-Wilk Test: statistic = nan, p-value=1.0


In [67]:
statistic, p = stats.shapiro(ds_mat['Mat_A'][:10])
print("Shapiro-Wilk Test: statistic = {}, p-value={}".format(statistic, p))

# H_0을 따른다. P-value = 0.25이기 때문

Shapiro-Wilk Test: statistic = 0.9062636494636536, p-value=0.2562994360923767


In [68]:
statistic, p = stats.shapiro(ds_mat['Mat_B'][:10])
print("Shapiro-Wilk Test: statistic = {}, p-value={}".format(statistic, p))

Shapiro-Wilk Test: statistic = 0.9729753732681274, p-value=0.9169777035713196


### 모평균 추정

In [77]:
df = pd.DataFrame({'Sample': [3.6, 2.9, 2.8, 2.6, 3.4, 3.2, 2.2, 2.6, 2.6, 2.4,
                             2.4, 2.6, 3.4, 2.4, 2.1, 2.6, 3.0, 2.4, 2.7, 2.4,
                             2.6, 2.9, 2.9, 2.0, 2.7, 2.2, 2.8, 2.7, 1.8, 2.5,
                             3.0, 3.2, 2.8, 2.6, 3.2, 3.1, 2.9, 2.7, 2.7, 2.2]})
statistic, p =stats.shapiro(df)
print("statistic = {}, p-value={}".format(statistic, p))

statistic = 0.985405445098877, p-value=0.8764991760253906


In [76]:
lower, upper = stats.norm.interval(0.95, loc = np.mean(df), 
                                   scale = 0.397/np.sqrt(40))
print("신뢰구간: ({},{})".format(lower.round(2), upper.round(2)))

신뢰구간: ([2.57],[2.82])


In [77]:
df = pd.DataFrame({'Sample': [3.6, 2.9, 2.8, 2.6, 3.4, 3.2, 2.2, 2.6, 2.6, 2.4,
                             2.4, 2.6, 3.4, 2.4, 2.1, 2.6, 3.0, 2.4, 2.7, 2.4,
                             2.6, 2.9, 2.9, 2.0, 2.7, 2.2, 2.8, 2.7, 1.8, 2.5,
                             3.0, 3.2, 2.8, 2.6, 3.2, 3.1, 2.9, 2.7, 2.7, 2.2]})
statistic, p =stats.shapiro(df)
print("statistic = {}, p-value={}".foramat(statistic, p))

statistic = 0.985405445098877, p-value=0.8764991760253906


In [None]:
lower, upper = stats.norm.interval(0.95, loc = np.mean(df), 
                                   scale = 0.397/np.sqrt(40))
print("신뢰구간: ({},{})".format(lower.round(2), upper.round(2)))

### 모평균추정(모표준편차 모르는 경우)

In [112]:
df = pd.DataFrame({'Sample': [54.1, 53.3, 56.1, 55.7, 54.0, 54.1, 54.5, 57.1, 55.2, 53.8,
                             54.1, 54.1, 56.1, 55.0, 55.9, 56.0, 54.9, 54.3, 53.9, 55.0]})
df.head()

Unnamed: 0,Sample
0,54.1
1,53.3
2,56.1
3,55.7
4,54.0


In [115]:
lower, upper = stats.t.interval(0.95, len(df)-1, loc = np.mean(df), 
                                   scale = stats.sem(df)) # s/ sqrt(n)
print("신뢰구간: ({},{})".format(lower.round(2), upper.round(2)))

신뢰구간: ([54.39],[55.33])


## 모분산 신뢰구간 추정
- 카이제곱 인터벌 값

In [116]:
df = pd.DataFrame({'Sample': [12, 13, 16, 9, 5, 19, 13, 8, 11, 17,
                              9, 5, 12, 11, 8, 6, 10, 12, 17, 11,
                              10, 9, 12, 14, 9, 6, 8, 14, 5, 8,
                              5, 8, 4, 9, 6, 8, 13, 16, 9, 5, 
                              16, 13, 14, 17, 11, 9, 19, 16, 12, 8]})
print(np.mean(df), np.std(df), np.var(df))

Sample    10.74
dtype: float64 Sample    3.943653
dtype: float64 Sample    15.5524
dtype: float64


In [109]:
lower, upper = stats.chi2.interval(0.95, len(df)-1, loc = np.mean(df),
                                   scale = np.var(df))
print("신뢰구간: ({},{})".format(lower.round(2), upper.round(2)))
# [10.889, 24.033]

신뢰구간: ([501.49],[1102.87])


In [185]:
print(49*(3.94)**2/70.22,"<= chi2 <=", 49*(3.94)**2/31.56)

10.832475078325263 <= chi2 <= 24.10191381495564


In [186]:
prob = stats.chi2.cdf(16.92, 9)
print("P(X<=x2): {:.4f}".format(prob))

P(X<=x2): 0.9500


In [207]:
(0.18 - (1.96*((0.18*(1-0.18))/50)**0.5)), (0.18 + (1.96*((0.18*(1-0.18))/50)**0.5))


(0.07350867077550396, 0.28649132922449605)

In [204]:
1.96*((0.18*(1-0.18))/50)**0.5

0.10649132922449603