[구글 코랩(Colab)에서 실행하기](https://colab.research.google.com/github/lovedlim/bigdata_analyst_cert/blob/main/part3/ch6/ch6_ex_type3.ipynb)

### Section1.

In [3]:
# Sample data: 25 caffeine measurements stored in a single 'Caffeine(mg)' column.
import pandas as pd

caffeine_mg = [
    94.2, 93.7, 95.5, 93.9, 94.0, 95.2, 94.7, 93.5, 92.8, 94.4,
    93.8, 94.6, 93.3, 95.1, 94.3, 94.9, 93.9, 94.8, 95.0, 94.2,
    93.7, 94.4, 95.1, 94.0, 93.6,
]
df = pd.DataFrame({'Caffeine(mg)': caffeine_mg})

In [4]:
from scipy import stats

caffeine = df['Caffeine(mg)']

# 1. Sample mean (computed on the whole frame, so a one-entry Series is printed).
sample_mean = df.mean()
print(sample_mean)

# 2. Normality check with the Shapiro-Wilk test.
normality = stats.shapiro(caffeine)
print(normality)

# 3-5. One-sample t-test against a hypothesized mean of 95;
# alternative='less' makes it a left-tailed test.
one_sample = stats.ttest_1samp(caffeine, popmean=95, alternative='less')
print(one_sample)

Caffeine(mg)    94.264
dtype: float64
ShapiroResult(statistic=0.9826578166170536, pvalue=0.9322031137746971)
TtestResult(statistic=-5.501737036221897, pvalue=5.8686553916715e-06, df=24)


In [5]:
# Repeat the left-tailed one-sample t-test and print the p-value in fixed-point
# notation (10 decimals) rather than scientific notation.
t_result = stats.ttest_1samp(df['Caffeine(mg)'], 95, alternative='less')
statistic, pvalue = t_result
print(f"{pvalue:.10f}")

0.0000058687


### Section2.

In [6]:
# Sample data: charging times for 10 'New' chargers followed by 10 'Old' chargers.
import pandas as pd

new_times = [1.5, 1.6, 1.4, 1.7, 1.5, 1.6, 1.7, 1.4, 1.6, 1.5]
old_times = [1.7, 1.8, 1.7, 1.9, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6]

df = pd.DataFrame({
    '충전기': ['New'] * len(new_times) + ['Old'] * len(old_times),
    '충전시간': new_times + old_times,
})
print(df.head(2))

   충전기  충전시간
0  New   1.5
1  New   1.6


In [7]:
# 1-3. Independent two-sample t-test (equal variances assumed).
# alternative='less' tests whether the first sample ('New') has the smaller mean.
new_cond = df['충전기'] == 'New'
old_cond = df['충전기'] == 'Old'

new_sample = df.loc[new_cond, '충전시간']
old_sample = df.loc[old_cond, '충전시간']

ind_ttest = stats.ttest_ind(new_sample, old_sample, equal_var=True, alternative='less')
print(ind_ttest)

TtestResult(statistic=-4.582575694955849, pvalue=0.00011546547787696304, df=18.0)


### Section3.

In [8]:
# Sample data: one row per user with a measurement under the existing method
# ('기존방법') and under the new method ('새로운방법').
import pandas as pd

users = list(range(1, 11))
existing_method = [60.4, 60.7, 60.5, 60.3, 60.8, 60.6, 60.2, 60.5, 60.7, 60.4]
new_method = [59.8, 60.2, 60.1, 59.9, 59.7, 58.4, 57.0, 60.3, 59.6, 59.8]

df = pd.DataFrame({
    'User': users,
    '기존방법': existing_method,
    '새로운방법': new_method,
})
print(df.head(2))

   User  기존방법  새로운방법
0     1  60.4   59.8
1     2  60.7   60.2


In [9]:
new_scores = df['새로운방법']
existing_scores = df['기존방법']

# 1. Mean of the paired differences (new method minus existing method).
df['diff'] = new_scores - existing_scores
mean_diff = df['diff'].mean()
print(mean_diff)

# 2-4. Paired-sample t-test; alternative='less' tests whether the new-method
# values are smaller on average than the existing-method values.
paired = stats.ttest_rel(new_scores, existing_scores, alternative='less')
print(paired)

-1.0300000000000005
TtestResult(statistic=-3.407973078114844, pvalue=0.0038872633380070652, df=9)


### Section4.

In [10]:
import pandas as pd

# To work from a local copy instead, use: pd.read_csv("math.csv")
math_url = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part3/ch6/math.csv"
df = pd.read_csv(math_url)
print(df.head())

    groups  scores
0  group_A      85
1  group_A      88
2  group_A      90
3  group_A      82
4  group_A      87


In [11]:
from scipy import stats

# Boolean row masks, one per group label.
condA = df['groups'] == 'group_A'
condB = df['groups'] == 'group_B'
condC = df['groups'] == 'group_C'
condD = df['groups'] == 'group_D'

# Scores of each group, in A-B-C-D order.
group_scores = [df.loc[mask, 'scores'] for mask in (condA, condB, condC, condD)]

# 1. Shapiro-Wilk test (normality), run separately for every group.
for scores in group_scores:
    print(stats.shapiro(scores))

# 2. Levene test (homogeneity of variances) across the four groups.
print(stats.levene(*group_scores))

ShapiroResult(statistic=0.9715896670696531, pvalue=0.9051800443853569)
ShapiroResult(statistic=0.9499422438060351, pvalue=0.6678172590861611)
ShapiroResult(statistic=0.9299424104842702, pvalue=0.44732595113862045)
ShapiroResult(statistic=0.9065684572704982, pvalue=0.25824165549017347)
LeveneResult(statistic=1.757685352622062, pvalue=0.17270284963232108)


In [12]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Fit the linear model underlying the one-way ANOVA: scores explained by group.
model = ols('scores ~ groups', data=df).fit()

# 3-9. ANOVA table (df, sums of squares, mean squares, F, p-value).
one_way_table = anova_lm(model)
print(one_way_table)

            df  sum_sq     mean_sq          F        PR(>F)
groups     3.0   411.8  137.266667  34.174274  1.240642e-10
Residual  36.0   144.6    4.016667        NaN           NaN


### Section5.

In [13]:
# Data
import pandas as pd

# To work from a local copy instead, use: pd.read_csv("tomato2.csv")
tomato_url = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part3/ch6/tomato2.csv"
df = pd.read_csv(tomato_url)
print(df.head())

  비료유형  물주기  수확량
0    A    1  514
1    A    1  480
2    A    1  507
3    A    2  452
4    A    2  526


In [14]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 1-9. Two-way ANOVA: both factors are wrapped in C() so they are treated as
# categorical, and '*' adds the two main effects plus their interaction.
two_way_formula = '수확량 ~ C(비료유형) * C(물주기)'
model = ols(two_way_formula, data=df).fit()

anova_table = sm.stats.anova_lm(model)
print(anova_table)

                  df        sum_sq      mean_sq         F    PR(>F)
C(비료유형)          2.0   5251.722222  2625.861111  3.184685  0.059334
C(물주기)           3.0   9057.000000  3019.000000  3.661490  0.026460
C(비료유형):C(물주기)   6.0   4271.833333   711.972222  0.863491  0.535426
Residual        24.0  19788.666667   824.527778       NaN       NaN


### Section6.

In [15]:
from scipy.stats import chisquare

# 1. Share of people who experienced 5 or more traffic accidents (30 out of 1000).
ratio = 30 / 1000
print(ratio)

# 2-4. Chi-square goodness-of-fit test: observed counts versus the counts
# expected from the hypothesized category proportions.
total = 1000
proportions = (0.60, 0.25, 0.08, 0.05, 0.02)
observed = [550, 250, 100, 70, 30]
expected = [total * p for p in proportions]
print(chisquare(observed, f_exp=expected))

0.03
Power_divergenceResult(statistic=22.166666666666668, pvalue=0.00018567620386641427)


### Section7.

In [16]:
import pandas as pd
from scipy.stats import chi2_contingency

# 1-3. Chi-square test of independence on a 2x2 table of counts.
counts = [[50, 30], [60, 40]]
observed = pd.DataFrame(counts)

independence = chi2_contingency(observed)
print(independence)

Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))


In [17]:
# Raw (long-form) data: one row per person with the camp label ('캠프')
# and the registration status ('등록여부').
import pandas as pd

camp = ['빅분기'] * 80 + ['정처기'] * 100
status = ['등록'] * 50 + ['등록안함'] * 30 + ['등록'] * 60 + ['등록안함'] * 40

df = pd.DataFrame({'캠프': camp, '등록여부': status})
print(df.head())

    캠프 등록여부
0  빅분기   등록
1  빅분기   등록
2  빅분기   등록
3  빅분기   등록
4  빅분기   등록


In [18]:
# Convert the row-level data into a contingency table (camp x registration status).
# Note: df is rebound to the table, so the raw rows are no longer available under this name.
df = pd.crosstab(index=df['캠프'], columns=df['등록여부'])
print(df)

# 4-6. Chi-square test of independence on the contingency table.
independence = chi2_contingency(df)
print(independence)

등록여부  등록  등록안함
캠프            
빅분기   50    30
정처기   60    40
Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))


### Section8.

In [19]:
# Sample data: 30 observations of discount rate ('할인율'), temperature ('온도'),
# advertising cost ('광고비') and order volume ('주문량').
import pandas as pd

discount = [
    28, 24, 13, 0, 27, 30, 10, 16, 6, 5, 7, 11, 11, 30, 25,
    4, 7, 24, 19, 21, 6, 10, 26, 13, 15, 6, 12, 6, 20, 2,
]
temperature = [
    15, 34, 15, 22, 29, 30, 14, 17, 28, 29, 19, 19, 34, 10,
    29, 28, 12, 25, 32, 28, 22, 16, 30, 11, 16, 18, 16, 33, 12, 22,
]
ad_cost = [
    342, 666, 224, 764, 148, 499, 711, 596, 797, 484, 986, 347, 146, 362, 642,
    591, 846, 260, 560, 941, 469, 309, 730, 305, 892, 147, 887, 526, 525, 884,
]
orders = [
    635, 958, 525, 25, 607, 872, 858, 732, 1082, 863, 904, 686, 699, 615, 893,
    830, 856, 679, 918, 951, 789, 583, 988, 631, 866, 549, 910, 946, 647, 943,
]

df = pd.DataFrame({
    '할인율': discount,
    '온도': temperature,
    '광고비': ad_cost,
    '주문량': orders,
})
print(df.head(3))

   할인율  온도  광고비  주문량
0   28  15  342  635
1   24  34  666  958
2   13  15  224  525


In [20]:
from statsmodels.formula.api import ols

# Fit a multiple linear regression of order volume ('주문량') on discount rate
# ('할인율'), temperature ('온도') and advertising cost ('광고비').
model = ols('주문량 ~ 할인율 + 온도 + 광고비', data=df).fit()

# 1. Correlation coefficient between discount rate and temperature.
discount_temp_corr = df['할인율'].corr(df['온도'])
print("1. 상관계수:", round(discount_temp_corr, 2))

# 2. Coefficient of determination.
print("2. 결정계수(R-squared):", round(model.rsquared, 2))

# 3. Regression coefficients (the whole parameter Series, intercept included).
coefficients = model.params
print("3. 회귀계수:", round(coefficients, 4))

# 4. Intercept.
print("4. 절편:", round(coefficients['Intercept'], 4))

# 5. p-value of the temperature coefficient.
temperature_pvalue = model.pvalues['온도']
print("5. pvalue:", round(temperature_pvalue, 4))

# 6. Predicted order volume for one new observation; int() truncates the decimals.
new_data = pd.DataFrame({"할인율": [10], "온도": [20], "광고비": [500]})
result = model.predict(new_data)
print("6. 새로운 데이터:", int(result[0]))

# 7. Residual sum of squares: residual = actual minus fitted value.
df['잔차'] = df['주문량'] - model.predict(df)
squared_residuals = df['잔차'] ** 2
print("7. 잔차 제곱합:", round(sum(squared_residuals), 2))

# 8. MSE (Mean Squared Error): plain mean of the squared residuals
# (divides by the number of rows, not by the residual degrees of freedom).
MSE = squared_residuals.mean()
print('8. MSE:', round(MSE, 4))

# 9. 90% confidence interval for each coefficient (alpha=0.1).
coef_intervals = model.conf_int(alpha=0.1)
print("9. 신뢰구간:\n", coef_intervals)

# 10. 90% confidence interval and prediction interval for a new observation.
new_data = pd.DataFrame({"할인율": [15], "온도": [25], "광고비": [300]})
pred = model.get_prediction(new_data)
result = pred.summary_frame(alpha=0.1)
print("10. 예측값의 신뢰구간과 예측구간:\n", result)

# 11. Hypothesis test: does advertising cost affect order volume?
# The null hypothesis is rejected when the coefficient's p-value is below 0.05.
cond = model.pvalues['광고비'] < 0.05
result = "기각" if cond else "채택"
print("11. 귀무가설", result)

# Full summary of the fitted linear regression model.
print(model.summary())

1. 상관계수: 0.09
2. 결정계수(R-squared): 0.4
3. 회귀계수: Intercept    267.6609
할인율            4.2068
온도             9.4798
광고비            0.4148
dtype: float64
4. 절편: 267.6609
5. pvalue: 0.0289
6. 새로운 데이터: 706
7. 잔차 제곱합: 732197.9
8. MSE: 24406.5966
9. 신뢰구간:
                    0           1
Intercept  45.955720  489.366084
할인율        -1.847229   10.260887
온도          2.490702   16.468984
광고비         0.201064    0.628589
10. 예측값의 신뢰구간과 예측구간:
          mean    mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0  692.207386  45.555397     614.507283     769.907488    395.622293   

   obs_ci_upper  
0    988.792478  
11. 귀무가설 기각
                            OLS Regression Results                            
Dep. Variable:                    주문량   R-squared:                       0.400
Model:                            OLS   Adj. R-squared:                  0.330
Method:                 Least Squares   F-statistic:                     5.770
Date:                Sun, 24 Nov 2024   Prob (F-statist

### Section9.

In [21]:
import pandas as pd

# To work from a local copy instead, use: pd.read_csv("customer_travel.csv")
travel_url = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/refs/heads/main/part3/ch6/customer_travel.csv"
df = pd.read_csv(travel_url)

df.head()

Unnamed: 0,age,service,social,booked,target
0,34,6,0,1,0
1,34,5,1,0,1
2,37,3,1,0,0
3,30,2,0,0,0
4,30,1,0,0,0


In [22]:
# Split the rows into a first part (a) and a second part (b) at the midpoint;
# floor division keeps the split position an integer.
midpoint = len(df) // 2
a, b = df.iloc[:midpoint], df.iloc[midpoint:]

# Check the size of each part.
a.shape, b.shape

((400, 5), (400, 5))

In [23]:
import numpy as np
from sklearn.metrics import accuracy_score
from statsmodels.formula.api import logit

# 1. Number of non-significant predictors (p-value >= 0.05) in the full model,
#    fitted on the first part of the data (a).
formula = "target ~ age + service + social + booked"
model = logit(formula, data=a).fit()
print(model.summary())
slope_pvalues = model.pvalues.iloc[1:]  # position 0 is the Intercept, which is excluded
print("1.", sum(slope_pvalues >= 0.05))

# 2. Refit with only age and booked, then report the predictor with the largest p-value.
formula = 'target ~ age + booked'
model = logit(formula, data=a).fit()
print(model.summary())
slope_pvalues = model.pvalues.iloc[1:]  # Intercept excluded again
print("2.", slope_pvalues.idxmax())

# 3. Predictor with the largest positive coefficient in the reduced model.
# model.params[1:].idxmax() is deliberately not printed: the answer given
# below states that no coefficient is positive.
print("3. 답 없음(양의 회귀계수가 없음)")

# 4. Log-likelihood of the reduced model.
log_likelihood = model.llf
print("4.", log_likelihood)

# 5. Residual deviance, computed as -2 * log-likelihood.
print("5.", -2 * log_likelihood)

# 6. Odds ratio for an increase of 3 in 'booked'.
booked_coef = model.params['booked']
print("6.", np.exp(booked_coef * 3))

# 7. Sum of the coefficients whose p-value is below 0.05 (the intercept is eligible too).
significant = model.pvalues < 0.05
print("7.", model.params[significant].sum())

# 8. Accuracy on the second part of the data (b), classifying with a 0.5 threshold.
pred = model.predict(b)
pred = (pred > 0.5).astype(int)
accuracy = accuracy_score(b['target'], pred)
print("8.", accuracy)

# 9. Error rate = 1 - accuracy.
error_rate = 1 - accuracy
print("9.", error_rate)

Optimization terminated successfully.
         Current function value: 0.527521
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  400
Model:                          Logit   Df Residuals:                      395
Method:                           MLE   Df Model:                            4
Date:                Sun, 24 Nov 2024   Pseudo R-squ.:                 0.05254
Time:                        02:46:58   Log-Likelihood:                -211.01
converged:                       True   LL-Null:                       -222.71
Covariance Type:            nonrobust   LLR p-value:                 0.0001052
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.3314      1.204      1.937      0.053      -0.028       4.691
age           -0.1043      0.