In [73]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from xgboost import XGBClassifier

Seoul = pd.read_csv("SeoulTotal.csv", index_col=0)
Pusan = pd.read_csv("PusanTotal.csv", index_col=0)
Incheon = pd.read_csv("IncheonTotal.csv", index_col=0)
Gwangju = pd.read_csv("GwangjuTotal.csv", index_col=0)
Daejeon = pd.read_csv("DaejeonTotal.csv", index_col=0)
Ulsan = pd.read_csv("UlsanTotal.csv", index_col=0)

In [3]:
dataset = [Seoul, Pusan, Incheon, Gwangju, Daejeon, Ulsan]
newdata = []
for data in dataset:
    data['천식폐질환'] = data.J45 + data.J46 + data.J44
    data['뇌졸증'] = data.I60 = data.I61 + data.I62 + data.I63
    data['감기'] = data.J00 + data.J03 + data.J06 + data.J20
    data = data[data.columns[:6].tolist() + data.columns[17:].tolist()]
    newdata.append(data)
    
dataset = newdata

In [4]:
target = ['천식폐질환', '뇌졸증', '감기']
for data in dataset:
    for tgt in target:
        dist = data[tgt].fillna(0).tolist()
        dist.sort(reverse=True)
        bound1 = dist[int(len(dist)*.05)]
        bound2 = dist[int(len(dist)*.15)]
        bound3 = dist[int(len(dist)*.60)]
        print(tgt)
        print(bound1, bound2, bound3)
    print("-"*10)
        
# 뇌졸증의 경우 너무 환자 수가 적어 정확한 경계값을 환자 수 기준으로 세우기 어려웠다.
# 실제 뇌졸증지수 예보값의 분포를 한 번 봐서 이상적으로 분포되어 있는지 체크할 필요가 있어 보임.
# 일단 뇌졸증은 빼고 해보는걸로

# 천식폐질환 지수도 bound3 이 0이 나오는 경우 경계값을 삼기가 힘들다.
# 따라서 명료한 도시 우선으로 먼저 해본다. 선정된 도시는
# 서울, 부산, 인천
# 세 도시를 우선으로 해보기로 한다.

천식폐질환
20 13 2
뇌졸증
5 3 0
감기
288 209 94
----------
천식폐질환
9 6 1
뇌졸증
2 1 0
감기
126 91 40
----------
천식폐질환
9 5 1
뇌졸증
3 2 0
감기
92 67 28
----------
천식폐질환
5.0 3.0 0.0
뇌졸증
1.0 1.0 0.0
감기
44.0 32.0 13.0
----------
천식폐질환
4 2 0
뇌졸증
2 1 0
감기
58 43 20
----------
천식폐질환
3.0 2.0 0.0
뇌졸증
1.0 0.0 0.0
감기
47.0 33.0 15.0
----------


In [35]:
target = ['천식폐질환', '감기']
for data in dataset[:3]:
    for tgt in target:
        dist = data[tgt].fillna(0).tolist()
        dist.sort(reverse=True)
        bound1 = dist[int(len(dist)*.05)]
        bound2 = dist[int(len(dist)*.15)]
        bound3 = dist[int(len(dist)*.60)]
        data[tgt[:2]+"지수"] = data[tgt].map(lambda x: (x>=bound1)+(x>=bound2)+(x>=bound3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [45]:
mydata = dataset[0]

In [49]:
mydata.columns[:6]

Index(['MIN_TA', 'MAX_TA', 'AVG_RHM', 'AVG_PA', 'AVG_WS', 'PM10'], dtype='object')

In [50]:
X_train, X_test, y_train, y_test = train_test_split(mydata[mydata.columns[:6]],
                                                    mydata['천식지수'],
                                                    test_size=0.2,
                                                    shuffle=True)

In [51]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2921, 6) (731, 6) (2921,) (731,)


In [57]:
model = XGBClassifier()

In [58]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [59]:
y_pred = model.predict(X_test)

In [76]:
CM = confusion_matrix(y_true=y_test, y_pred=y_pred)
CM

array([[ 97, 174,   0,   1],
       [ 74, 273,   1,   0],
       [ 18,  57,   0,   0],
       [ 14,  22,   0,   0]], dtype=int64)

In [78]:
np.diag(CM).sum() / CM.sum()

0.506155950752394

In [90]:
# 일단 서울의 경우에만
X_test['DIFF_TA'] = X_test['MAX_TA'] - X_test['MIN_TA']
y_pred = []
for i in range(len(X_test)):
    row = X_test.iloc[i,]
    
    if row.MIN_TA < -8.1: temp_min = 4
    elif row.MIN_TA < 0.6: temp_min = 3
    elif row.MIN_TA < 13.5: temp_min = 2
    else: temp_min = 1
    
    if row.DIFF_TA >= 12.5: temp_dif = 4
    elif row.DIFF_TA >= 9.9: temp_dif = 3
    elif row.DIFF_TA >= 7.3: temp_dif = 2
    else: temp_dif = 1
        
    if row.AVG_PA >= 1017.9: lp = 4
    elif row.AVG_PA >= 1011.9: lp = 3
    elif row.AVG_PA >= 1003.5: lp = 2
    else: lp = 1
    
    if row.AVG_RHM < 37.4: rh = 4
    elif row.AVG_RHM < 50.0: rh = 3
    elif row.AVG_RHM < 65.9: rh = 2
    else: rh = 1
    
    pred = .443*temp_min + .202*temp_dif + .315*lp + .04*rh
    if pred >= 3.0525: pred = 3
    elif pred >= 2.6452: pred = 2
    elif pred >= 1.5354: pred = 1
    else: pred = 0
    y_pred.append(pred)

In [91]:
CM = confusion_matrix(y_true=y_test, y_pred=y_pred)
CM

array([[109, 124,  27,  12],
       [136, 160,  34,  18],
       [ 17,  43,   9,   6],
       [ 10,  21,   3,   2]], dtype=int64)

In [92]:
np.diag(CM).sum() / CM.sum()

0.38303693570451436