## 텐서후로우 1 버전과 OLS 사용한
## 사망 기간 예측 Linear Regression

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import seaborn as sns

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [81]:
df = pd.read_csv("C://workspace//project_data//lata//d_final.csv")
df.head()

Unnamed: 0,id,sex,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,...,tobacco,contact_other_covid,covid_res,icu,DEATH,d_ent,d_sym,d_last,diff_days,age_grp
0,167386,1,2,2,54,2,2,2,2,2,...,2,3,1,2,0,2020-04-06,2020-04-01,2020-06-29,89,3.0
1,0b5948,2,2,1,30,2,2,2,2,2,...,2,3,1,2,0,2020-04-17,2020-04-10,2020-06-29,80,2.0
2,0d01b5,1,2,2,60,2,1,2,2,2,...,2,3,1,2,1,2020-04-13,2020-04-13,2020-04-22,9,4.0
3,1beec8,2,2,1,47,2,1,2,2,2,...,2,3,1,1,1,2020-04-16,2020-04-16,2020-04-29,13,3.0
4,1.75E+56,2,2,2,63,2,2,2,2,2,...,2,3,1,2,0,2020-04-22,2020-04-13,2020-06-29,77,4.0


In [82]:
df['DEATH'].value_counts()

0    88993
1    32402
Name: DEATH, dtype: int64

## 사망기간 예측을 위한 생존자 데이터 제거

In [83]:
df['DEATH'] = df['DEATH'].replace(0, np.nan)
df.dropna(inplace=True)
df['DEATH'].value_counts()

1.0    32402
Name: DEATH, dtype: int64

## 변수 설정

In [84]:
t_train_raw = df[['sex', 'intubed','age_grp','pneumonia',
                  'pregnancy', 'diabetes', 'copd', 'asthma',
                  'inmsupr','hypertension','other_disease',
                  'cardiovascular','obesity', 'renal_chronic', 
                  'tobacco', 'contact_other_covid', 'covid_res', 
                  'icu']]

target_raw = df['diff_days']

In [85]:
# 정규화 적용
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(t_train_raw)
x_train_raw = pd.DataFrame(x_scaled)

## train/test 셋 분리
x_train, x_test, target_train, target_test = train_test_split(x_train_raw, target_raw, test_size=0.2)

In [86]:
print(x_train.shape, target_train.shape)
print(x_test.shape, target_test.shape)

(25921, 18) (25921,)
(6481, 18) (6481,)


## 텐서에 탑재를 위한 차원 변경

- target의 시리즈 데이터를 데이터 프레임으로 변환

In [87]:
target_train1 = target_train.values.reshape(25921, 1)
target_test1 = target_test.values.reshape(6481, 1)

print(x_train.shape, target_train1.shape)
print(x_test.shape, target_test1.shape)

(25921, 18) (25921, 1)
(6481, 18) (6481, 1)


## 텐서후로우 환경 설정
- 텐서후로우 1 버전으로 작성
- multivariable linear regression 모델

In [88]:
tf.set_random_seed(777)

x = tf.placeholder(tf.float32, shape=[None, 18])
y = tf.placeholder(tf.float32, shape=[None, 1])

w = tf.Variable(tf.random_normal([18, 1]))
b = tf.Variable(tf.random_normal([1]))

hypothesis = tf.matmul(x,w)+b

loss = tf.reduce_mean(tf.square(hypothesis - y))

train = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)

In [91]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for step in range(6001):
        loss_val, hy_val, _ = sess.run([loss, hypothesis, train], 
                 feed_dict={x:x_train, y:target_train1})
        if step % 200 == 0:
            print(step, "loss: ", loss_val, "\naccur: \n", hy_val)
    
    pred = sess.run(hypothesis, feed_dict={x:x_test})
    print("\nTest-set: \n", pred)

0 loss:  214.53139 
accur: 
 [[-2.7818787 ]
 [-1.7000356 ]
 [-0.88688165]
 ...
 [-1.8952549 ]
 [-0.7691507 ]
 [-1.663852  ]]
200 loss:  61.64809 
accur: 
 [[ 9.0798435]
 [ 8.48363  ]
 [11.567221 ]
 ...
 [10.486616 ]
 [ 9.449807 ]
 [ 9.3177   ]]
400 loss:  58.761776 
accur: 
 [[10.651594]
 [ 9.862703]
 [12.996555]
 ...
 [11.982484]
 [10.675323]
 [10.659082]]
600 loss:  58.40308 
accur: 
 [[10.894791 ]
 [10.104017 ]
 [13.0172415]
 ...
 [12.084607 ]
 [10.751709 ]
 [10.766776 ]]
800 loss:  58.120293 
accur: 
 [[10.96396 ]
 [10.19528 ]
 [12.867076]
 ...
 [12.014976]
 [10.688531]
 [10.722848]]
1000 loss:  57.86852 
accur: 
 [[11.008047]
 [10.263898]
 [12.704758]
 ...
 [11.930387]
 [10.615085]
 [10.666013]]
1200 loss:  57.64319 
accur: 
 [[11.046457]
 [10.326456]
 [12.550045]
 ...
 [11.850483]
 [10.547344]
 [10.613538]]
1400 loss:  57.440907 
accur: 
 [[11.081888]
 [10.385256]
 [12.404845]
 ...
 [11.777278]
 [10.486743]
 [10.567093]]
1600 loss:  57.258995 
accur: 
 [[11.114871 ]
 [10.440754 ]

- loss 값은 55 대에서 더 떨어지지 않음

## 새로운 데이터 프레임 생성을 위한 차원축소

In [129]:
pred = pred.reshape(6481,)

In [138]:
c = { 'True': target_test, 
     'Predicted': pred.round(), 
     'error': target_test - pred.round()}
pred_diff = pd.DataFrame(data=c).reset_index()
pred_diff = pred_diff.drop(['index'], axis=1)
pred_diff

Unnamed: 0,True,Predicted,error
0,5,12.0,-7.0
1,16,12.0,4.0
2,38,11.0,27.0
3,8,11.0,-3.0
4,5,12.0,-7.0
...,...,...,...
6476,17,11.0,6.0
6477,8,11.0,-3.0
6478,17,11.0,6.0
6479,7,11.0,-4.0


## OLS Estimation

In [35]:
X_data = df[['sex', 'intubed','age_grp','pneumonia',
        'pregnancy', 'diabetes', 'copd', 'asthma',
        'inmsupr','hypertension','other_disease',
        'cardiovascular','obesity', 'renal_chronic', 
        'tobacco', 'contact_other_covid', 'covid_res', 
        'icu']]

target_data = df['diff_days']

xtr, xts, \
    ttr, tts = \
    train_test_split(X_data, target_data, test_size=0.2)

#data = sm.add_constant(data)
FD = sm.OLS(endog=ttr,
            exog=xtr).fit()
FD.summary()

0,1,2,3
Dep. Variable:,diff_days,R-squared (uncentered):,0.689
Model:,OLS,Adj. R-squared (uncentered):,0.689
Method:,Least Squares,F-statistic:,3187.0
Date:,"Thu, 17 Sep 2020",Prob (F-statistic):,0.0
Time:,16:20:31,Log-Likelihood:,-88762.0
No. Observations:,25921,AIC:,177600.0
Df Residuals:,25903,BIC:,177700.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
sex,0.2393,0.099,2.428,0.015,0.046,0.433
intubed,-0.9925,0.148,-6.723,0.000,-1.282,-0.703
age_grp,-0.0821,0.059,-1.381,0.167,-0.199,0.034
pneumonia,-0.2808,0.113,-2.485,0.013,-0.502,-0.059
pregnancy,6.9089,0.402,17.198,0.000,6.121,7.696
diabetes,0.5945,0.101,5.892,0.000,0.397,0.792
copd,0.4816,0.199,2.416,0.016,0.091,0.872
asthma,-1.2315,0.299,-4.123,0.000,-1.817,-0.646
inmsupr,0.5499,0.233,2.355,0.019,0.092,1.007

0,1,2,3
Omnibus:,9478.544,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48707.532
Skew:,1.698,Prob(JB):,0.0
Kurtosis:,8.794,Cond. No.,79.3


- 부연설명:
- R-squared (uncentered):	0.689 으로 약 69% 모델 적합도
- x 값이 1 증가할때 마다 (정규화 하지 않았음) 해당 coef 만큼 y 값 변동
- 즉 폐렴(pneumonia)가 1 증가 ( 1 : 예 -> 2: 아니오 ) 하면 예상 사망일이 -.28일 만큼 짧아짐
- 임신 (pregnancy)가 아닐수록 (1 -> 2) 예상 사망일이 6.9일 만큼 길어짐
- 남자 (sex : 2) 일수록 예상 사망일이 0.23일 늘어남 등

In [74]:
pred1 = FD.predict(xts)
d = { 'True': tts, 'Predicted': pred1.round(), 'error': tts - pred1.round()}

pred_diff1 = pd.DataFrame(data=d).reset_index()
pred_diff1 = pred_diff1.drop(['index'], axis=1)

In [75]:
pred_diff1

Unnamed: 0,True,Predicted,error
0,5,10.0,-5.0
1,5,11.0,-6.0
2,17,12.0,5.0
3,13,12.0,1.0
4,14,11.0,3.0
...,...,...,...
6476,2,9.0,-7.0
6477,8,11.0,-3.0
6478,11,14.0,-3.0
6479,25,14.0,11.0
