In [318]:
import numpy as np
import pandas as pd
import time

start_time = time.time()

raw_train_data = pd.read_csv('covid_train.csv')
raw_valid_data = pd.read_csv('covid_valid.csv')

raw_train_data = raw_train_data.iloc[0:20,]
raw_valid_data = raw_valid_data.iloc[0:20,]
raw_train_data

Unnamed: 0,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,1,1,22-06-2020,16-06-2020,9999-99-99,97,2,23,2,2,2,2,2,2,2,2,2,2,2,1,2,97
1,2,1,18-06-2020,12-06-2020,9999-99-99,97,2,54,97,2,2,2,2,2,2,2,2,2,2,1,2,97
2,1,1,27-06-2020,24-06-2020,9999-99-99,97,2,33,2,2,2,2,2,2,2,2,2,2,2,1,3,97
3,2,1,21-06-2020,16-06-2020,9999-99-99,97,2,53,97,2,2,2,2,2,2,2,2,2,2,99,2,97
4,1,1,30-05-2020,23-05-2020,9999-99-99,97,2,34,2,2,2,2,2,2,2,2,2,2,1,1,2,97
5,2,2,14-04-2020,01-04-2020,17-04-2020,2,1,56,97,2,2,1,1,2,1,2,2,2,2,2,1,2
6,1,2,24-05-2020,16-05-2020,9999-99-99,1,1,0,2,2,2,2,1,2,2,2,2,2,2,1,2,1
7,1,1,01-06-2020,01-06-2020,9999-99-99,97,2,32,2,2,2,2,2,2,2,2,2,2,2,2,1,97
8,2,1,26-05-2020,18-05-2020,9999-99-99,97,2,47,97,2,2,2,2,2,2,2,2,2,2,2,1,97
9,1,1,02-06-2020,26-05-2020,9999-99-99,97,2,23,2,2,2,2,2,2,2,2,2,2,2,1,2,97


In [300]:
def preprocessing(data):
  #age
  data['age_range'] = pd.cut(x=data['age'], bins=[-1,30,60,90,120], labels=['0s','30s','60s','90s']) 
  new_data = data.drop(['entry_date', 'date_symptoms','age'], axis=1)

  #date duration

  #symtoms
  for col_name in new_data.columns:
    new_data[col_name].replace(99, 'unknown',inplace=True)
    new_data[col_name].replace(98, 'unknown',inplace=True)
    new_data[col_name].replace(97, 'unknown',inplace=True)
  return new_data

In [301]:
data = preprocessing(raw_train_data)
data['age_range']

0      0s
1     30s
2     30s
3     30s
4     30s
     ... 
95    60s
96    30s
97     0s
98    30s
99    30s
Name: age_range, Length: 100, dtype: category
Categories (4, object): ['0s' < '30s' < '60s' < '90s']

In [302]:
def NB(raw_data):
  data = preprocessing(raw_data)

  prior_p = np.zeros(2)
  prior_p[0] = sum(data['date_died']=='9999-99-99')/len(data['date_died'])
  prior_p[1] = 1 - prior_p[0]

  x = data.drop(['date_died'], axis=1)
  cd = {}
  for col_name in x.columns:
    feature_cd = np.zeros((2,len(set(x[col_name]))))
    for n in range(len(list(set(x[col_name])))):
      feature_n_survive = x.loc[(x[col_name]==list(set(x[col_name]))[n])&(data['date_died']=='9999-99-99'),].shape[0]
      feature_cd[0,n] = (feature_n_survive+0.01) / sum(data['date_died']=='9999-99-99')
      feature_n_died = x.loc[(x[col_name]==list(set(x[col_name]))[n])&(data['date_died']!='9999-99-99'),].shape[0]
      feature_cd[1,n] = (feature_n_died+0.01) / sum(data['date_died']!='9999-99-99')
      if col_name == "age_range":
        print("feature_n_survive is ", feature_n_survive,". age:",list(set(x[col_name]))[n])
        print("sum for suvivaer is ", sum(data['date_died']=='9999-99-99'),)
        print("feature_n_death is ", feature_n_died)
        print("sum for death ", sum(data['date_died']!='9999-99-99'))

    feature_cd = pd.DataFrame(feature_cd,columns=list(set(x[col_name])),index=[0,1])
    cd[col_name] = feature_cd

  return prior_p, cd

prior_p, cd = NB(raw_train_data)

feature_n_survive is  57 . age: 30s
sum for suvivaer is  95
feature_n_death is  2
sum for death  5
feature_n_survive is  10 . age: 60s
sum for suvivaer is  95
feature_n_death is  2
sum for death  5
feature_n_survive is  28 . age: 0s
sum for suvivaer is  95
feature_n_death is  1
sum for death  5


In [303]:
cd['copd']
# cd['icu'].iloc[0,:]
# data = preprocessing(raw_train_data)

# data['sex']

Unnamed: 0,1,2
0,0.010632,0.989579
1,0.002,1.002


In [307]:
def predict(raw_test_data):
  test_data = preprocessing(raw_test_data)

  test_data.loc[test_data['date_died']!='9999-99-99','date_died']=1
  test_data.loc[test_data['date_died']=='9999-99-99','date_died']=0

  x_test = test_data.drop(['date_died'], axis=1)
  result = np.empty((0,x_test.shape[0]))
  for row in range(x_test.shape[0]):
    p0 = prior_p[0]
    p1 = prior_p[1]
    for col_name in x_test.columns:
      p0 = p0*cd[col_name].iloc[0,:][x_test.loc[row,col_name]]
      p1 = p1*cd[col_name].iloc[1,:][x_test.loc[row,col_name]]
    
    if p0 > p1:
      result= np.append(result,0)
    else:
      result= np.append(result,1)
  
  accuracy = np.count_nonzero(result == test_data.loc[:,'date_died']) / x_test.shape[0]

  return result,accuracy

In [305]:
test_result,test_accuracy = predict(raw_valid_data)
test_data = preprocessing(raw_valid_data)
test_data.loc[test_data['date_died']=='9999-99-99','date_died']=0
test_data.loc[test_data['date_died']!='9999-99-99','date_died']=1

x_test = test_data.drop(['date_died'], axis=1)
result = np.empty((0,x_test.shape[0]))

# prior_p[0]*cd['copd'].iloc[0,:][x_test.loc[6,'copd']]
# x_test.loc[6,'copd']
prior_p[0]*cd['copd']
# for row in range(x_test.shape[0]):
#   p0 = prior_p[0]
#   p1 = prior_p[1]
#   for col_name in x_test.columns:
#     p0 = p0*cd[col_name].iloc[0,:][x_test.loc[row,col_name]]
#     p1 = p1*cd[col_name].iloc[1,:][x_test.loc[row,col_name]]
    
#   if p0 > p1:
#     result= np.append(result,0)
#   else:
#     result= np.append(result,1)

Unnamed: 0,1,2
0,0.0101,0.9401
1,0.0019,0.9519


In [191]:
test_accuracy

array([1., 0., 0., ..., 1., 1., 1.])

In [186]:
result = np.empty((0,1))
result = np.append(result,1)
result = np.append(result,2)
result

array([1., 2.])

In [311]:
train_result,train_accuracy = predict(raw_train_data)

In [312]:
train_accuracy

0.05

In [313]:
train_result

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [317]:
test_data = preprocessing(raw_train_data)
test_data.loc[test_data['date_died']!='9999-99-99','date_died']=1
test_data.loc[test_data['date_died']=='9999-99-99','date_died']=0

test_data



Unnamed: 0,sex,patient_type,date_died,intubed,pneumonia,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu,age_range
0,1,1,0,unknown,2,2,2,2,2,2,2,2,2,2,2,2,1,2,unknown,0s
1,2,1,0,unknown,2,unknown,2,2,2,2,2,2,2,2,2,2,1,2,unknown,30s
2,1,1,0,unknown,2,2,2,2,2,2,2,2,2,2,2,2,1,3,unknown,30s
3,2,1,0,unknown,2,unknown,2,2,2,2,2,2,2,2,2,2,unknown,2,unknown,30s
4,1,1,0,unknown,2,2,2,2,2,2,2,2,2,2,2,1,1,2,unknown,30s
5,2,2,1,2,1,unknown,2,2,1,1,2,1,2,2,2,2,2,1,2,30s
6,1,2,0,1,1,2,2,2,2,1,2,2,2,2,2,2,1,2,1,0s
7,1,1,0,unknown,2,2,2,2,2,2,2,2,2,2,2,2,2,1,unknown,30s
8,2,1,0,unknown,2,unknown,2,2,2,2,2,2,2,2,2,2,2,1,unknown,30s
9,1,1,0,unknown,2,2,2,2,2,2,2,2,2,2,2,2,1,2,unknown,0s


In [None]:
def main(): 

  print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
if __name__=="__main__": 
    main()