In [149]:
import numpy as np
import pandas as pd
import time

# Reference timestamp taken before any data is loaded; main() subtracts it
# from the current time to report elapsed seconds.
start_time = time.time()

# Training and validation splits, read from CSV files in the working directory.
raw_train_data = pd.read_csv('covid_train.csv')
raw_valid_data = pd.read_csv('covid_valid.csv')

In [150]:
def preprocessing(data):
  """Drop the date/age columns, mask the 97/98/99 codes and bucket ages.

  Args:
    data: DataFrame containing at least the columns 'age', 'entry_date'
      and 'date_symptoms'. It is left unmodified.

  Returns:
    A new DataFrame without 'entry_date', 'date_symptoms' and 'age', in
    which every value equal to 97, 98 or 99 is replaced by the string
    'unknown', plus a categorical 'age_range' column with the labels
    '0s', '30s', '60s' and '90s'.
  """
  # drop() returns a new frame, so nothing below writes into the caller's data.
  new_data = data.drop(['entry_date', 'date_symptoms', 'age'], axis=1)

  # A single frame-level replace; per-column replace(..., inplace=True) on a
  # column selection is chained assignment and is not guaranteed to write back.
  new_data = new_data.replace([97, 98, 99], 'unknown')

  # include_lowest=True keeps age 0 in the first bucket. With the default
  # right-closed bins the first interval is (0, 30], so age 0 became NaN.
  # Ages outside 0-120 (or missing) still map to NaN.
  new_data['age_range'] = pd.cut(
      x=data['age'],
      bins=[0, 30, 60, 90, 120],
      labels=['0s', '30s', '60s', '90s'],
      include_lowest=True,
  )
  return new_data

In [151]:
def NB(raw_data):
  """Fit a categorical naive Bayes model on the preprocessed data.

  Class 0 is every row whose 'date_died' equals '9999-99-99'; class 1 is
  every other row.

  Args:
    raw_data: DataFrame accepted by preprocessing(); after preprocessing it
      must contain a 'date_died' column. Every other remaining column is
      treated as a categorical feature.

  Returns:
    (prior_p, cd) where
      prior_p: numpy array of length 2 with the class priors
        [P(class 0), P(class 1)].
      cd: dict mapping each feature name to a DataFrame with index [0, 1]
        (the class) and one column per observed category, holding the
        add-one smoothed estimate of P(category | class).
  """
  data = preprocessing(raw_data)

  survived = (data['date_died'] == '9999-99-99').to_numpy()
  class_sizes = np.array([survived.sum(), (~survived).sum()], dtype=float)

  prior_p = np.zeros(2)
  prior_p[0] = class_sizes[0] / len(data)
  prior_p[1] = 1 - prior_p[0]

  x = data.drop(['date_died'], axis=1)
  cd = {}
  for col_name in x.columns:
    # Work on plain objects so categorical and mixed int/str columns behave alike.
    column = x[col_name].astype(object)
    # Categories are computed once per column, in order of first appearance.
    categories = list(pd.unique(column))

    # One vectorised count per class. dropna=False keeps a missing value as
    # its own category with its real count (an == comparison never matches NaN).
    counts = np.vstack([
        column[mask].value_counts(dropna=False)
                    .reindex(categories, fill_value=0)
                    .to_numpy()
        for mask in (survived, ~survived)
    ]).astype(float)

    # Add-one smoothing: the denominator must grow by the number of
    # categories, otherwise each class's probabilities sum to more than 1.
    smoothed = (counts + 1) / (class_sizes[:, None] + len(categories))
    cd[col_name] = pd.DataFrame(smoothed, columns=categories, index=[0, 1])

  return prior_p, cd

# Fit on the training split. The priors and conditional tables are kept as
# module-level names.
prior_p, cd = NB(raw_train_data)

In [171]:
# Scratch check: preprocess the training frame again and display its 'sex' column.
# cd['icu'].iloc[0,:]
data = preprocessing(raw_train_data)
data['sex']

0         1
1         2
2         1
3         2
4         1
         ..
283296    1
283297    2
283298    2
283299    1
283300    1
Name: sex, Length: 283301, dtype: int64

In [194]:
def predict(raw_test_data, priors=None, tables=None):
  """Classify every row with the fitted naive Bayes model and score it.

  Args:
    raw_test_data: DataFrame accepted by preprocessing(); after
      preprocessing it must contain a 'date_died' column. It is not
      modified by this function.
    priors: optional length-2 sequence [P(class 0), P(class 1)]. Defaults
      to the module-level prior_p.
    tables: optional dict mapping feature name to a DataFrame indexed by
      class [0, 1] with one column per category. Defaults to the
      module-level cd.

  Returns:
    (result, accuracy) where
      result: 1-D float numpy array with the predicted class per row
        (0.0 or 1.0; ties go to 1.0).
      accuracy: fraction of rows whose prediction equals the true class,
        or NaN when there are no rows.

  Raises:
    KeyError: if a feature column has no entry in tables.
  """
  if priors is None:
    priors = prior_p
  if tables is None:
    tables = cd

  test_data = preprocessing(raw_test_data)

  # Derive the labels in one comparison. Two successive masked assignments
  # would turn every label into 1, because the 0 written first also
  # satisfies `!= '9999-99-99'`.
  y_true = (test_data['date_died'] != '9999-99-99').to_numpy().astype(int)

  x_test = test_data.drop(['date_died'], axis=1)
  n_rows = x_test.shape[0]

  # Accumulate log-probabilities: the ordering is the same as for the
  # product, without the risk of underflow across many features.
  with np.errstate(divide='ignore'):
    log_p0 = np.full(n_rows, np.log(priors[0]), dtype=float)
    log_p1 = np.full(n_rows, np.log(priors[1]), dtype=float)

    for col_name in x_test.columns:
      log_table = np.log(tables[col_name])
      # Plain objects so categorical columns map to floats like any other.
      values = x_test[col_name].astype(object)
      # A value with no column in the table contributes the same factor
      # (log 1 = 0) to both classes instead of raising KeyError.
      log_p0 += values.map(log_table.iloc[0]).fillna(0.0).to_numpy(dtype=float)
      log_p1 += values.map(log_table.iloc[1]).fillna(0.0).to_numpy(dtype=float)

  result = np.where(log_p0 > log_p1, 0.0, 1.0)

  if n_rows:
    accuracy = np.count_nonzero(result == y_true) / n_rows
  else:
    accuracy = float('nan')

  return result,accuracy

In [188]:
# Predict and score the validation split.
test_result,test_accuracy = predict(raw_valid_data)


In [191]:
# Display the predicted classes for the validation split.
test_result

array([1., 0., 0., ..., 1., 1., 1.])

In [186]:
# Scratch check: np.append without an axis flattens its inputs, so appending
# scalars to an empty (0, 1) array produces a 1-D float array.
result = np.append(np.append(np.empty((0,1)), 1), 2)
result

array([1., 2.])

In [167]:
# Predict and score the training split (the same data NB was fitted on).
train_result,train_accuracy = predict(raw_train_data)

2

In [None]:
def main():
  """Print the seconds elapsed since the module-level start_time was taken."""
  elapsed_seconds = time.time() - start_time
  print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Run main() only when this file is executed as the main module.
if __name__ == "__main__":
  main()