In [None]:
import datetime 
from datetime import timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, plot_roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Load the model data and model parameters

In [None]:
# Directory holding the notebook's data files, and the raw CSV read from it.
base_path = './data_store'
israel_covid_csv = f'{base_path}/israel_covid_large.csv'
df_israel_covid = pd.read_csv(israel_covid_csv)

### Map categorical labels (Hebrew and English) to numeric codes

In [None]:
def gender_map(gender):
    """Encode a Hebrew gender label as an integer code.

    Returns 1 for 'נקבה' (female), 0 for 'זכר' (male), and -1 for any
    other value.
    """
    label_codes = (
        ('נקבה', 1),  # 'Female'
        ('זכר', 0),  # 'Male'
    )
    for label, code in label_codes:
        if gender == label:
            return code
    return -1

In [None]:
def corona_result_map(corona_result):
    """Encode a Hebrew test-result label as an integer code.

    Returns:
        -1 for 'אחר' (other), 0 for 'שלילי' (negative), 1 for 'חיובי'
        (positive), and -1 for any unrecognized or missing value.
    """
    if corona_result == 'אחר':  # 'Other'
        return -1
    if corona_result == 'שלילי':  # 'Negative'
        return 0
    if corona_result == 'חיובי':  # 'Positive'
        return 1
    # Unrecognized values previously returned '' here, which mixed strings
    # into an otherwise integer column and was not caught by a `!= -1`
    # filter. Use the same -1 fallback as the other *_map helpers.
    return -1

In [None]:
def age_60_and_above_map(age_60_and_above):
    """Encode the 'Yes'/'No' age flag as 1/0; any other value becomes -1."""
    return (
        1 if age_60_and_above == 'Yes'
        else 0 if age_60_and_above == 'No'
        else -1
    )

In [None]:
def test_indication_map(test_indication):
    if test_indication == 'Other': return 0
    elif test_indication == 'Abroad': return 1
    elif  test_indication == 'Contact with confirmed': return 2
    else: return -1

In [None]:
# Replace each categorical column with its numeric encoding, in place.
# Note: the columns are overwritten, so re-running this cell on the
# already-encoded values sends every entry to the mappers' fallback code.
for column, mapper in (
    ('gender', gender_map),
    ('corona_result', corona_result_map),
    ('age_60_and_above', age_60_and_above_map),
    ('test_indication', test_indication_map),
):
    df_israel_covid[column] = df_israel_covid[column].apply(mapper)

### Drop rows whose test result or gender is coded -1

In [None]:
# Drop rows whose corona_result or gender code is -1, then sort the
# remaining rows by test_date and renumber the index from 0.
keep_rows = (df_israel_covid['corona_result'] != -1) & (df_israel_covid['gender'] != -1)
df_israel_covid_without_o = (
    df_israel_covid[keep_rows]
    .sort_values(['test_date'])
    .reset_index(drop=True)
)

### Convert the DataFrame to model inputs and labels

In [None]:
# Feature columns first, label column ('corona_result') last: the later
# slicing relies on the label being the final column.
model_columns = [
    'cough',
    'fever',
    'sore_throat',
    'shortness_of_breath',
    'head_ache',
    'age_60_and_above',
    'gender',
    'test_indication',
    'corona_result',
]
raw_datas = df_israel_covid_without_o[model_columns].values

In [None]:
# All columns but the last are model inputs; the last column is the label.
input_datas, gt_datas = raw_datas[:, :-1], raw_datas[:, -1]

In [None]:
def split_train_test(data, test_ratio):
    """Split a sequence into (train, test) without shuffling.

    The first ``int(len(data) * test_ratio)`` items form the test part and
    the remaining items form the train part; the original order is kept.

    Returns:
        A ``(train_data, test_data)`` tuple.
    """
    cut = int(len(data) * test_ratio)
    return data[cut:], data[:cut]

In [None]:
def split_input_gt(non_split_data):
    """Separate a 2-D array into (inputs, ground truth).

    Every column except the last is returned as the inputs; the last
    column is returned as the ground-truth vector.
    """
    return non_split_data[:, :-1], non_split_data[:, -1]

In [None]:
# The leading 20% of rows become the test set; the leading 20% of what is
# left becomes the validation set; everything after that is training data.
not_test_datas, test_datas = split_train_test(raw_datas, test_ratio=0.2)
train_datas, valid_datas = split_train_test(not_test_datas, test_ratio=0.2)

# Separate each partition into its feature matrix and label vector.
(train_input, train_gt), (valid_input, valid_gt), (test_input, test_gt) = map(
    split_input_gt, (train_datas, valid_datas, test_datas)
)

In [None]:
# Re-assigns base_path to the same directory used when the CSV was loaded.
base_path = './data_store'

In [None]:
# Load the train/validation arrays from .npy files under base_path.
# These assignments replace the train_input / train_gt / valid_input /
# valid_gt values produced by split_input_gt earlier in the notebook.
# NOTE(review): presumably these files were saved from an earlier run of
# that same split; confirm they match the data the model was trained on.
train_input = np.load(f'{base_path}/train_input.npy')
train_gt = np.load(f'{base_path}/train_gt.npy')
valid_input = np.load(f'{base_path}/valid_input.npy')
valid_gt = np.load(f'{base_path}/valid_gt.npy')

In [None]:
# Restore the previously saved classifier from disk.
# NOTE(review): joblib files are pickle-based, so only load model files
# that come from a trusted source.
logistic_model_path = './data_store/logistic_israel.pkl'
loaded_logistic_clf = joblib.load(logistic_model_path)

### Check model performance on the validation data

In [None]:
def display_performance(clf, pred, valid_input, valid_gt):
    """Print classification metrics and plot the ROC curve.

    Args:
        clf: Fitted classifier, passed to ``plot_roc_curve``.
        pred: Predicted labels to score against ``valid_gt``.
        valid_input: Feature matrix passed to ``plot_roc_curve``.
        valid_gt: Ground-truth labels.

    Prints accuracy, precision, recall, F1 and the confusion matrix, then
    draws the ROC curve of ``clf`` on ``(valid_input, valid_gt)``.
    """
    pred = np.array(pred)

    # scikit-learn metrics take (y_true, y_pred) in that order. With the
    # arguments reversed, precision and recall are swapped and the
    # confusion matrix is transposed, so pass the ground truth first.
    print('accuracy %.4f' % (accuracy_score(valid_gt, pred)))
    print('precision %.4f' % (precision_score(valid_gt, pred)))
    print('recall %.4f' % (recall_score(valid_gt, pred)))
    print('f1-score %.4f' % (f1_score(valid_gt, pred)))
    # Rows are true labels, columns are predicted labels.
    print(confusion_matrix(valid_gt, pred))
    # NOTE(review): plot_roc_curve was removed in scikit-learn 1.2;
    # RocCurveDisplay.from_estimator is the replacement when upgrading.
    plot_roc_curve(clf, valid_input, valid_gt)

In [None]:
# Score the restored classifier on the validation split and report on it.
pred = loaded_logistic_clf.predict(valid_input)
display_performance(
    clf=loaded_logistic_clf,
    pred=pred,
    valid_input=valid_input,
    valid_gt=valid_gt,
)