In [54]:
# def synthetic_prediction_data_generate + ADD_shift
# def synthetic_ground_truth_generate
# def file preprocess
# def evaluation window preprocess
# def save_as_CSV (done)
# class score (done)


In [55]:
#imports
import pandas as pd
import numpy as np
import os, sys, itertools as it, random
import datetime as dt
from datetime import datetime 
from sklearn.metrics import precision_score, accuracy_score, recall_score, balanced_accuracy_score
from sklearn.metrics import f1_score, roc_auc_score, log_loss, roc_curve, brier_score_loss

#constants
# NOTE(review): presumably the mean and standard deviation of a normal
# distribution; neither name is referenced in the cells visible here —
# confirm their use before relying on or removing them.
mu = 0
sigma = 1

<p> To convert dates from mm/dd/yyyy to integer yyyymm format, use <code>&lt;date&gt;.dt.strftime("%Y%m").astype(int)</code>, where <code>&lt;date&gt;</code> is a pandas datetime Series. </p>

<H3> File Configurations </H3>

In [65]:
def load_configurations():
    """Build the prediction and referral file configurations and publish them.

    The two dictionaries are bound to the module-level names ``c_p`` and
    ``c_r`` (the data-generation functions in this notebook read them as
    globals) and are also returned so callers can use them explicitly.

    Returns:
        tuple: ``(c_p, c_r)`` where ``c_p`` describes the prediction file and
        ``c_r`` describes the referral file.
    """
    # Without this declaration the dictionaries are locals that vanish when the
    # function returns, leaving every reader of c_p / c_r with a NameError.
    global c_p, c_r

    #configurations of prediction file [column header, format, range etc.]
    c_p={
            'dir': '../newdata/predictions/',
            'file': 'prediction_shift_0.csv',
            'number_of_samples': 500,
            'id_column': 'person_id',
            'id_format': int,
            'date_column': 'Date',
            'start_date': '2016-01-01',
            'end_date': '2018-01-01',
            'model_columns': ['lin_reg', 'rand_forest', 'xg_boost', 'sgmm'],
            # NOTE(review): the synthetic model columns are filled with floats
            # from np.random.random; confirm whether int is the intended format.
            'model_format': int
        }

    #configurations of referral file [column header, format, range etc.]
    c_r={
            'dir': '../newdata/referrals/',
            'file': 'referral.csv',
            'number_of_samples': 25,
            'start_date': '2016-01-01',
            'end_date': '2018-01-01',
            'id_column': 'person_id',
            'id_format': int,
            'date_column': 'Date'
        }

    return c_p, c_r

<H3> Score Class </H3>

In [57]:
class Score:
    """Namespace of thin wrappers around scikit-learn classification metrics.

    Every method is a static method, so it can be called either on the class
    (``Score.get_precision(y_true, y_predict)``) or on an instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def get_precision(y_true, y_predict, average='binary'):
        """Return ``precision_score`` for the given labels.

        ``average`` is forwarded by keyword; passed positionally it would not
        reach the ``average`` parameter of the scikit-learn function.
        """
        return precision_score(y_true, y_predict, average=average)

    @staticmethod
    def get_recall(y_true, y_predict, average='binary'):
        """Return ``recall_score`` for the given labels (``average`` by keyword)."""
        return recall_score(y_true, y_predict, average=average)

    @staticmethod
    def get_accuracy(y_true, y_predict):
        """Return ``accuracy_score`` for the given labels."""
        return accuracy_score(y_true, y_predict)

    @staticmethod
    def get_balanced_accuracy(y_true, y_predict):
        """Return ``balanced_accuracy_score`` for the given labels."""
        return balanced_accuracy_score(y_true, y_predict)

    @staticmethod
    def get_f1_score(y_true, y_predict, average='binary'):
        """Return ``f1_score`` for the given labels, honouring ``average``."""
        return f1_score(y_true, y_predict, average=average)

    @staticmethod
    def get_roc_auc_score(y_true, y_predict):
        """Return ``roc_auc_score``; ``y_predict`` is passed through unchanged."""
        return roc_auc_score(y_true, y_predict)

    @staticmethod
    def get_log_loss(y_true, y_predict):
        """Return ``log_loss``; ``y_predict`` is passed through unchanged."""
        return log_loss(y_true, y_predict)

    @staticmethod
    def get_brier_score_loss(y_true, y_prob):
        """Return ``brier_score_loss`` for binary truth and model probabilities.

        Example inputs: ``y_true = [1, 0, 0]`` (ground truth) and
        ``y_prob = [0.8, 0.3, 0.5]`` (probabilities from a model).
        """
        return brier_score_loss(y_true, y_prob)
    
        

<H3> HELPER FUNCTIONS </H3>

In [58]:
def dataframe_to_csv(data, file, append=True):
    """Write a DataFrame to a CSV file without the index column.

    Args:
        data: pandas DataFrame to write.
        file: path of the target CSV file, e.g. '/relative_path/to/file'.
        append: when True and ``file`` already exists, the rows are appended
            without repeating the header; otherwise the file is (re)written
            with a header row.
    """
    # Create the target directory on demand so a first run does not fail.
    parent = os.path.dirname(file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    if append and os.path.exists(file):
        # Let pandas open the file itself: a text handle opened without
        # newline='' can produce doubled line endings on some platforms.
        data.to_csv(file, mode='a', header=False, index=False)
    else:
        data.to_csv(file, index=False)

In [59]:
def generate_random_date(start_date, end_date, iteration):
    """Generate a reproducible list of pseudo-random dates within a range.

    Args:
        start_date: first possible date as a 'YYYY-MM-DD' string (inclusive).
        end_date: end of the range as a 'YYYY-MM-DD' string (exclusive).
        iteration: number of dates to generate.

    Returns:
        list of ``iteration`` date strings in 'YYYY-MM-DD' format. The i-th
        date is drawn from a generator seeded with ``i``, so repeated calls
        with the same arguments return the same list.

    Raises:
        ValueError: if a date string does not match 'YYYY-MM-DD', or if
            ``end_date`` is not after ``start_date`` while ``iteration`` > 0.
    """
    # Parse the bounds once; they do not change between iterations.
    s_date = datetime.strptime(start_date, '%Y-%m-%d')
    e_date = datetime.strptime(end_date, '%Y-%m-%d')
    days_between_dates = (e_date - s_date).days

    date_list = []
    for seed in range(iteration):
        # A private generator yields the same draw as reseeding the module
        # generator with this seed, without clobbering global random state.
        random_number_of_days = random.Random(seed).randrange(days_between_dates)
        random_date = s_date + dt.timedelta(days=random_number_of_days)
        date_list.append(random_date.strftime('%Y-%m-%d'))
    return date_list
    

<H3> Generate Synthetic Data </H3>

In [60]:
def generate_synthetic_prediction_data():
    """Write a synthetic prediction CSV described by the global ``c_p`` config.

    One row is produced for every (person id, month-start date) pair, and each
    model column is filled with uniform random numbers drawn after seeding
    NumPy's global generator with the model's position (0, 1, 2, ...). The
    file at ``c_p['dir'] + c_p['file']`` is overwritten, not appended to.
    """
    person_ids = pd.Series(range(0, c_p['number_of_samples']))
    month_starts = pd.date_range(c_p['start_date'], c_p['end_date'], freq='MS')

    # Cartesian product: every person paired with every month-start date.
    rows = [(person, month) for person in person_ids for month in month_starts]
    frame = pd.DataFrame(rows, columns=[c_p['id_column'], c_p['date_column']])

    for model_seed, model_name in enumerate(c_p['model_columns']):
        np.random.seed(model_seed)
        frame[model_name] = pd.Series(np.random.random(len(frame)))

    output_path = c_p['dir'] + c_p['file']
    dataframe_to_csv(frame, output_path, False)

In [61]:
def generate_synthetic_ground_truth_data():
    """Write a synthetic referral CSV described by the global ``c_r`` config.

    Person ids are drawn (with NumPy's global generator seeded to 0) from
    ``[0, c_p['number_of_samples'])``, so an id may repeat; they are sorted
    ascending and paired row-by-row with dates from ``generate_random_date``.
    The file at ``c_r['dir'] + c_r['file']`` is overwritten, not appended to.
    """
    np.random.seed(0)
    drawn_ids = np.random.randint(0, c_p['number_of_samples'], c_r['number_of_samples'])
    person_ids = pd.Series(list(drawn_ids)).sort_values(ascending=True).reset_index(drop=True)

    referral_dates = pd.Series(
        list(generate_random_date(c_r['start_date'], c_r['end_date'], c_r['number_of_samples']))
    )

    frame = pd.DataFrame({
        c_r['id_column']: person_ids,
        c_r['date_column']: referral_dates,
    })
    output_path = c_r['dir'] + c_r['file']
    dataframe_to_csv(frame, output_path, False)


In [67]:
if __name__ == "__main__":
    # Run the steps in order: load the configuration, then write the
    # referral (ground-truth) CSV, then the prediction CSV.
    for step in (load_configurations,
                 generate_synthetic_ground_truth_data,
                 generate_synthetic_prediction_data):
        step()
    