In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import scipy.stats
import time
import os

## TODO: handle the r0.06 and r0.11 cases (SMOTE in Sherlock)

## Define helper functions

In [31]:
def run(df, df_test, model, n_features=2):
    """Fit one model per treatment arm and score the resulting treatment policy.

    For every distinct value ``t`` in ``df['t']`` the model is refit on that
    arm's rows (leading ``n_features`` columns as features, ``y`` as target)
    and its predictions for ``df_test`` are stored in a new ``'pred<t>'``
    column. The predicted treatment is 1 when ``pred1 > pred0`` and 0
    otherwise; the reference treatment is 1 when ``y1 > y0`` and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows. Must contain the columns ``t`` and ``y``; the features
        are taken positionally from the first ``n_features`` columns.
    df_test : pandas.DataFrame
        Test rows. Must contain ``y0`` and ``y1``; features are taken from the
        first ``n_features`` columns. **Modified in place**: the columns
        ``pred<t>``, ``t_opt`` and ``t_pred`` are added.
    model : object
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. The same instance is refit for every arm.
    n_features : int, default 2
        Number of leading columns used as features.

    Returns
    -------
    float
        Fraction of test rows whose predicted treatment equals the reference
        treatment.

    Notes
    -----
    The arm labels must stringify to ``'0'`` and ``'1'`` so that the columns
    ``pred0`` and ``pred1`` exist after fitting.
    NOTE(review): a later cell in this notebook defines another ``run`` with
    the same name (three arms, ``predict_proba``); whichever cell is executed
    last is the one ``driver`` calls.
    """
    # Take the feature slice once, before any prediction column is appended.
    X_test = df_test.iloc[:, :n_features]

    for t in df['t'].unique():
        arm = df[df['t'] == t]
        fitted = model.fit(arm.iloc[:, :n_features], arm['y'])
        df_test['pred' + str(t)] = fitted.predict(X_test)

    ## EVALUATE PERFORMANCE
    # Column-wise comparisons replace the row-wise apply; ties map to 0.
    df_test['t_opt'] = (df_test['y1'] > df_test['y0']).astype(int)
    df_test['t_pred'] = (df_test['pred1'] > df_test['pred0']).astype(int)
    return (df_test['t_opt'] == df_test['t_pred']).sum() / len(df_test)

In [20]:
def _positive_class_proba(fitted, X):
    """Return P(y == 1) for each row of ``X`` from a fitted classifier.

    The column is looked up through ``fitted.classes_`` instead of assuming it
    is column 1, so a classifier that saw only one class during ``fit`` still
    works: the probability is 0 when class 1 was never seen. When the
    estimator has no ``classes_`` attribute, the classes are assumed to be
    ``(0, 1)`` in that order.
    """
    proba = np.asarray(fitted.predict_proba(X))
    classes = list(getattr(fitted, 'classes_', (0, 1)))
    if 1 in classes:
        return proba[:, classes.index(1)]
    return np.zeros(proba.shape[0])


def run(df, df_test, model, n_features=17):
    """Fit one classifier per treatment arm and score the resulting policy.

    For every distinct value ``t`` in ``df['t']`` the model is refit on that
    arm's rows (leading ``n_features`` columns as features, ``y`` as target)
    and the predicted probability of ``y == 1`` for each test row is stored in
    a new ``'pred<t>'`` column. The predicted treatment is the arm (1 or 2)
    whose probability is strictly greater than both others, else 0. The
    reference treatment is 1 when ``y1 == 1``, else 2 when ``y2 == 1``, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows. Must contain the columns ``t`` and ``y``; the features
        are taken positionally from the first ``n_features`` columns.
    df_test : pandas.DataFrame
        Test rows. Must contain ``y1`` and ``y2``; features are taken from the
        first ``n_features`` columns. **Modified in place**: the columns
        ``pred<t>``, ``t_opt`` and ``t_pred`` are added.
    model : object
        Classifier exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict_proba(X)``. The same instance is refit for every arm.
    n_features : int, default 17
        Number of leading columns used as features.

    Returns
    -------
    float
        Fraction of test rows whose predicted treatment equals the reference
        treatment.

    Notes
    -----
    The arm labels must stringify to ``'0'``, ``'1'`` and ``'2'`` so that the
    columns ``pred0``, ``pred1`` and ``pred2`` exist after fitting.
    """
    # Take the feature slice once, before any prediction column is appended.
    X_test = df_test.iloc[:, :n_features]

    # TODO: the outcome can be heavily imbalanced within an arm (e.g. y=1 is
    # the minority class for 0.1 and y=0 for 0.85). Resampling with SMOTE
    # (k_neighbors capped at minority_count - 1) was tried here and is
    # currently disabled.
    for t in df['t'].unique():
        arm = df[df['t'] == t]
        fitted = model.fit(arm.iloc[:, :n_features], arm['y'])
        df_test['pred' + str(t)] = _positive_class_proba(fitted, X_test)

    ## EVALUATE PERFORMANCE
    pred0 = df_test['pred0'].to_numpy()
    pred1 = df_test['pred1'].to_numpy()
    pred2 = df_test['pred2'].to_numpy()

    # np.select takes the first matching condition, mirroring the original
    # if / elif / else order; rows matching neither fall back to arm 0.
    df_test['t_opt'] = np.select(
        [(df_test['y1'] == 1).to_numpy(), (df_test['y2'] == 1).to_numpy()],
        [1, 2],
        default=0,
    )
    df_test['t_pred'] = np.select(
        [(pred1 > pred0) & (pred1 > pred2), (pred2 > pred0) & (pred2 > pred1)],
        [1, 2],
        default=0,
    )
    return (df_test['t_opt'] == df_test['t_pred']).sum() / len(df_test)

In [27]:
def driver(model, probs=('0.33',), seeds=(1, 2, 3, 4, 5), datasets=(1, 2, 3, 4, 5),
           data_dir='../../data/Warfarin_v2'):
    """Evaluate ``model`` with ``run`` on every (setting, dataset, seed) CSV pair.

    For each combination the files
    ``<data_dir>/seed<seed>/data_train_<prob>_<dataset>.csv`` and
    ``<data_dir>/seed<seed>/data_test_<prob>_<dataset>.csv`` are loaded and
    passed to ``run(df, df_test, model)``.

    Parameters
    ----------
    model : object
        Estimator forwarded unchanged to ``run``; the same instance is reused
        for every file pair.
    probs : iterable of str, default ('0.33',)
        Setting labels embedded in the file names. The original code also
        listed 'r0.11' and 'r0.06' but had them commented out.
    seeds : iterable of int, default (1, 2, 3, 4, 5)
        Seed sub-directories to read from.
    datasets : iterable of int, default (1, 2, 3, 4, 5)
        Dataset indices embedded in the file names.
    data_dir : str, default '../../data/Warfarin_v2'
        Directory containing the ``seed<k>`` sub-directories.

    Returns
    -------
    dict
        Maps each entry of ``probs`` to a list of scores returned by ``run``,
        ordered by dataset (outer loop) and then seed (inner loop).
    """
    opt_policy_dic = {}
    for prob in probs:
        scores = []
        for dataset in datasets:
            # File names depend only on (prob, dataset); build them once per dataset.
            train_name = 'data_train_' + str(prob) + '_' + str(dataset) + '.csv'
            test_name = 'data_test_' + str(prob) + '_' + str(dataset) + '.csv'
            for seed in seeds:
                seed_dir = os.path.join(data_dir, 'seed' + str(seed))
                df = pd.read_csv(os.path.join(seed_dir, train_name))
                df_test = pd.read_csv(os.path.join(seed_dir, test_name))
                scores.append(run(df, df_test, model))
        opt_policy_dic[prob] = scores
    return opt_policy_dic

## LRRF (best models we have)

In [2]:
# Score the LRRF predictions that are already stored in the test CSVs: for
# every (setting, dataset, seed) combination, compare the treatment with the
# highest `lrrf<k>` value against the reference treatment taken from `y1`/`y2`.
probs = ['0.33', 'r0.06', 'r0.11']
seeds = [1, 2, 3, 4, 5]
datasets = [1, 2, 3, 4, 5]


def lrrf_policy_accuracy(df_test):
    """Return the share of rows where the LRRF-chosen treatment is the reference one.

    Adds two columns to ``df_test`` in place:

    * ``t_opt``  - 1 when ``y1 == 1``, else 2 when ``y2 == 1``, else 0.
    * ``t_pred`` - 1 or 2 when that arm's ``lrrf<k>`` value is strictly greater
      than both others, else 0.
    """
    lrrf0 = df_test['lrrf0'].to_numpy()
    lrrf1 = df_test['lrrf1'].to_numpy()
    lrrf2 = df_test['lrrf2'].to_numpy()

    # np.select takes the first matching condition, which reproduces the
    # if / elif / else order of a row-wise implementation.
    df_test['t_opt'] = np.select(
        [(df_test['y1'] == 1).to_numpy(), (df_test['y2'] == 1).to_numpy()],
        [1, 2],
        default=0,
    )
    df_test['t_pred'] = np.select(
        [(lrrf1 > lrrf0) & (lrrf1 > lrrf2), (lrrf2 > lrrf0) & (lrrf2 > lrrf1)],
        [1, 2],
        default=0,
    )
    return (df_test['t_opt'] == df_test['t_pred']).sum() / len(df_test)


opt_policy_dic = {}
for prob in probs:
    buffer = []
    for dataset in datasets:
        file_name_test = 'data_test_' + str(prob) + '_' + str(dataset) + '.csv'
        for seed in seeds:
            # ----- CHANGE THE FILE PATH -----
            file_path = '../../data/Warfarin_v2/rf_balance_proba_white/seed' + str(seed) + '/'
            # Only the test file is needed here; the matching training CSV was
            # previously loaded on every iteration but never used.
            df_test = pd.read_csv(file_path + file_name_test)
            buffer.append(lrrf_policy_accuracy(df_test))
    opt_policy_dic[prob] = buffer

1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386
1386


KeyboardInterrupt: 

In [15]:
# Turn the dict of score lists into a frame: one column per key of opt_policy_dic.
lrrf = pd.DataFrame(opt_policy_dic)

In [16]:
# Write the LRRF scores to disk without the row index.
lrrf.to_csv('../results/RC/warfarin/lrrf.csv', index=False)

### LogisticRegression

In [26]:
# Evaluate logistic regression through driver(); the returned dict is the cell output.
# NOTE(review): max_iter=10000 is presumably set to avoid solver convergence warnings - confirm.
driver(LogisticRegression(max_iter=10000))

------ SEED --------1
------ DATASET --------1
PROB0.33
------ SEED --------2
------ DATASET --------1
PROB0.33
------ SEED --------3
------ DATASET --------1
PROB0.33
------ SEED --------4
------ DATASET --------1
PROB0.33
------ SEED --------5
------ DATASET --------1
PROB0.33
------ SEED --------1
------ DATASET --------2
PROB0.33
------ SEED --------2
------ DATASET --------2
PROB0.33
------ SEED --------3
------ DATASET --------2
PROB0.33
------ SEED --------4
------ DATASET --------2
PROB0.33
------ SEED --------5
------ DATASET --------2
PROB0.33
------ SEED --------1
------ DATASET --------3
PROB0.33
------ SEED --------2
------ DATASET --------3
PROB0.33
------ SEED --------3
------ DATASET --------3
PROB0.33
------ SEED --------4
------ DATASET --------3
PROB0.33
------ SEED --------5
------ DATASET --------3
PROB0.33
------ SEED --------1
------ DATASET --------4
PROB0.33
------ SEED --------2
------ DATASET --------4
PROB0.33
------ SEED --------3
------ DATASET --------4
P

{'0.33': [0.8838383838383839,
  0.8961038961038961,
  0.8744588744588745,
  0.8852813852813853,
  0.8845598845598845,
  0.8802308802308803,
  0.8968253968253969,
  0.8831168831168831,
  0.8795093795093795,
  0.8961038961038961,
  0.8975468975468975,
  0.8953823953823954,
  0.8953823953823954,
  0.8910533910533911,
  0.9033189033189033,
  0.8780663780663781,
  0.886002886002886,
  0.8896103896103896,
  0.8831168831168831,
  0.886002886002886,
  0.8903318903318903,
  0.8997113997113997,
  0.8946608946608947,
  0.9018759018759018,
  0.9033189033189033]}

### DecisionTree

In [29]:
# Evaluate a decision tree with default hyper-parameters through driver().
# NOTE(review): no random_state is passed, so scores may differ between runs - consider fixing a seed.
driver(DecisionTreeClassifier())

{'0.33': [0.8304473304473304,
  0.8333333333333334,
  0.8217893217893217,
  0.8246753246753247,
  0.8412698412698413,
  0.8556998556998557,
  0.8282828282828283,
  0.823953823953824,
  0.8520923520923521,
  0.8391053391053391,
  0.8412698412698413,
  0.8376623376623377,
  0.8196248196248196,
  0.8354978354978355,
  0.8427128427128427,
  0.8326118326118326,
  0.8347763347763347,
  0.8391053391053391,
  0.854978354978355,
  0.841991341991342,
  0.8477633477633477,
  0.8427128427128427,
  0.8391053391053391,
  0.8492063492063492,
  0.8571428571428571]}

### KNN

In [30]:
# Evaluate k-nearest neighbours with default hyper-parameters through driver().
driver(KNeighborsClassifier())

{'0.33': [0.7554112554112554,
  0.7467532467532467,
  0.7453102453102453,
  0.7503607503607503,
  0.753968253968254,
  0.753968253968254,
  0.7655122655122655,
  0.753968253968254,
  0.7626262626262627,
  0.7633477633477633,
  0.7467532467532467,
  0.7604617604617605,
  0.7676767676767676,
  0.7489177489177489,
  0.7640692640692641,
  0.7748917748917749,
  0.7828282828282829,
  0.7698412698412699,
  0.7662337662337663,
  0.7698412698412699,
  0.7691197691197691,
  0.7597402597402597,
  0.7604617604617605,
  0.766955266955267,
  0.7510822510822511]}

### RandomForest

In [31]:
# Evaluate a random forest with default hyper-parameters through driver().
# NOTE(review): no random_state is passed, so scores may differ between runs - consider fixing a seed.
driver(RandomForestClassifier())

{'0.33': [0.9047619047619048,
  0.9011544011544012,
  0.886002886002886,
  0.8997113997113997,
  0.9033189033189033,
  0.8968253968253969,
  0.9069264069264069,
  0.8910533910533911,
  0.8924963924963925,
  0.9105339105339105,
  0.9054834054834054,
  0.9076479076479076,
  0.898989898989899,
  0.9033189033189033,
  0.9069264069264069,
  0.8816738816738817,
  0.8946608946608947,
  0.9047619047619048,
  0.9083694083694084,
  0.8975468975468975,
  0.9083694083694084,
  0.9134199134199135,
  0.9083694083694084,
  0.9148629148629148,
  0.9191919191919192]}