In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

from joblib import parallel_backend

import xgboost as xgb

import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import math

import pickle

import itertools
from tqdm import tqdm
import ingest
from scipy import sparse


In [3]:
# Load the cached Outpatient table and keep only the regression target.
# Selecting the column first and then calling dropna() builds a new frame, so the
# object handed back by ingest.get_cache_data is never modified in place.
# NOTE(review): rows with a missing CLM_PMT_AMT are removed here; the feature
# matrices used with this target must have been built from the same filtered
# rows, in the same order -- confirm against the feature-building notebook.
outpatient = (
    ingest.get_cache_data("Outpatient", "Outpatient.pkl")[['CLM_PMT_AMT']]
    .dropna(subset=['CLM_PMT_AMT'])
)

INFO:root:Reading local cache file Outpatient.pkl


In [4]:
# NOTE: pickle.load can execute arbitrary code -- only open this file when it
# comes from a trusted source.
with open('feature_vectors_dictionary-truncated.txt', 'rb') as handle:
    feature_dictionary = pickle.load(handle)

# Re-store every feature block as a CSR matrix, echoing its key and shape.
for met_set, vectors in list(feature_dictionary.items()):
    print(met_set)
    csr_block = sparse.csr_matrix(vectors)
    feature_dictionary[met_set] = csr_block
    print(csr_block.shape)

BIRTH_DATE
(790790, 1)
SEX
(790790, 1)
State
(790790, 52)
County
(790790, 306)
clm_dates
(790790, 1)
provider
(790790, 284)
DGNS_CD
(790790, 1001)
PRDCR_CD
(790790, 188)
HCPCS_CD
(790790, 730)


In [5]:
def run_model(X, y, model, grid, feature_names):
    """Cross-validate ``model`` over ``grid`` and return the results as a DataFrame.

    Parameters
    ----------
    X : scipy sparse matrix or array-like
        Feature matrix the model is trained on. Sparse input is converted to a
        dense ``numpy.ndarray`` before fitting.
    y : list or array-like
        Target value for every row of ``X``.
    model : estimator object
        Regression estimator handed to ``GridSearchCV``.
    grid : dict
        Hyperparameter grid passed to ``GridSearchCV``.
    feature_names : sequence of str
        Names of the feature sets that make up ``X``. Not used by this
        function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        ``GridSearchCV.cv_results_`` converted to a DataFrame.
    """
    scoring = {
        'mean_squared_error': make_scorer(mean_squared_error),
        'r2_score': make_scorer(r2_score),
    }

    # toarray() yields a plain ndarray. The previous todense() produced a
    # numpy.matrix, which recent scikit-learn releases refuse as input.
    # Already-dense input is passed through np.asarray unchanged.
    X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)

    with parallel_backend('threading', n_jobs=2):
        # refit=False: with several scorers there is no single "best" model to
        # refit, and only the cross-validation table is needed here.
        grid_model = GridSearchCV(
            model,
            grid,
            scoring=scoring,
            refit=False,
            return_train_score=True,
            error_score="raise",
        ).fit(X_dense, y)

    print(grid_model.cv_results_)

    return pd.DataFrame.from_dict(grid_model.cv_results_)

In [6]:
def _stack_feature_sets(feature_dict, names):
    """Return the feature blocks for ``names`` joined column-wise (CSR when stacked)."""
    blocks = [feature_dict[name] for name in names]
    if len(blocks) == 1:
        # A single block is used as stored; no stacking is needed.
        return blocks[0]
    return sparse.hstack(blocks, format='csr')


def iterate_feature_sets(model, feature_dict, grid, feature_sets):
    """Run ``model`` on each requested combination of feature sets.

    Relies on the notebook-level ``outpatient`` frame (its ``CLM_PMT_AMT``
    column is the target) and on ``run_model``.

    Parameters
    ----------
    model : estimator object
        Regression estimator forwarded to ``run_model``.
    feature_dict : dict
        Maps a feature-set name to its feature matrix.
    grid : dict
        Hyperparameter grid forwarded to ``run_model``.
    feature_sets : list or 'all'
        Each entry is either a list of keys of ``feature_dict`` (one model run
        on those blocks stacked side by side) or the string ``'all'`` (one run
        for every combination of 1..k keys, k being the number of keys).
        Passing the bare string ``'all'`` is treated as ``['all']``.

    Returns
    -------
    dict
        Maps a tuple of feature-set names to the DataFrame returned by
        ``run_model`` for that combination.

    Raises
    ------
    ValueError
        If an entry of ``feature_sets`` is an empty list.
    """
    if isinstance(feature_sets, str) and feature_sets == 'all':
        feature_sets = ['all']

    # The target does not depend on the feature selection, so build it once.
    y = outpatient['CLM_PMT_AMT'].values.tolist()

    results = {}

    for feature_list in feature_sets:
        if isinstance(feature_list, str) and feature_list == 'all':
            keys = list(feature_dict.keys())
            for size in tqdm(range(1, len(keys) + 1)):
                for met_set in itertools.combinations(keys, size):
                    X = _stack_feature_sets(feature_dict, met_set)
                    results[met_set] = run_model(X, y, model, grid, met_set)
        else:
            names = tuple(feature_list)
            if not names:
                raise ValueError("each entry of feature_sets must name at least one feature set")
            # Stacking via the helper avoids comparing a matrix against None,
            # which is ambiguous for array-like objects.
            X = _stack_feature_sets(feature_dict, names)
            results[names] = run_model(X, y, model, grid, feature_list)

    return results

In [7]:
def write_results(results_dict, filename):
    """Combine per-feature-set result tables and write them to one CSV file.

    Parameters
    ----------
    results_dict : dict
        Maps a feature-set key to a DataFrame of results. The frames are not
        modified.
    filename : str
        Path of the CSV file to write.

    Returns
    -------
    None
        The combined table, with a ``features`` column holding ``str(key)``
        for each source frame, is written to ``filename``.
    """
    # assign() returns a copy, so the caller's frames do not gain a column.
    labelled = [
        frame.assign(features=str(key))
        for key, frame in results_dict.items()
    ]

    # One concat instead of growing a frame inside the loop. An empty input
    # still produces an (empty) file, as before.
    combined = pd.concat(labelled, ignore_index=True) if labelled else pd.DataFrame()

    combined.to_csv(filename)

    return None

# Initial Gradient Boosted Results
### Purpose is to find which feature sets matter before hyperparameter tuning

In [8]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled it evaluates a HistGradientBoostingRegressor with an empty
# hyperparameter grid on each listed feature-set combination and writes the
# cross-validation results to Initial_GBRModel.csv.
"""
GBRModel = HistGradientBoostingRegressor(random_state=42)
write_results(iterate_feature_sets(GBRModel, feature_dictionary, {}, [['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'HCPCS_CD'],
['clm_dates', 'provider', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'HCPCS_CD'],
['provider', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'HCPCS_CD'],
['HCPCS_CD'],
['PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD'],
['clm_dates', 'provider', 'DGNS_CD'],
['clm_dates', 'DGNS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD', 'PRDCR_CD'],
['provider', 'DGNS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD'],
['clm_dates'],
['clm_dates', 'PRDCR_CD'],
['clm_dates', 'provider'],
['clm_dates', 'provider', 'PRDCR_CD'],
['provider', 'PRDCR_CD'],
['PRDCR_CD'],
['provider']]), 'Initial_GBRModel.csv')
"""

# Initial Stochastic Gradient Descent Results
### Purpose is to find which feature sets matter before hyperparameter tuning

In [9]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled it evaluates an SGDRegressor with an empty hyperparameter grid
# on each listed feature-set combination and writes the cross-validation
# results to Initial_SGDModel.csv.
"""
SGDModel = SGDRegressor(random_state=42)
write_results(iterate_feature_sets(SGDModel, feature_dictionary, {}, [['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'HCPCS_CD'],
['clm_dates', 'provider', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'HCPCS_CD'],
['provider', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'HCPCS_CD'],
['HCPCS_CD'],
['PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD'],
['clm_dates', 'provider', 'DGNS_CD'],
['clm_dates', 'DGNS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD', 'PRDCR_CD'],
['provider', 'DGNS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD'],
['clm_dates'],
['clm_dates', 'PRDCR_CD'],
['clm_dates', 'provider'],
['clm_dates', 'provider', 'PRDCR_CD'],
['provider', 'PRDCR_CD'],
['PRDCR_CD'],
['provider']]), 'Initial_SGDModel.csv')
"""

# Initial Elastic-Net Results
### Purpose is to find which feature sets matter before hyperparameter tuning

In [10]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled it evaluates an ElasticNet with an empty hyperparameter grid on
# each listed feature-set combination and writes the cross-validation results
# to Initial_ENModel.csv.
"""
ENModel = ElasticNet(random_state=42)
write_results(iterate_feature_sets(ENModel, feature_dictionary, {}, [['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'HCPCS_CD'],
['clm_dates', 'provider', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'HCPCS_CD'],
['provider', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'HCPCS_CD'],
['HCPCS_CD'],
['PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD'],
['clm_dates', 'provider', 'DGNS_CD'],
['clm_dates', 'DGNS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD', 'PRDCR_CD'],
['provider', 'DGNS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD'],
['clm_dates'],
['clm_dates', 'PRDCR_CD'],
['clm_dates', 'provider'],
['clm_dates', 'provider', 'PRDCR_CD'],
['provider', 'PRDCR_CD'],
['PRDCR_CD'],
['provider']]), 'Initial_ENModel.csv')
"""

# Initial XGBoost Results
### Purpose is to find which feature sets matter before hyperparameter tuning

In [11]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled it evaluates an XGBoost regressor with an empty hyperparameter
# grid on each listed feature-set combination and writes the cross-validation
# results to Initial_XGBModel.csv.
# The constructor was previously written as `xgb.(random_state=42)`, which is a
# syntax error once the cell is re-enabled. XGBRFRegressor is used because it is
# the class the XGBoost grid-search cell of this notebook instantiates
# (NOTE(review): confirm that is the intended estimator). The output filename
# now follows the Initial_<Model>.csv pattern of the other initial runs.
"""
XGBModel = xgb.XGBRFRegressor(random_state=42)
write_results(iterate_feature_sets(XGBModel, feature_dictionary, {}, [['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'DGNS_CD', 'HCPCS_CD'],
['DGNS_CD', 'HCPCS_CD'],
['clm_dates', 'HCPCS_CD'],
['clm_dates', 'provider', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'provider', 'HCPCS_CD'],
['provider', 'PRDCR_CD', 'HCPCS_CD'],
['provider', 'HCPCS_CD'],
['HCPCS_CD'],
['PRDCR_CD', 'HCPCS_CD'],
['clm_dates', 'DGNS_CD', 'PRDCR_CD'],
['clm_dates', 'provider', 'DGNS_CD'],
['clm_dates', 'DGNS_CD'],
['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD', 'PRDCR_CD'],
['provider', 'DGNS_CD'],
['provider', 'DGNS_CD', 'PRDCR_CD'],
['DGNS_CD'],
['clm_dates'],
['clm_dates', 'PRDCR_CD'],
['clm_dates', 'provider'],
['clm_dates', 'provider', 'PRDCR_CD'],
['provider', 'PRDCR_CD'],
['PRDCR_CD'],
['provider']]), 'Initial_XGBModel.csv')
"""

# Grid Search Gradient Boosted Results
### Purpose is to find which hyperparameters perform the best on the top three feature selections + our demographic information

In [12]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled, the loop searches ONE hyperparameter at a time (the grid passed
# down is {search: grid_dict[search]}), on three feature-set selections, and
# writes one <parameter>_GridSearch_GBRModel.csv file per hyperparameter.
# NOTE(review): 'max_depth' and 'min_samples_leaf' wrap np.linspace(...) in a
# list, so each grid holds a single candidate whose value is the whole array;
# presumably a flat list of values was intended -- confirm before re-enabling.
# NOTE(review): check the candidate values against the
# HistGradientBoostingRegressor parameter constraints (max_leaf_nodes of 1 and
# fractional min_samples_leaf values look out of range) -- confirm.
"""
grid_dict = {'learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
    'max_iter': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'max_leaf_nodes': [1, 2, 5],
    'max_depth': [np.linspace(1, 32, 32, endpoint=True)],
    'min_samples_leaf': [np.linspace(0.1, 0.5, 5, endpoint=True)]}


GBRModel = HistGradientBoostingRegressor(random_state=42)
for search in grid_dict.keys():
    write_results(iterate_feature_sets(GBRModel, feature_dictionary, {search: grid_dict[search]}, [
        ['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
        ['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
        ['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), search+'_GridSearch_GBRModel.csv')
"""

"\ngrid_dict = {'learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],\n    'max_iter': [1, 2, 4, 8, 16, 32, 64, 100, 200],\n    'max_leaf_nodes': [1, 2, 5],\n    'max_depth': [np.linspace(1, 32, 32, endpoint=True)],\n    'min_samples_leaf': [np.linspace(0.1, 0.5, 5, endpoint=True)]}\n\n\nGBRModel = HistGradientBoostingRegressor(random_state=42)\nfor search in grid_dict.keys():\n    write_results(iterate_feature_sets(GBRModel, feature_dictionary, {search: grid_dict[search]}, [\n        ['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],\n        ['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],\n        ['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), search+'_GridSearch_GBRModel.csv')\n"

# Grid Search Stochastic Gradient Descent Results
### Purpose is to find which hyperparameters perform the best on the top three feature selections + our demographic information

In [13]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled, only 'max_iter' is searched (the other grid entries are
# commented out inside the string), on three feature-set selections, and the
# results are written to max_iter_GridSearch_SGDModel.csv.
"""
grid_dict = {#'loss': ['squared_loss', 'squared_epsilon_insensitive'],
    #'penalty': ['l1', 'l2'],
    #'alpha': [.001, .0001, .00001],
    'max_iter': [1000, 5000, 10000]}

SGDModel = SGDRegressor(random_state=42)
for search in grid_dict.keys():
    write_results(iterate_feature_sets(SGDModel, feature_dictionary, {search: grid_dict[search]}, [
['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
['clm_dates', 'provider', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), search+'_GridSearch_SGDModel.csv')
"""

# Grid Search Elastic Net Results
### Purpose is to find which hyperparameters perform the best on the top three feature selections + our demographic information

In [14]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled, 'alpha' and 'l1_ratio' are searched separately (one parameter
# per iterate_feature_sets call, not as a joint grid) on three feature-set
# selections; each search writes <parameter>_GridSearch_ENModel.csv.
"""
grid_dict = {'alpha': [.0001, .001, .01],
            'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9]}

ENModel = ElasticNet(random_state=42)
for search in grid_dict.keys():
    write_results(iterate_feature_sets(ENModel, feature_dictionary, {search: grid_dict[search]}, [
['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'PRDCR_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
['clm_dates', 'provider', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), search+'_GridSearch_ENModel.csv')
"""

# Grid Search XGBoost Results
### Purpose is to find which hyperparameters perform the best on the top three feature selections + our demographic information

In [15]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# When enabled, 'n_estimators', 'tree_method' and 'max_depth' are searched
# separately (one parameter per iterate_feature_sets call, not as a joint grid)
# with an XGBRFRegressor on three feature-set selections; each search writes
# <parameter>_GridSearch_XGBModel.csv.
"""
grid_dict = {'n_estimators': [3, 6, 9, 12, 15, 18],
    'tree_method': ['approx', 'hist'],
    'max_depth': [5,10,15,20,25,30,35]}


XGBModel = xgb.XGBRFRegressor(random_state=42)
for search in grid_dict.keys():
    write_results(iterate_feature_sets(XGBModel, feature_dictionary, {search: grid_dict[search]}, [
        ['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
        ['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
        ['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), search+'_GridSearch_XGBModel.csv')
"""

{'mean_fit_time': array([ 842.12387476, 1489.78809748, 2197.8050643 , 2969.31562557,
       3835.67384863, 4629.15649939, 4888.40365682]), 'std_fit_time': array([ 114.35109454,   60.97789191,   82.67011449,   81.31273772,
        208.37866514,  204.8946034 , 1048.59794233]), 'mean_score_time': array([3.23340459, 3.42814431, 3.77153955, 3.06658955, 3.78215122,
       3.49965372, 4.60431123]), 'std_score_time': array([0.59607795, 0.43345626, 0.41767999, 0.18729259, 0.68180223,
       0.27430585, 1.63072123]), 'param_max_depth': masked_array(data=[5, 10, 15, 20, 25, 30, 35],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 5}, {'max_depth': 10}, {'max_depth': 15}, {'max_depth': 20}, {'max_depth': 25}, {'max_depth': 30}, {'max_depth': 35}], 'split0_test_mean_squared_error': array([234970.38914409, 229507.73355338, 226938.94065465, 225810.14716623,
       225435.41402881, 225696.21870323, 226277.0

# Final Histogram Gradient Boosted NonPCA Supervised Run
### See how the model performs on the dataset without PCA

In [None]:
# Disabled cell: the whole body is a string literal, so nothing below runs.
# The text is the same as the gradient-boosted grid-search cell (In [12]),
# including the output names, so enabling it would write to the same
# <parameter>_GridSearch_GBRModel.csv files.
# NOTE(review): 'max_depth' and 'min_samples_leaf' wrap np.linspace(...) in a
# list, so each grid holds a single candidate whose value is the whole array;
# presumably a flat list of values was intended -- confirm before re-enabling.
"""
grid_dict = {'learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
    'max_iter': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'max_leaf_nodes': [1, 2, 5],
    'max_depth': [np.linspace(1, 32, 32, endpoint=True)],
    'min_samples_leaf': [np.linspace(0.1, 0.5, 5, endpoint=True)]}


GBRModel = HistGradientBoostingRegressor(random_state=42)
for search in grid_dict.keys():
    write_results(iterate_feature_sets(GBRModel, feature_dictionary, {search: grid_dict[search]}, [
        ['clm_dates', 'provider', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
        ['clm_dates', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County'],
        ['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), search+'_GridSearch_GBRModel.csv')
"""