In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.preprocessing import normalize


from joblib import parallel_backend

import xgboost as xgb

import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import math

import pickle

import itertools
from tqdm import tqdm
import ingest
from scipy import sparse


In [2]:
outpatient = ingest.get_cache_data("Outpatient", "Outpatient.pkl")
outpatient.dropna(subset=['CLM_PMT_AMT'], inplace=True)

INFO:root:Reading local cache file Outpatient.pkl


In [3]:
with open('feature_vectors_dictionary-truncated.txt', 'rb') as handle:
    feature_dictionary = pickle.load(handle)

for met_set in feature_dictionary.keys():
    print(met_set)
    feature_dictionary[met_set] = normalize(sparse.csr_matrix(feature_dictionary[met_set]))
    print(np.shape(feature_dictionary[met_set]))

BIRTH_DATE
(790790, 1)
SEX
(790790, 1)
State
(790790, 52)
County
(790790, 306)
clm_dates
(790790, 1)
provider
(790790, 284)
provider_count
(790790, 1)
DGNS_CD
(790790, 1001)
DGNS_CD_count
(790790, 1)
PRDCR_CD
(790790, 188)
PRDCR_CD_count
(790790, 1)
HCPCS_CD
(790790, 730)
HCPCS_CD_count
(790790, 1)


In [4]:
def run_model(X, y, model, grid, feature_names):

       ### Runs the model and returns MSE and R2 values and corresponding data as a dataframe

       ### Takes in four arugments
       ### X is the features to that model will be trained on as a sparse matrix
       ### y is the scalar that is to be predicted as a list
       ### model is the regression model as a model object
       ### grid is a dictionary of model hyperparameters

       ### returns a dataframe of results from the grid search

       scoring = {'mean_squared_error': make_scorer(mean_squared_error), 'r2_score': make_scorer(r2_score)}
       with parallel_backend('threading', n_jobs=2):
              grid_model = GridSearchCV(model, grid, scoring=scoring, refit='r2_score', return_train_score=True, error_score="raise").fit(X.todense(), y)
       


       preds = grid_model.predict(X.todense())
       print(preds)
       df = outpatient.copy()

       df['predictions'] =  preds
       
       return df

In [5]:
def iterate_feature_sets(model, feature_dict, grid, feature_sets):

    ### Creates and runs the specified model with each comboniation of features sets from 1 to k length features and stores the results of the models into a dictionary

    ### Takes in four arguments
    ### model is the regression model as a model object
    ### feature_dict is the dictionary of feature sets
    ### grid is a dictionary of model hyperparameters
    ### feature_sets dicates how what feature sets will be returned, a list of list of strings corresponding to the keys in the dictionary
    ### (continued) all would return all combinations for 1 through k, where  k is the number of keys in feature_dict

    ### returns a dictionary of results

    results = {}

    for feature_list in feature_sets:
        if feature_list == 'all':

            for n_met in tqdm(range(len(feature_dict))):
                for met_set in list(itertools.combinations(feature_dict.keys(), n_met+1)):
                    for n_met_set in range(len(met_set)):
                        if n_met_set == 0:
                            X = feature_dict[met_set[n_met_set]]
                        else:
                            X  = sparse.hstack((X,feature_dict[met_set[n_met_set]]), format='csr')
                    answer = run_model(X, outpatient['CLM_PMT_AMT'].values.tolist(), model, grid, met_set)
                    results[met_set] = answer

        else:
            X = None
            Name = ()
            for i in feature_list:
                if X == None:
                    X = feature_dict[i]
                    Name += (i,)
                else:
                    X = sparse.hstack((X, feature_dict[i]), format='csr')
                    Name += (i,)
            answer = run_model(X, outpatient['CLM_PMT_AMT'].values.tolist(), model, grid, feature_list)
            results[Name] = answer
        
    return results

In [6]:
def write_results(results_dict, filename):

    ### Takes in two arguements
    ### results_dict is the dictionary of feature sets
    ### file name is the wanted file name as a string

    ### Returns None, but writes a file to the current directory


    df = pd.DataFrame()

    for r_key in results_dict.keys():
        temp_df = results_dict[r_key]
        temp_df['features'] = str(r_key)
        df = pd.concat((df, temp_df), ignore_index=True)

    df.to_csv(filename)

    return None

# Final Histogram Gradient Boosted NonPCA Dataset
### Generates Dataset with Predictions

In [7]:

grid_dict = {'learning_rate': [0.25],
    'max_iter': [200],
    #wanted to go a little higher on leaf nodes since there seemed to be more to gain
    'max_leaf_nodes': [10],
    'max_depth': [18]
    #results were inconclusive for minimum samples, going to leave as default
    }


GBRModel = HistGradientBoostingRegressor(random_state=42)
write_results(iterate_feature_sets(GBRModel, feature_dictionary, grid_dict, [
    # picked because of speed and results
    ['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), 'NPCA_FinalData_GBRModel.csv')


[ 73.44122528 208.89277945 257.55470048 ... 183.601469   129.96102677
 296.50111943]


# Final Stochastic Gradient Descent NonPCA Supervised Run
### See how the model performs on the dataset without PCA

In [8]:
grid_dict = {'loss': ['squared_epsilon_insensitive'],
    'penalty': ['l1'],
    'alpha': [.0001],
    'max_iter': [1000]}

SGDModel = SGDRegressor(random_state=42)
write_results(iterate_feature_sets(SGDModel, feature_dictionary, grid_dict, [
['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), 'NPCA_FinalData_SGDModel.csv')

[153.2365103   99.1009535  234.30567492 ... 195.1020327  142.01869419
 355.95122977]


# Final Elastic Net NonPCA Supervised Run
### See how the model performs on the dataset without PCA

In [9]:
grid_dict = {'alpha': [.0001],
    'l1_ratio': [.9]}

ENModel = ElasticNet(random_state=42)
write_results(iterate_feature_sets(ENModel, feature_dictionary, grid_dict, [
['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), 'NPCA_FinalData_ENModel.csv')

[138.27836695  97.18758939 230.0426669  ... 196.25697381 140.32453803
 356.97479306]


# Final XGBoost NonPCA Supervised Run
### See how the model performs on the dataset without PCA

In [10]:
grid_dict = {'max_depth': [10],
    'number_of_estimators': [9],
    'tree_method': ['hist']}

XGBModel = xgb.XGBRFRegressor(random_state=42)
write_results(iterate_feature_sets(XGBModel, feature_dictionary, grid_dict, [
['clm_dates', 'DGNS_CD', 'HCPCS_CD', 'BIRTH_DATE', 'SEX', 'State', 'County']]), 'NPCA_FinalData_XGBModel.csv')

Parameters: { "number_of_estimators" } are not used.

[ 77.02148 330.28763 240.97156 ... 241.60034 118.74428 236.9609 ]
