In [1]:
import os
import datetime
import inspect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, 
    roc_auc_score, roc_curve, auc, precision_recall_curve, f1_score
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SequentialFeatureSelector

from statsmodels.stats.outliers_influence import variance_inflation_factor

from utils import describe_dataframe, get_cont_enrolled, train_eval, piped_traineval



In [2]:
# Load the combined dataset from the CSV stored one directory up, under data/.
# The path is relative, so it resolves against the kernel's working directory.
main = pd.read_csv("../data/combo_data.csv")

In [3]:
# Column names singled out for later use.
# NOTE(review): presumably these are columns of `main` -- confirm against the CSV header.
select_cols = [
    "LOS",
    "PRNCPAL_DGNS_CD",
    "CLM_IP_ADMSN_TYPE_CD",
    "ER_flag",
    "STATE_CODE",
    "COUNTY_CD",
    "BENE_RACE_CD",
    "ESRD_IND",
    "AGE",
    "TOT_RX_CST_AMT",
]



In [4]:


# Show the source code of the helper imported from `utils`, so the notebook
# reader can see how it works without opening the module.
print(inspect.getsource(get_cont_enrolled))

def get_cont_enrolled(init_year, end_year, df):
    '''Track which ids from `init_year` keep appearing in every following year.

    Takes a dataframe of CMS inpatient data and a year window. The starting
    set is the unique ``BENE_ID`` values with at least one row in
    ``init_year``. For each later year, the set is narrowed to the ids that
    were in the previous year's set AND have at least one row in that year.

    Parameters
    ----------
    init_year : int
        First year of the window (inclusive).
    end_year : int
        End of the window (exclusive); the last year examined is
        ``end_year - 1``.
    df : pandas.DataFrame
        Must contain the columns ``"YR"`` and ``"BENE_ID"``.

    Returns
    -------
    dict
        ``"id_year_dict"`` maps ``str(year)`` to the list of ids still in the
        set for that year (in order of first appearance within the year).
        ``"nunique_df"`` is a DataFrame with one row per year and the columns
        ``"year"`` and ``"n_unique"`` (the size of that year's id set).

    Raises
    ------
    ValueError
        If ``init_year >= end_year``. The window would contain no years, and
        the summary frame could not be built consistently.
    '''
    # An empty window used to surface as pandas' opaque "All arrays must be of
    # same length" error; fail early with a message that names the arguments.
    if init_year >= end_year:
        raise ValueError(
            f"init_year ({init_year}) must be strictly less than "
            f"end_year ({end_year})"
        )

    years = range(init_year, end_year)

    # Keep only the rows and columns the loop needs, so the per-year masks
    # below scan a smaller frame.
    in_window = (df["YR"] >= init_year) & (df["YR"] < end_year)
    enrolment = df.loc[in_window, ["YR", "BENE_ID"]]

    id_year_dict = {}
    n_unique = []
    surviving_ids = None  # None marks the first year: nothing to intersect with yet

    for year in years:
        ids_this_year = enrolment.loc[enrolment["YR"] == year, "BENE_ID"]
        if surviving_ids is not None:
            # Drop ids that had already fallen out of the set in an earlier year.
            ids_this_year = ids_this_year[ids_this_year.isin(surviving_ids)]
        surviving_ids = ids_this_year.unique()

        id_year_dict[str(year)] = list(surviving_ids)
        n_unique.append(len(surviving_ids))

    return {
        "id_year_dict": id_year_dict,
        "nunique_df": pd.DataFrame({"year": years, "n_unique": n_unique}),
    }

