In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import math

import pickle

import itertools
from tqdm import tqdm

import ingest

In [15]:
# Load the "Outpatient" dataset through the project's ingest helper.
# The logged output below shows it being read from the local cache file Outpatient.pkl.
outpatient = ingest.get_cache_data("Outpatient", "Outpatient.pkl")

INFO:root:Reading local cache file Outpatient.pkl


In [16]:
# Keep only the rows that have a value in CLM_PMT_AMT.
# The result is rebound instead of using inplace=True so the object handed
# back by the ingest loader is not mutated and the cell stays chainable.
outpatient = outpatient.dropna(subset=['CLM_PMT_AMT'])

In [17]:
# Report how many rows are currently held in `outpatient`.
print(len(outpatient))

790790


In [18]:
# Groups of raw claim columns, keyed by the feature-set name used when the
# feature matrices are built. Numbered column families (…_1, …_2, …) are
# generated instead of being typed out by hand.
column_sets_dict = {
    'clm_dates': ['CLM_FROM_DT', 'CLM_THRU_DT'],
    'provider': ['PRVDR_NUM'],
    # ICD9_DGNS_CD_1 … ICD9_DGNS_CD_10
    'DGNS_CD': ['ICD9_DGNS_CD_{}'.format(n) for n in range(1, 11)],
    # ICD9_PRCDR_CD_1 … ICD9_PRCDR_CD_6 (the key spelling is what later cells look up)
    'PRDCR_CD': ['ICD9_PRCDR_CD_{}'.format(n) for n in range(1, 7)],
    # HCPCS_CD_1 … HCPCS_CD_45
    'HCPCS_CD': ['HCPCS_CD_{}'.format(n) for n in range(1, 46)],
}

In [19]:
def isnan(x):
    """Return the result of comparing `x` against itself with `!=`.

    A float NaN is the usual value that is unequal to itself, so this is
    truthy for NaN and falsy for ordinary strings and numbers.
    """
    differs_from_itself = x != x
    return differs_from_itself

def TFIDF_Matrix(df, columns):
    """Build a TF-IDF matrix with one document per row of `df`.

    Each row's values in `columns` form one "sentence": missing values
    (anything for which `isnan` is truthy) become the placeholder ``None``
    and every other value is converted with ``str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to vectorize.
    columns : list of str
        Columns that together make up one sentence per row.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform`` for the corpus,
    one row per row of `df`.

    Raises
    ------
    ValueError
        Propagated from ``TfidfVectorizer`` when no token survives stop-word
        removal (for example when every value in `columns` is missing).
    """
    print('start')

    rows = df[columns].values.tolist()

    # To truncate codes to their first 3 characters, slice the converted
    # value, e.g. str(word)[:3].
    if len(columns) > 1:
        # Every word is followed by ', ' (including the last one) so the
        # preview printed below keeps its established format.
        corpus = [
            ''.join('None, ' if isnan(word) else str(word) + ', ' for word in sentence)
            for sentence in rows
        ]
    else:
        # str() keeps the vectorizer from failing on non-string codes.
        corpus = ['None' if isnan(word[0]) else str(word[0]) for word in rows]

    print(corpus[:10])

    # TfidfVectorizer lowercases each document before stop words are removed,
    # so the placeholder has to be listed in lowercase to be filtered out;
    # 'None' would never match and the placeholder would become a feature.
    return TfidfVectorizer(stop_words=['none']).fit_transform(corpus)

def Date_Diff(df, columns):
    """Return ``df[columns[1]] - df[columns[0]]`` as a column vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the two columns.
    columns : list of str
        ``[start_column, end_column]``; only the first two entries are used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 1)`` holding end minus start for each row.
        The element type follows from the dtype of the two columns.
    """
    # Compute the difference directly: assigning a new column onto
    # df[columns] writes to a slice of df (the SettingWithCopy pattern) and
    # copies the frame for no benefit.
    difference = df[columns[1]] - df[columns[0]]

    return np.array(difference.values.tolist()).reshape(len(df), 1)
    
def Passthrough(df, columns):
    """Return the values of `columns` from `df` as a numpy array.

    The selected columns are converted to a list of rows and wrapped in
    ``np.array``, giving one row per row of `df` and one entry per column.
    """
    selected = df[columns]
    as_rows = selected.values.tolist()
    return np.array(as_rows)

In [20]:
# Build one feature matrix per column set: pick the builder that matches the
# set's key, then apply it to the outpatient frame.
column_sets_matrix_dict = {}
for key, value in column_sets_dict.items():
    if key in ['provider', 'NPI', 'admit_code', 'claim_discharge_code', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD']:
        # Code-like columns are vectorized with TF-IDF.
        builder = TFIDF_Matrix
    elif key in ['clm_dates', 'admit_dates']:
        # Start/end column pairs become a single difference column.
        builder = Date_Diff
    else:
        # Anything else is passed through as raw values.
        builder = Passthrough
    column_sets_matrix_dict[key] = builder(outpatient, value)

start
['2600RA', '3901GS', '3939PG', '3902NU', '5200TV', '5213ZG', '5213RM', '5200YU', '3902NU', '3902NU']
start
['V5841, None, None, None, None, None, None, None, None, None, ', 'V5832, V5861, 2724, 3182, V5869, 42731, None, None, None, None, ', '9594, E9174, 4019, None, None, None, None, None, None, None, ', '78943, V5866, V1272, None, None, None, None, None, None, None, ', '6009, None, None, None, None, None, None, None, None, None, ', '6115, None, None, None, None, None, None, None, None, None, ', '2723, None, None, None, None, None, None, None, None, None, ', 'V821, None, None, None, None, None, None, None, None, None, ', 'None, None, None, None, None, None, None, None, None, None, ', '7902, None, None, None, None, None, None, None, None, None, ']
start
['None, None, None, None, None, None, ', 'None, None, None, None, None, None, ', 'None, None, None, None, None, None, ', 'None, None, None, None, None, None, ', 'None, None, None, None, None, None, ', 'None, None, None, None, None,

In [22]:
# Persist the dictionary of per-column-set feature matrices as a pickle.
# NOTE(review): the target has a .txt extension although the content is binary
# pickle data, and a "-truncated" suffix — confirm the features were actually
# built from truncated codes, as the file name implies.
with open("feature_vectors_dictionary-truncated.txt", "wb") as f:
    pickle.dump(column_sets_matrix_dict, f)