In [93]:
from sklearn.feature_extraction.text import CountVectorizer

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides diagnostics emitted by pandas
# and scikit-learn in the cells below; consider narrowing it to specific
# categories once the notebook is stable.
warnings.filterwarnings("ignore")

import numpy as np

import pandas as pd

import pickle

import ingest

from scipy import sparse


In [94]:
# Load the "Outpatient" data through the project's ingest helper; the log line
# recorded below shows it being served from the local cache file Outpatient.pkl.
# Presumably a pandas DataFrame (later cells call .dropna and select columns on it).
outpatient = ingest.get_cache_data("Outpatient", "Outpatient.pkl")

INFO:root:Reading local cache file Outpatient.pkl


In [95]:
# Keep only the rows that have a CLM_PMT_AMT value.  The filtered frame is
# rebound to the same name instead of using inplace=True, so the object handed
# back by the ingest helper is never mutated behind its back.
outpatient = outpatient.dropna(subset=['CLM_PMT_AMT'])

In [96]:
# Maps a feature-group name to the list of source columns that feed it.
# The numbered column families are generated rather than spelled out:
#   ICD9_DGNS_CD_1..10, ICD9_PRCDR_CD_1..6 and HCPCS_CD_1..45.
column_sets_dict = {
    'BIRTH_DATE': ['BENE_BIRTH_DT'],
    'SEX': ['BENE_SEX_IDENT_CD'],
    'State': ['SP_STATE_CODE'],
    'County': ['BENE_COUNTY_CD'],
    'clm_dates': ['CLM_FROM_DT', 'CLM_THRU_DT'],
    'provider': ['PRVDR_NUM'],
    'DGNS_CD': ['ICD9_DGNS_CD_{}'.format(n) for n in range(1, 11)],
    'PRDCR_CD': ['ICD9_PRCDR_CD_{}'.format(n) for n in range(1, 7)],
    'HCPCS_CD': ['HCPCS_CD_{}'.format(n) for n in range(1, 46)],
}

In [101]:
def isnan(x):
    """Return the result of ``x != x``.

    NaN is the only float that compares unequal to itself, so this is truthy
    for a float NaN and falsy for ordinary numbers, strings and ``None``.
    """
    differs_from_itself = x != x
    return differs_from_itself

def TFIDF_Matrix3(df, columns):
    """Build a bag-of-words count matrix from 3-character code prefixes.

    Each row of ``df[columns]`` is treated as one "sentence": every
    non-missing cell is converted to ``str``, truncated to its first three
    characters, and the pieces are joined with spaces before being tokenised
    by ``CountVectorizer``.  Despite the function name the values are raw
    token counts; no IDF weighting is applied.

    Missing cells (``None`` or NaN) contribute nothing to the sentence.  The
    earlier version wrote a ``'None'`` placeholder and relied on
    ``stop_words=['None']`` to drop it, but ``CountVectorizer`` lower-cases
    each document before filtering stop words, so the placeholder survived as
    a ``'none'`` feature and inflated every row count.

    Parameters:
        df: DataFrame holding the source columns.
        columns: list of column names (one or more) that make up a sentence.

    Returns:
        A tuple ``(counts, row_totals)`` where ``counts`` is the sparse
        document-term matrix and ``row_totals`` is the per-row sum of
        ``counts`` with one row per row of ``df``.

    Raises:
        ValueError: propagated from ``CountVectorizer`` when no row produces a
            token (for example, every cell is missing).
    """
    corpus = []

    for sentence in df[columns].values.tolist():
        # str() makes numeric cells sliceable; the single-column branch of the
        # earlier version sliced the raw value and failed on non-strings.
        prefixes = [
            str(word)[:3]
            for word in sentence
            if word is not None and not isnan(word)
        ]
        corpus.append(' '.join(prefixes))

    # NOTE: the default token pattern of CountVectorizer ignores
    # single-character tokens, so one-character codes yield no feature.
    a = CountVectorizer().fit_transform(corpus)

    return np.nan_to_num(a), np.nan_to_num(np.sum(a, axis=1))

def TFIDF_Matrix(df, columns):
    """Build a bag-of-words count matrix from complete cell values.

    Each row of ``df[columns]`` is treated as one "sentence": every
    non-missing cell is converted to ``str`` and the pieces are joined with
    spaces before being tokenised by ``CountVectorizer``.  Despite the
    function name the values are raw token counts; no IDF weighting is
    applied.

    Missing cells (``None`` or NaN) contribute nothing to the sentence.  The
    earlier version wrote a ``'None'`` placeholder and relied on
    ``stop_words=['None']`` to drop it, but ``CountVectorizer`` lower-cases
    each document before filtering stop words, so the placeholder survived as
    a ``'none'`` feature.

    Parameters:
        df: DataFrame holding the source columns.
        columns: list of column names (one or more) that make up a sentence.

    Returns:
        The sparse document-term count matrix, one row per row of ``df``.

    Raises:
        ValueError: propagated from ``CountVectorizer`` when no row produces a
            token (for example, every cell is missing).
    """
    corpus = []

    for sentence in df[columns].values.tolist():
        # str() keeps numeric cells usable; the single-column branch of the
        # earlier version handed the raw value to the vectorizer.
        words = [
            str(word)
            for word in sentence
            if word is not None and not isnan(word)
        ]
        corpus.append(' '.join(words))

    # NOTE: the default token pattern of CountVectorizer ignores
    # single-character tokens, so one-character codes yield no feature.
    a = CountVectorizer().fit_transform(corpus)

    return np.nan_to_num(a)

def Date_Year(df, columns):
    """Return the calendar year of a date column as an ``(n, 1)`` array.

    Only ``columns[0]`` is read.  Missing dates produce NaN years, which
    ``np.nan_to_num`` turns into 0.

    The input frame is left untouched; the earlier version stored the result
    in a new ``'year'`` column on the caller's DataFrame as a side effect.

    Parameters:
        df: DataFrame holding the date column.
        columns: list whose first entry names the date column.

    Returns:
        numpy array of shape ``(len(df), 1)`` with one year per row.
    """
    years = pd.DatetimeIndex(df[columns[0]]).year

    return np.nan_to_num(np.array(years.tolist()).reshape(len(df), 1))

def Date_Diff(df, columns):
    """Return the day span between two date columns, scaled by the largest span.

    The span is ``df[columns[1]] - df[columns[0]]`` expressed in days and then
    divided by the maximum span found in the column.

    Rows with a missing date get 0.  The maximum is taken with ``np.nanmax``
    so that a single missing date no longer turns the divisor into NaN and
    wipes out the whole feature, and an empty frame returns an empty
    ``(0, 1)`` array instead of raising.

    Parameters:
        df: DataFrame holding the two date columns.
        columns: list ``[start_column, end_column]``.

    Returns:
        float numpy array of shape ``(len(df), 1)``.
    """
    start, end = df[columns[0]], df[columns[1]]
    span_days = np.asarray((end - start) / np.timedelta64(1, 'D'), dtype=float)
    span_days = span_days.reshape(len(df), 1)

    # Nothing to scale by: no rows at all, or every span is missing.
    if span_days.size == 0 or np.isnan(span_days).all():
        return np.zeros_like(span_days)

    longest = np.nanmax(span_days, axis=0)

    # A zero maximum yields 0/0 -> NaN, which nan_to_num maps to 0, exactly as
    # the unguarded division did; errstate only silences the runtime warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        scaled = span_days / longest

    return np.nan_to_num(scaled)
    
def Passthrough(df, columns):
    """Return the selected columns as a numpy array, first column as integers.

    ``columns[0]`` is coerced to numeric (unparseable values become missing),
    cast to integers, and missing entries are replaced by 0.  Any further
    columns are passed through as they are.

    Two problems of the earlier version are addressed:
      * the coerced column was written back into the caller's DataFrame;
        the work now happens on a copy.
      * missing values stayed in the result as ``pd.NA`` inside an object
        array, which ``np.nan_to_num`` does not touch; they are now filled
        with 0 before the array is built.

    Parameters:
        df: DataFrame holding the source columns.
        columns: list of column names; the first one is the numeric column.

    Returns:
        numpy array of shape ``(len(df), len(columns))``.

    Raises:
        TypeError: from the ``'Int64'`` cast when a value is numeric but not
            integral (behaviour kept from the earlier version).
    """
    selected = df[columns].copy()

    as_number = pd.to_numeric(selected[columns[0]], errors='coerce')
    # Go through nullable Int64 first so non-integral values still fail loudly,
    # then fill the gaps and drop to a plain int64 column.
    selected[columns[0]] = as_number.astype('Int64').fillna(0).astype('int64')

    return np.nan_to_num(np.array(selected.values.tolist()))

In [102]:
# Build one feature array/matrix per column group.  Groups handled by
# TFIDF_Matrix3 additionally get a "<key>_count" entry holding the per-row totals.
column_sets_matrix_dict = {}
for key, value in column_sets_dict.items():
    if key in ('provider', 'NPI', 'admit_code', 'claim_discharge_code', 'DGNS_CD', 'PRDCR_CD', 'HCPCS_CD'):
        matrix, row_totals = TFIDF_Matrix3(outpatient, value)
        column_sets_matrix_dict[key] = matrix
        column_sets_matrix_dict[key+'_count'] = row_totals
        continue

    # Every remaining group yields a single entry; pick the builder for it.
    if key in ('State', 'County'):
        builder = TFIDF_Matrix
    elif key in ('clm_dates', 'admit_dates'):
        builder = Date_Diff
    elif key == 'BIRTH_DATE':
        builder = Date_Year
    else:
        builder = Passthrough

    column_sets_matrix_dict[key] = builder(outpatient, value)

In [99]:
# Persist the feature dictionary with pickle.
# NOTE: the file is opened in binary mode and holds a pickle stream despite
# its .txt extension, so it must be read back with pickle.load on a file
# opened with "rb".
with open("feature_vectors_dictionary-truncated.txt", "wb") as f:
    pickle.dump(column_sets_matrix_dict, f)

In [100]:
# Show the entry stored under 'HCPCS_CD_count'.
# NOTE(review): this cell's execution count (In [100]) is lower than those of
# the function-definition and build cells above (In [101], In [102]), so the
# output recorded below may not reflect the current definitions; re-run the
# notebook top to bottom before relying on it.
print(column_sets_matrix_dict['HCPCS_CD_count'])

  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 0)	1.0
  (12, 0)	1.0
  (13, 0)	1.0
  (14, 0)	1.0
  (15, 0)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 0)	1.0
  (19, 0)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 0)	1.0
  (24, 0)	1.0
  :	:
  (790765, 0)	1.0
  (790766, 0)	1.0
  (790767, 0)	1.0
  (790768, 0)	1.0
  (790769, 0)	1.0
  (790770, 0)	1.0
  (790771, 0)	1.0
  (790772, 0)	1.0
  (790773, 0)	1.0
  (790774, 0)	1.0
  (790775, 0)	1.0
  (790776, 0)	1.0
  (790777, 0)	1.0
  (790778, 0)	1.0
  (790779, 0)	1.0
  (790780, 0)	1.0
  (790781, 0)	1.0
  (790782, 0)	1.0
  (790783, 0)	1.0
  (790784, 0)	1.0
  (790785, 0)	1.0
  (790786, 0)	1.0
  (790787, 0)	1.0
  (790788, 0)	1.0
  (790789, 0)	1.0
