# Create Inpatient Dataframe

This code takes a directory and regex expression for filename to import all relevant synthetic data files and attaches summary information by year.

In [3]:
import pandas as pd
import os
import fnmatch
from datetime import datetime
import re
import numpy as np
import pickle

## Functions

In [4]:
## alternate way to gather files matching a certain fname
def list_files(directory, pattern):
    all_files = os.listdir(directory) 
    relevant_files = []
    for entry in all_files:  
        if fnmatch.fnmatch(entry, pattern):
            #print (entry)
            relevant_files.append(entry)
    return relevant_files
## bring in all inpatient files from synthetic_data folder
def read_in_all_files(parent_folder, pattern):
    inpatient_files = list_files(parent_folder, pattern)
    df = pd.DataFrame()
    for d in inpatient_files:
        temp_df = pd.read_csv(parent_folder + d , compression='zip')
        temp_df['sample_number'] = re.sub('\.','',re.findall('\d{1}\.', d)[0])
        df = pd.concat([df, temp_df], axis = 0)
    return df

In [5]:
## helper functions
def grouping_helper(x, keep_list):
    if x in keep_list:
        x = re.sub('[^A-Za-z0-9]+','',x).lower()
    else:
        x = 'Other'
    return x

def join_codes(row):
    return " ".join(list(set([str(v) for i, v in row.iteritems() if pd.notnull(v)])))

def join_group_codes(row):
    return " ".join(list(set([str(v)[0:3] for i, v in row.iteritems() if pd.notnull(v)])))


In [6]:
# clean and create core dataset
def create_inpatient_core_df(df):
    print(df.columns)
    df.columns = [c.lower() for c in df.columns]
    df = df.loc[(df['clm_from_dt'].notnull() & df['clm_thru_dt'].notnull()),:]

    df['clm_from_datetime'] = [datetime.strptime(str(int(a)),'%Y%m%d') for a in df['clm_from_dt']]
    df['clm_thru_datetime'] = [datetime.strptime(str(int(a)),'%Y%m%d') for a in df['clm_thru_dt']]
    df['clm_start_year'] = df['clm_from_datetime'].dt.year
    df['clm_start_month'] = df['clm_from_datetime'].dt.month

    condition_list = [df['clm_utlztn_day_cnt'] <= 3,
                      (df['clm_utlztn_day_cnt'] > 3) & (df['clm_utlztn_day_cnt'] <= 7),
                      df['clm_utlztn_day_cnt'] > 7]
    choice_list = ['0-3 days', '4-7 days', 'Over 7 days']
    df['clm_utlztn_day_cnt_grouped'] = np.select(condition_list, choice_list, default = 'Other')

    # keep these key columns 
    claims_data_key_cols = ['clm_id','desynpuf_id','sample_number','clm_start_year',
                           'clm_start_month','clm_from_datetime',
                            'clm_utlztn_day_cnt','clm_utlztn_day_cnt_grouped',
                            'prvdr_num','prvdr_num_grp','at_physn_npi','clm_drg_cd',
                            'clm_drg_cd_grp','clm_pmt_amt']

    # group major diagnosis codes
    keep_list = df.clm_drg_cd.value_counts().index[df.clm_drg_cd.value_counts().values > 100]
    df['clm_drg_cd_grp'] = [grouping_helper(r, keep_list) for r in df.clm_drg_cd]
    #df.clm_drg_cd_grp.value_counts()
    keep_list_prvdr = df.prvdr_num.value_counts().index[df.prvdr_num.value_counts().values > 100]
    df['prvdr_num_grp'] = [grouping_helper(r, keep_list_prvdr) for r in df.prvdr_num]


    icd9_dgns_cols = [d for d in df.columns if d[:9] == ('icd9_dgns')]
    icd9_prcdr_cols = [p for p in df.columns if p[:10] == ('icd9_prcdr')]
    hcpcs_cols = [h for h in df.columns if h[:8] == ('hcpcs_cd')]
    provider_cols = [pv for pv in df.columns if 'physn_npi' in pv] 

    collapse_columns_list = [icd9_dgns_cols, icd9_prcdr_cols, 
                             hcpcs_cols, provider_cols]
    suffix_list = ['icd9_dgns','icd9_pcrdr','hcpcs_cd','physn_npi']

    core_df = df.loc[:,claims_data_key_cols]
    print(core_df.head())
    i = 0
    for i in range(len(collapse_columns_list)):
        print(suffix_list[i])
        print(collapse_columns_list[i])
        
        # create collapsed codes
        collapsed_codes = df.loc[:, collapse_columns_list[i]].apply(join_codes, axis = 1)
        core_df['collapsed_' + suffix_list[i]] = collapsed_codes
    
    # try and group icd9 dgns
    collapsed_icd9_dgns_group_codes = df.loc[:, icd9_dgns_cols].apply(join_group_codes, axis = 1)
    core_df['collapsed_icd9_dgns_group'] = collapsed_icd9_dgns_group_codes

    # try and group icd9 prcdr
    collapsed_icd9_prcdr_group_codes = df.loc[:, icd9_prcdr_cols].apply(join_group_codes, axis = 1)
    core_df['collapsed_icd9_prcdr_group'] = collapsed_icd9_prcdr_group_codes
        
        ## only need to uncomment if using original df as core_df
        #df.drop(columns = collapse_columns_list[i], inplace = True)
    
    return core_df

In [7]:
# create keys master list
def add_summary_info(df):
    filelist = list_files(directory = 'C:/Users/598300/wids/medicare-fraud/synthetic_data/', 
                          pattern = '*_Beneficiary_Summary_File_Sample_*')
    file_dict = dict(zip(filelist,[re.sub('\.','',re.findall('\d{1}\.', d)[0]) for d in filelist])) 
    print(file_dict)
    
    # gather and deduplicate key columns from all summary files
    k = pd.DataFrame(columns = ['desynpuf_id','bene_birth_dt', 'bene_sex_ident_cd', 'bene_race_cd', 'sample_number'])
    for sf in filelist:
        raw_df = pd.read_csv('synthetic_data/'+ sf , compression='zip')
        f = pd.DataFrame({'desynpuf_id' : raw_df['DESYNPUF_ID'],
                          'bene_birth_dt' : raw_df['BENE_BIRTH_DT'], 
                          'bene_sex_ident_cd' : raw_df['BENE_SEX_IDENT_CD'], 
                          'bene_race_cd' : raw_df['BENE_RACE_CD']})
        f['sample_number'] = re.sub('\.','',re.findall('\d{1}\.', sf)[0])
        k = pd.concat([k, f], axis = 0)
    print(k.shape)
    k.drop_duplicates(inplace = True)
    print(k.shape)
    print(k.head())

    # in a loop, clean each summary data frame associated with each sample number and attach to core keys
    rebuilt_df = pd.DataFrame()
    for n in list(set(file_dict.values())):
        filter_k = k.loc[k['sample_number']==n,:] # filter to dataframe for each sample number
        
        # iterate over the yearly summary files only relevant to the sample number n
        for s in [f for f in list(file_dict.keys()) if file_dict[f] == n]:
            raw_df = pd.read_csv('synthetic_data/'+ s , compression='zip')
            raw_df['sample_number'] = re.sub('\.','',re.findall('\d{1}\.', s)[0])
            # year specific column
            year_specific = raw_df[['SP_STATE_CODE', 'BENE_COUNTY_CD', 
                                    'BENE_DEATH_DT', 'BENE_ESRD_IND',
                                    'BENE_HI_CVRAGE_TOT_MONS', 'BENE_SMI_CVRAGE_TOT_MONS',
                                    'BENE_HMO_CVRAGE_TOT_MONS', 'PLAN_CVRG_MOS_NUM',
                                    'MEDREIMB_IP', 'BENRES_IP', 'PPPYMT_IP', 'MEDREIMB_OP', 'BENRES_OP',
                                    'PPPYMT_OP', 'MEDREIMB_CAR', 'BENRES_CAR', 'PPPYMT_CAR']]
            year_specific.columns = [(n + '_' + re.findall('\d{4}', s)[0]).lower() for n in year_specific]
            year_specific['desynpuf_id'] = raw_df['DESYNPUF_ID']

            chronic_condition_cols = [cc for cc in raw_df.columns if ((cc[:3] == ('SP_')) & (cc != 'SP_STATE_CODE'))]
            #new_chronic_condition_cols = [n + '_' + re.findall('\d{4}', s)[0] for n in chronic_condition_cols]
            for col in chronic_condition_cols:
                raw_df[col] = raw_df[col] - 1
            year_specific['chronic_condition_count_'+re.findall('\d{4}', s)[0]] = raw_df[chronic_condition_cols].sum(axis = 1)
            print(s.upper() + ' JOINER SHAPE', year_specific.shape)
            filter_k = filter_k.merge(year_specific, how='left', on='desynpuf_id')
            print(s.upper() + ' NEW K SHAPE', filter_k.shape)
                        
        # restack each portion
        rebuilt_df = pd.concat([rebuilt_df, filter_k], axis = 0)
        print('NEW REBUILT DF SHAPE: ', rebuilt_df.shape)
    
    k = rebuilt_df.drop_duplicates()
    print('DEDUPED REBUILT DF SHAPE: ', k.shape)
    
    collapsed_st = k.loc[:, [st for st in k.columns if (st[:13] == 'sp_state_code')]].apply(join_codes, axis = 1)
    k['collapsed_states'] = collapsed_st

    collapsed_ct = k.loc[:, [ct for ct in k.columns if (ct[:14] == 'bene_county_cd')]].apply(join_codes, axis = 1)
    k['collapsed_counties'] = collapsed_ct

    ## future improvement - combine columns and change to month and year died?
    k['death_ind_2008'] = np.where(k['bene_death_dt_2008'].isnull() == False, 1, 0)
    k['death_ind_2009'] = np.where(k['bene_death_dt_2009'].isnull() == False, 1, 0)
    k['death_ind_2010'] = np.where(k['bene_death_dt_2010'].isnull() == False, 1, 0)
    
    print('df shape - model df', df.shape)
    print('k shape - shape of keys df', k.shape)
    print('df head - model df', df.head())
    merged_df = df.merge(k, how='left', on=['desynpuf_id','sample_number'])
    print('merged df shape - join k to df',merged_df.shape)
    merged_df.drop_duplicates(inplace = True)
    print('merged df shape without dupes', merged_df.shape)
    
    return merged_df

## Define Input Variables

In [8]:
# folder with synthetic data
parent = 'C:/Users/598300/wids/medicare-fraud/synthetic_data/'

# regex pattern for file type (we are focusing on Inpatient for our analysis)
patt = '*Inpatient_Claims_Sample_*' # regex

## Create Modeling Dataframe

In [9]:
# read files for inpatient
df = read_in_all_files(parent, patt)
# create core dataframe
model_df = create_inpatient_core_df(df)
# attach yearly summary information
model_df = add_summary_info(model_df)

Index(['DESYNPUF_ID', 'CLM_ID', 'SEGMENT', 'CLM_FROM_DT', 'CLM_THRU_DT',
       'PRVDR_NUM', 'CLM_PMT_AMT', 'NCH_PRMRY_PYR_CLM_PD_AMT', 'AT_PHYSN_NPI',
       'OP_PHYSN_NPI', 'OT_PHYSN_NPI', 'CLM_ADMSN_DT', 'ADMTNG_ICD9_DGNS_CD',
       'CLM_PASS_THRU_PER_DIEM_AMT', 'NCH_BENE_IP_DDCTBL_AMT',
       'NCH_BENE_PTA_COINSRNC_LBLTY_AM', 'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM',
       'CLM_UTLZTN_DAY_CNT', 'NCH_BENE_DSCHRG_DT', 'CLM_DRG_CD',
       'ICD9_DGNS_CD_1', 'ICD9_DGNS_CD_2', 'ICD9_DGNS_CD_3', 'ICD9_DGNS_CD_4',
       'ICD9_DGNS_CD_5', 'ICD9_DGNS_CD_6', 'ICD9_DGNS_CD_7', 'ICD9_DGNS_CD_8',
       'ICD9_DGNS_CD_9', 'ICD9_DGNS_CD_10', 'ICD9_PRCDR_CD_1',
       'ICD9_PRCDR_CD_2', 'ICD9_PRCDR_CD_3', 'ICD9_PRCDR_CD_4',
       'ICD9_PRCDR_CD_5', 'ICD9_PRCDR_CD_6', 'HCPCS_CD_1', 'HCPCS_CD_2',
       'HCPCS_CD_3', 'HCPCS_CD_4', 'HCPCS_CD_5', 'HCPCS_CD_6', 'HCPCS_CD_7',
       'HCPCS_CD_8', 'HCPCS_CD_9', 'HCPCS_CD_10', 'HCPCS_CD_11', 'HCPCS_CD_12',
       'HCPCS_CD_13', 'HCPCS_CD_14', 'HCPCS_CD_15', 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

            clm_id       desynpuf_id sample_number  clm_start_year  \
0  196661176988405  00013D2EFD8E45D1             1            2010   
1  196201177000368  00016F745862898F             1            2009   
2  196661177015632  00016F745862898F             1            2009   
3  196091176981058  00016F745862898F             1            2009   
4  196261176983265  00016F745862898F             1            2010   

   clm_start_month clm_from_datetime  clm_utlztn_day_cnt  \
0                3        2010-03-12                 1.0   
1                4        2009-04-12                 6.0   
2                8        2009-08-31                 2.0   
3                9        2009-09-17                 3.0   
4                6        2010-06-26                 5.0   

  clm_utlztn_day_cnt_grouped prvdr_num prvdr_num_grp  at_physn_npi clm_drg_cd  \
0                   0-3 days    2600GD        2600gd  3.139084e+09        217   
1                   4-7 days    3900MB        3900mb  6.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


DE1_0_2008_BENEFICIARY_SUMMARY_FILE_SAMPLE_1.ZIP JOINER SHAPE (116352, 19)
DE1_0_2008_BENEFICIARY_SUMMARY_FILE_SAMPLE_1.ZIP NEW K SHAPE (116352, 23)
DE1_0_2009_BENEFICIARY_SUMMARY_FILE_SAMPLE_1.ZIP JOINER SHAPE (114538, 19)
DE1_0_2009_BENEFICIARY_SUMMARY_FILE_SAMPLE_1.ZIP NEW K SHAPE (116352, 41)
DE1_0_2010_BENEFICIARY_SUMMARY_FILE_SAMPLE_1.ZIP JOINER SHAPE (112754, 19)
DE1_0_2010_BENEFICIARY_SUMMARY_FILE_SAMPLE_1.ZIP NEW K SHAPE (116352, 59)
NEW REBUILT DF SHAPE:  (116352, 59)
DE1_0_2008_BENEFICIARY_SUMMARY_FILE_SAMPLE_2.ZIP JOINER SHAPE (116395, 19)
DE1_0_2008_BENEFICIARY_SUMMARY_FILE_SAMPLE_2.ZIP NEW K SHAPE (116395, 23)
DE1_0_2009_BENEFICIARY_SUMMARY_FILE_SAMPLE_2.ZIP JOINER SHAPE (114618, 19)
DE1_0_2009_BENEFICIARY_SUMMARY_FILE_SAMPLE_2.ZIP NEW K SHAPE (116395, 41)
DE1_0_2010_BENEFICIARY_SUMMARY_FILE_SAMPLE_2.ZIP JOINER SHAPE (112845, 19)
DE1_0_2010_BENEFICIARY_SUMMARY_FILE_SAMPLE_2.ZIP NEW K SHAPE (116395, 59)
NEW REBUILT DF SHAPE:  (232747, 59)
DEDUPED REBUILT DF SHAPE:  (232747

In [10]:
print(df.shape)
df.head()

(133267, 82)


Unnamed: 0,desynpuf_id,clm_id,segment,clm_from_dt,clm_thru_dt,prvdr_num,clm_pmt_amt,nch_prmry_pyr_clm_pd_amt,at_physn_npi,op_physn_npi,...,hcpcs_cd_37,hcpcs_cd_38,hcpcs_cd_39,hcpcs_cd_40,hcpcs_cd_41,hcpcs_cd_42,hcpcs_cd_43,hcpcs_cd_44,hcpcs_cd_45,sample_number
0,00013D2EFD8E45D1,196661176988405,1,20100312.0,20100313.0,2600GD,4000.0,0.0,3139084000.0,,...,,,,,,,,,,1
1,00016F745862898F,196201177000368,1,20090412.0,20090418.0,3900MB,26000.0,0.0,6476809000.0,,...,,,,,,,,,,1
2,00016F745862898F,196661177015632,1,20090831.0,20090902.0,3900HM,5000.0,0.0,611998500.0,611998500.0,...,,,,,,,,,,1
3,00016F745862898F,196091176981058,1,20090917.0,20090920.0,3913XU,5000.0,0.0,4971603000.0,,...,,,,,,,,,,1
4,00016F745862898F,196261176983265,1,20100626.0,20100701.0,3900MB,16000.0,0.0,6408400000.0,1960860000.0,...,,,,,,,,,,1


In [11]:
print(model_df.shape)
print(model_df.columns)
model_df.head()

(133139, 82)
Index(['clm_id', 'desynpuf_id', 'sample_number', 'clm_start_year',
       'clm_start_month', 'clm_from_datetime', 'clm_utlztn_day_cnt',
       'clm_utlztn_day_cnt_grouped', 'prvdr_num', 'prvdr_num_grp',
       'at_physn_npi', 'clm_drg_cd', 'clm_drg_cd_grp', 'clm_pmt_amt',
       'collapsed_icd9_dgns', 'collapsed_icd9_pcrdr', 'collapsed_hcpcs_cd',
       'collapsed_physn_npi', 'collapsed_icd9_dgns_group',
       'collapsed_icd9_prcdr_group', 'bene_birth_dt', 'bene_sex_ident_cd',
       'bene_race_cd', 'sp_state_code_2008', 'bene_county_cd_2008',
       'bene_death_dt_2008', 'bene_esrd_ind_2008',
       'bene_hi_cvrage_tot_mons_2008', 'bene_smi_cvrage_tot_mons_2008',
       'bene_hmo_cvrage_tot_mons_2008', 'plan_cvrg_mos_num_2008',
       'medreimb_ip_2008', 'benres_ip_2008', 'pppymt_ip_2008',
       'medreimb_op_2008', 'benres_op_2008', 'pppymt_op_2008',
       'medreimb_car_2008', 'benres_car_2008', 'pppymt_car_2008',
       'chronic_condition_count_2008', 'sp_state_code_2

Unnamed: 0,clm_id,desynpuf_id,sample_number,clm_start_year,clm_start_month,clm_from_datetime,clm_utlztn_day_cnt,clm_utlztn_day_cnt_grouped,prvdr_num,prvdr_num_grp,...,pppymt_op_2010,medreimb_car_2010,benres_car_2010,pppymt_car_2010,chronic_condition_count_2010,collapsed_states,collapsed_counties,death_ind_2008,death_ind_2009,death_ind_2010
0,196661176988405,00013D2EFD8E45D1,1,2010,3,2010-03-12,1.0,0-3 days,2600GD,2600gd,...,0.0,90.0,30.0,0.0,9.0,26.0,950.0,0,0,0
1,196201177000368,00016F745862898F,1,2009,4,2009-04-12,6.0,4-7 days,3900MB,3900mb,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0
2,196661177015632,00016F745862898F,1,2009,8,2009-08-31,2.0,0-3 days,3900HM,3900hm,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0
3,196091176981058,00016F745862898F,1,2009,9,2009-09-17,3.0,0-3 days,3913XU,Other,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0
4,196261176983265,00016F745862898F,1,2010,6,2010-06-26,5.0,4-7 days,3900MB,3900mb,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0


In [12]:
model_df['collapsed_icd9_prcdr_group']

0                                
1                                
2                     709 V58 618
3                                
4                             E88
5                                
6                             865
7                                
8                                
9                         73. 311
10                               
11                50. 403 428 518
12                               
13                    990 401 530
14                               
15                294 V12 452 V49
16                               
17                    967 272 491
18                               
19                            793
20                               
21                               
22                               
23                            451
24                               
25                               
26                        862 707
27                               
28                               
29            

399: <br>
Control of hemorrhage, not otherwise specified
Freeing of vessel
Hemodialysis
Injection of sclerosing agent into vein
Insertion of non-drug-eluting peripheral (non-coronary) vessel stent(s)
Insertion of vessel-to-vessel cannula
Other operations on vessels
Other perfusion
Replacement of vessel-to-vessel cannula
Total body perfusion

In [13]:
string = " ".join([v for v in model_df['collapsed_icd9_prcdr_group'] if pd.notnull(v)])
li = [l for l in string.split(" ") if l != '']
li_val = []
counts = []
for x in set(li):
    li_val.append(x)
    counts.append(li.count(x))
pd.DataFrame({'code' : li_val,
            'count' : counts}).sort_values(by = 'count', ascending = False).head(15)

Unnamed: 0,code,count
143,401,6754
656,815,5592
263,272,4444
633,414,4324
777,250,4213
931,990,4150
38,427,3745
818,389,3620
751,428,3516
687,451,3234


In [14]:
# adjust for columns to filter (only string friendly at the moment)
filter_dict = {'collapsed_icd9_dgns_group': None,
               'collapsed_icd9_prcdr_group': None, # '815' hip replacement
               'clm_drg_cd': None,
               'collapsed_states': None,
               'collapsed_counties': None,
               'collapsed_physn_npi': None,
               'death_ind_2008': None, #'1'
              }
for k in list(filter_dict.keys()):
    if filter_dict[k] != None:
        model_df = model_df.loc[model_df[k].str.contains(filter_dict[k]),:]
        
model_df.shape
# model_df.to_csv('hip_replacement_modeld_df.csv')

(133139, 82)

In [15]:
filename = 'unfiltered_model_df.pickle'
pickle.dump(model_df, open(filename,'wb'))

In [16]:
model_df.head()

Unnamed: 0,clm_id,desynpuf_id,sample_number,clm_start_year,clm_start_month,clm_from_datetime,clm_utlztn_day_cnt,clm_utlztn_day_cnt_grouped,prvdr_num,prvdr_num_grp,...,pppymt_op_2010,medreimb_car_2010,benres_car_2010,pppymt_car_2010,chronic_condition_count_2010,collapsed_states,collapsed_counties,death_ind_2008,death_ind_2009,death_ind_2010
0,196661176988405,00013D2EFD8E45D1,1,2010,3,2010-03-12,1.0,0-3 days,2600GD,2600gd,...,0.0,90.0,30.0,0.0,9.0,26.0,950.0,0,0,0
1,196201177000368,00016F745862898F,1,2009,4,2009-04-12,6.0,4-7 days,3900MB,3900mb,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0
2,196661177015632,00016F745862898F,1,2009,8,2009-08-31,2.0,0-3 days,3900HM,3900hm,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0
3,196091176981058,00016F745862898F,1,2009,9,2009-09-17,3.0,0-3 days,3913XU,Other,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0
4,196261176983265,00016F745862898F,1,2010,6,2010-06-26,5.0,4-7 days,3900MB,3900mb,...,0.0,930.0,150.0,0.0,7.0,39.0,230.0,0,0,0


In [67]:
### example of how to import pickle object

# filename = 'unfiltered_model_df.pickle'
# with open(filename, 'rb') as d:
#     model_df = pickle.load(d)