# Business Analytics - Analytics Cup 21
Team SGS

# Setup conda environment
To ensure that the following code works, please set up a virtual conda environment in which the packages listed below are installed. To do this, run the following commands in your terminal. After that, you can start the Jupyter notebook.
### Create an environment called "py35"
`conda create -n py35 python=3.5`
### Install numpy and other necessary packages into the "py35" environment
`conda install matplotlib pandas numpy jupyter nb_conda -n py35`
### Activate the environment
`conda activate py35`
### Start Jupyter notebook
`jupyter notebook`

# Importing libraries and setting the working directory
Set `project_folder` to the folder where the CSV files are located.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Folder that holds payments.csv and physicians.csv. The ANALYTICS_CUP_DIR
# environment variable overrides the default, so the notebook can run on a
# machine where the original author's path does not exist without editing code.
project_folder = os.environ.get(
    'ANALYTICS_CUP_DIR',
    '/Users/mgattinger/Dropbox/TUM/Master/2_Semester/BA/analytics_cup',
)
# The CSV files are read and written with relative paths, so make the data
# folder the current working directory.
os.chdir(project_folder)

# Read CSVs

In [2]:
# Payments table, keyed by Record_ID. Only rows whose amount is strictly
# below 5,000,000 USD are kept (rows with a missing amount fail the
# comparison and are dropped as well).
payments_df = (
    pd.read_csv('payments.csv', delimiter=',', low_memory=False)
    .set_index('Record_ID')
    .loc[lambda frame: frame['Total_Amount_of_Payment_USDollars'] < 5000000]
)

# Physicians table, keyed by its 'id' column.
physicians_df = pd.read_csv(
    'physicians.csv', delimiter=',', low_memory=False
).set_index('id')

In [3]:
# Show which columns each table provides before any feature engineering.
print(payments_df.columns)
print(physicians_df.columns)

Index(['Physician_ID', 'Company_ID', 'Total_Amount_of_Payment_USDollars',
       'Date', 'Number_of_Payments', 'Form_of_Payment_or_Transfer_of_Value',
       'Nature_of_Payment_or_Transfer_of_Value', 'City_of_Travel',
       'State_of_Travel', 'Country_of_Travel', 'Ownership_Indicator',
       'Third_Party_Recipient', 'Charity', 'Third_Party_Covered',
       'Contextual_Information', 'Related_Product_Indicator', 'Product_Code_1',
       'Product_Code_2', 'Product_Code_3', 'Product_Type_1', 'Product_Type_2',
       'Product_Type_3', 'Product_Name_1', 'Product_Name_2', 'Product_Name_3',
       'Product_Category_1', 'Product_Category_2', 'Product_Category_3'],
      dtype='object')
Index(['set', 'First_Name', 'Middle_Name', 'Last_Name', 'Name_Suffix', 'City',
       'State', 'Zipcode', 'Country', 'Province', 'Primary_Specialty',
       'License_State_1', 'License_State_2', 'License_State_3',
       'License_State_4', 'License_State_5'],
      dtype='object')


# Data preparation

## remove "Allopathic & Osteopathic Physicians|"

In [4]:
# Before: preview the raw Primary_Specialty strings prior to any cleaning.
physicians_df['Primary_Specialty'].head()

id
1    Allopathic & Osteopathic Physicians|Obstetrics...
2    Allopathic & Osteopathic Physicians|Anesthesio...
3    Podiatric Medicine & Surgery Service Providers...
4    Allopathic & Osteopathic Physicians|Internal M...
5    Allopathic & Osteopathic Physicians|Orthopaedi...
Name: Primary_Specialty, dtype: object

In [5]:
# Remove the "Allopathic & Osteopathic Physicians|" prefix.
# The pattern escapes the pipe character, so it has to be interpreted as a
# regular expression. regex=True is passed explicitly because pandas >= 2.0
# treats the pattern as a literal string by default, in which case the
# backslash would never match and the column would stay unchanged.
physicians_df['Primary_Specialty'] = physicians_df['Primary_Specialty'].str.replace(
    r'Allopathic & Osteopathic Physicians\|', '', regex=True
)

In [6]:
# After: preview Primary_Specialty once the prefix has been stripped.
physicians_df['Primary_Specialty'].head()

id
1                   Obstetrics & Gynecology|Gynecology
2                                       Anesthesiology
3    Podiatric Medicine & Surgery Service Providers...
4             Internal Medicine|Cardiovascular Disease
5                  Orthopaedic Surgery|Sports Medicine
Name: Primary_Specialty, dtype: object

## remove "\|.*"

In [7]:
# Keep only the leading specialty: drop the first "|" and everything after it.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would leave the sub-specialties in place.
physicians_df['Primary_Specialty'] = physicians_df['Primary_Specialty'].str.replace(
    r'\|.*', '', regex=True
)
# after
physicians_df['Primary_Specialty'].head()

id
1                           Obstetrics & Gynecology
2                                    Anesthesiology
3    Podiatric Medicine & Surgery Service Providers
4                                 Internal Medicine
5                               Orthopaedic Surgery
Name: Primary_Specialty, dtype: object

## split sets

In [8]:
# Split the physicians by their 'set' label and expose the id index under the
# name 'Physician_ID', which is the key the payment aggregates are grouped by.
# (An earlier variant that sampled a separate validation subset from the
# training rows is disabled.)
train_physicians_df = (
    physicians_df
    .loc[physicians_df['set'] == 'train']
    .rename_axis('Physician_ID')
)

test_physicians_df = (
    physicians_df
    .loc[physicians_df['set'] == 'test']
    .rename_axis('Physician_ID')
)

## Mutate Ownership_Indicator

In [9]:
# Encode the ownership flag numerically: "No" -> 0, "Yes" -> 1.
payments_df['Ownership_Indicator'] = (
    payments_df['Ownership_Indicator']
    .replace("No", 0)
    .replace("Yes", 1)
)


# Feature generation
## Get ownership interest df

In [10]:
# Largest Ownership_Indicator value per physician. With the 0/1 encoding
# applied earlier this is 1 as soon as one payment is flagged "Yes".
ownership_interest_df = (
    payments_df
    .groupby("Physician_ID")[["Ownership_Indicator"]]
    .max()
)
ownership_interest_df.head()

Unnamed: 0_level_0,Ownership_Indicator
Physician_ID,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0


## Get sum of payments by Physician_ID

In [11]:
# Total amount received per physician, summed over all payment records.
total_payments_df = (
    payments_df
    .groupby("Physician_ID")[["Total_Amount_of_Payment_USDollars"]]
    .sum()
    .rename(columns={"Total_Amount_of_Payment_USDollars": "total_payments"})
)
total_payments_df.head()

Unnamed: 0_level_0,total_payments
Physician_ID,Unnamed: 1_level_1
1,952.93
2,120924.92
3,3523.93
4,26745.61
5,25411.21


## Get number of payments by Physician_ID

In [12]:
# Number of payment records per physician (records with a non-missing amount).
number_payments_df = (
    payments_df
    .groupby("Physician_ID")[["Total_Amount_of_Payment_USDollars"]]
    .count()
    .rename(columns={"Total_Amount_of_Payment_USDollars": "number_of_payments"})
)
number_payments_df.head()

Unnamed: 0_level_0,number_of_payments
Physician_ID,Unnamed: 1_level_1
1,47
2,154
3,132
4,411
5,98


## Get top nature of payment by Physician_ID

In [13]:
# Sum the payments per (physician, nature of payment) and keep, for every
# physician, the nature with the largest summed amount together with that sum.
top_nature_df = (
    payments_df[[
        "Nature_of_Payment_or_Transfer_of_Value",
        "Total_Amount_of_Payment_USDollars",
        "Physician_ID",
    ]]
    .groupby(by=["Physician_ID", "Nature_of_Payment_or_Transfer_of_Value"])
    .sum()
    .reset_index()
    .groupby(by=["Physician_ID"])
    # one row per physician: the first row holding the maximum total
    .apply(lambda per_physician: per_physician.nlargest(1, 'Total_Amount_of_Payment_USDollars', 'first'))
    .reset_index(drop=True)
    .set_index('Physician_ID')
    .rename(columns={
        "Nature_of_Payment_or_Transfer_of_Value": "top_nature",
        "Total_Amount_of_Payment_USDollars": "total_of_top_nature",
    })
)
top_nature_df.head()

Unnamed: 0_level_0,top_nature,total_of_top_nature
Physician_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Food and Beverage,842.44
2,Compensation for services other than consultin...,45190.7
3,Food and Beverage,3523.93
4,Compensation for services other than consultin...,18011.81
5,Education,8792.0


## Create year and month column for grouping

In [14]:
# Derive Year and Month columns from the Date string so payments can be
# grouped by period.
# NOTE(review): the patterns assume dates shaped like MM/DD/YYYY - confirm.
payments_per_month_df = payments_df[["Date","Physician_ID","Total_Amount_of_Payment_USDollars"]]
# Both patterns rely on "." matching any character, so they must be evaluated
# as regular expressions. regex=True is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the full date in both columns.
# Year: strip the leading "xx/xx/" part.
payments_per_month_df_year = payments_per_month_df["Date"].str.replace(r'../../', '', regex=True)
# Month: strip the trailing "/xx/xxxx" part.
payments_per_month_df_month = payments_per_month_df["Date"].str.replace(r'/../....', '', regex=True)
payments_per_month_df.insert(1, "Year", payments_per_month_df_year, False)
payments_per_month_df.insert(2, "Month", payments_per_month_df_month, False)
payments_per_month_df = payments_per_month_df.drop(['Date'], axis=1)
# Year is a string; the lexicographic comparison keeps years after 2013.
payments_per_month_df = payments_per_month_df.loc[payments_per_month_df['Year'] > '2013']
payments_per_month_df.head()

Unnamed: 0_level_0,Year,Month,Physician_ID,Total_Amount_of_Payment_USDollars
Record_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
80021,2014,1,3328,105.87
80022,2014,1,2439,42.33
80023,2014,1,3164,12.93
80024,2014,1,968,19.45
80025,2014,1,1608,10.26


## Get range of number of payments by Physician_ID

In [15]:
# Spread of the yearly payment count per physician (largest yearly count minus
# smallest yearly count), based on payments_per_month_df.
# Step 1: count the payments per (Year, Month, Physician_ID).
range_number_of_payments_df = payments_per_month_df.groupby(by=["Year","Month","Physician_ID"]).agg(['count'])
# Step 2: add the monthly counts up to one count per (Year, Physician_ID).
range_number_of_payments_df = range_number_of_payments_df.groupby(by=["Year","Physician_ID"]).agg(['sum'])
# Step 3: per physician, take the largest and the smallest yearly count.
range_number_of_payments_df = range_number_of_payments_df.groupby(by=["Physician_ID"]).agg(['max','min'])
# Every agg() call added a column level, so the values sit under a
# four-level column key.
range_number_of_payments_df_max = range_number_of_payments_df['Total_Amount_of_Payment_USDollars']['count']['sum']['max']
range_number_of_payments_df_min = range_number_of_payments_df['Total_Amount_of_Payment_USDollars']['count']['sum']['min']
range_number_of_payments_df['range_count'] = range_number_of_payments_df_max - range_number_of_payments_df_min
# Keep only the new feature and flatten the column index to a single level.
range_number_of_payments_df = range_number_of_payments_df.drop(['Total_Amount_of_Payment_USDollars'],axis=1)
range_number_of_payments_df.columns = range_number_of_payments_df.columns.droplevel(level = [1,2,3])
range_number_of_payments_df.head()

Unnamed: 0_level_0,range_count
Physician_ID,Unnamed: 1_level_1
1,8
2,45
3,12
4,50
5,18


## Get range of payments by Physician_ID

In [16]:
# Spread of the yearly payment total per physician (largest yearly total minus
# smallest yearly total), based on payments_per_month_df.
# Step 1: sum the amounts per (Year, Month, Physician_ID).
range_of_payments_df = payments_per_month_df.groupby(by=["Year","Month","Physician_ID"]).agg(['sum'])
# Step 2: add the monthly sums up to one total per (Year, Physician_ID).
range_of_payments_df = range_of_payments_df.groupby(by=["Year","Physician_ID"]).agg(['sum'])
# Step 3: per physician, take the largest and the smallest yearly total.
range_of_payments_df = range_of_payments_df.groupby(by=["Physician_ID"]).agg(['max','min'])
# Every agg() call added a column level, so the values sit under a
# four-level column key.
range_of_payments_df_max = range_of_payments_df['Total_Amount_of_Payment_USDollars']['sum']['sum']['max']
range_of_payments_df_min = range_of_payments_df['Total_Amount_of_Payment_USDollars']['sum']['sum']['min']
range_of_payments_df['range_total'] = range_of_payments_df_max - range_of_payments_df_min
# Keep only the new feature and flatten the column index to a single level.
range_of_payments_df = range_of_payments_df.drop(['Total_Amount_of_Payment_USDollars'],axis=1)
range_of_payments_df.columns = range_of_payments_df.columns.droplevel(level = [1,2,3])
range_of_payments_df.head()

Unnamed: 0_level_0,range_total
Physician_ID,Unnamed: 1_level_1
1,170.41
2,35030.52
3,462.42
4,13247.09
5,9079.72


## Group by Company_ID and get sum, count and mean of payments by company 

In [17]:
# Sum, count and mean of the payment amount for every
# (physician, company) pair.
company_df = (
    payments_df
    .groupby(by=['Physician_ID', 'Company_ID'])[["Total_Amount_of_Payment_USDollars"]]
    .agg(['sum', 'count', 'mean'])
)
company_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars,Total_Amount_of_Payment_USDollars
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,mean
Physician_ID,Company_ID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,69,43.85,3,14.616667
1,122,44.3,3,14.766667
1,123,22.64,1,22.64
1,138,16.88,1,16.88
1,140,44.53,3,14.843333


## Get top paying company by physician

In [18]:
# For every physician, pick the company with the largest summed payments and
# keep its id as a string-valued feature.
top_companys = (
    company_df['Total_Amount_of_Payment_USDollars']['sum']
    .reset_index()
    .groupby(by=["Physician_ID"])
    # one row per physician: the first row holding the maximum sum
    .apply(lambda per_physician: per_physician.nlargest(1, 'sum', 'first'))
    .reset_index(drop=True)
    .set_index('Physician_ID')
    .drop(['sum'], axis=1)
)
# Stored as text rather than as a number.
top_companys['Company_ID'] = top_companys['Company_ID'].map(str)
top_companys = top_companys.rename(columns={"Company_ID": "top_company"})
top_companys.head()

Unnamed: 0_level_0,top_company
Physician_ID,Unnamed: 1_level_1
1,550
2,601
3,340
4,18
5,442


## Get number of paying companies by physician

In [19]:
# Number of (physician, company) rows per physician, i.e. how many different
# companies paid each physician.
# company_df has one row per (Physician_ID, Company_ID); counting per physician
# adds a third column level named 'count'.
companys_pay_count_df = company_df.groupby('Physician_ID').agg(['count'])
# The count taken over the per-company 'mean' column is used as the feature.
companys_pay_count_df_count = companys_pay_count_df['Total_Amount_of_Payment_USDollars']['mean']['count']
companys_pay_count_df['pay_count'] = companys_pay_count_df_count
# Keep only the new feature and flatten the column index to a single level.
companys_pay_count_df = companys_pay_count_df.drop(['Total_Amount_of_Payment_USDollars'],axis=1)
companys_pay_count_df.columns = companys_pay_count_df.columns.droplevel(level = [1,2])
companys_pay_count_df.head()

Unnamed: 0_level_0,pay_count
Physician_ID,Unnamed: 1_level_1
1,18
2,13
3,15
4,32
5,13


## Get std of the per-company mean payment by physician

In [20]:
# Standard deviation, per physician, of the per-company mean payment.
# company_df has one row per (Physician_ID, Company_ID); aggregating per
# physician adds a third column level named 'std'.
companys_pay_std_df = company_df.groupby('Physician_ID').agg(['std'])
# The std taken over the per-company 'mean' column is used as the feature.
companys_pay_std_df_std = companys_pay_std_df['Total_Amount_of_Payment_USDollars']['mean']['std']
companys_pay_std_df['std'] = companys_pay_std_df_std
# Keep only the new feature and flatten the column index to a single level.
companys_pay_std_df = companys_pay_std_df.drop(['Total_Amount_of_Payment_USDollars'],axis=1)
companys_pay_std_df.columns = companys_pay_std_df.columns.droplevel(level = [1,2])
companys_pay_std_df.head()

Unnamed: 0_level_0,std
Physician_ID,Unnamed: 1_level_1
1,6.498346
2,478.19977
3,35.449082
4,129.869365
5,1741.832451


## Get Related Product Indicator by physician

In [21]:
# For every physician, find the Related_Product_Indicator value that occurs on
# the most payment records, and keep that value with its record count.
rpi_df = (
    payments_df[['Related_Product_Indicator', 'Physician_ID', 'Total_Amount_of_Payment_USDollars']]
    .groupby(by=['Physician_ID', 'Related_Product_Indicator'])
    .count()
    .reset_index()
    .groupby(by=["Physician_ID"])
    # one row per physician: the first row holding the maximum count
    .apply(lambda per_physician: per_physician.nlargest(1, 'Total_Amount_of_Payment_USDollars', 'first'))
    .reset_index(drop=True)
    .set_index('Physician_ID')
    .rename(columns={
        "Related_Product_Indicator": "top_rpi",
        "Total_Amount_of_Payment_USDollars": "rpi_count",
    })
)
rpi_df.head()

Unnamed: 0_level_0,top_rpi,rpi_count
Physician_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Covered,26
2,Covered,88
3,Yes,75
4,Yes,231
5,Covered,41


## Generate Form_of_Payment_or_Transfer_of_Value (fop) dummy columns

In [22]:
# List the distinct forms of payment; one count column is built for each below.
print(payments_df.Form_of_Payment_or_Transfer_of_Value.unique())

['In-kind items and services' 'Cash or cash equivalent'
 'Dividend, profit or other return on investment'
 'Stock, stock option, or any other ownership interest'
 'Any other ownership interest' 'Stock' 'Stock option']


## payments grouped by fops and physician_id

In [23]:
# Number of payment records per (physician, form of payment), laid out as one
# row per pair and indexed by Physician_ID only.
fops_df = (
    payments_df[['Physician_ID', 'Form_of_Payment_or_Transfer_of_Value', 'Total_Amount_of_Payment_USDollars']]
    .groupby(by=['Physician_ID', 'Form_of_Payment_or_Transfer_of_Value'])
    .count()
    .rename(columns={"Total_Amount_of_Payment_USDollars": "fop_count"})
    # move the form of payment out of the index, keep Physician_ID as index
    .reset_index(level='Form_of_Payment_or_Transfer_of_Value')
)
fops_df.head()

Unnamed: 0_level_0,Form_of_Payment_or_Transfer_of_Value,fop_count
Physician_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Cash or cash equivalent,4
1,In-kind items and services,43
2,Cash or cash equivalent,72
2,In-kind items and services,82
3,Cash or cash equivalent,39


## Create dummy df

In [24]:
# Skeleton frame for the per-form count columns: it reuses the physician index
# of rpi_df and carries a single placeholder column.
fops_dummy_df = (
    rpi_df
    .drop(columns=['top_rpi', 'rpi_count'])
    .assign(dummy=0)
)
fops_dummy_df.head()

Unnamed: 0_level_0,dummy
Physician_ID,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0


### Cash or cash equivalent

In [25]:
# Per-physician number of 'Cash or cash equivalent' payments as column 'cash'.
cash_df = fops_df[fops_df['Form_of_Payment_or_Transfer_of_Value'].eq('Cash or cash equivalent')]
fops_dummy_df = (
    pd.concat([fops_dummy_df, cash_df], axis=1, sort=False)
    .rename(columns={"fop_count": "cash"})
    # the form label and the placeholder column are no longer needed
    .drop(columns=['Form_of_Payment_or_Transfer_of_Value', 'dummy'])
)
# Physicians without a payment of this form get a count of 0.
fops_dummy_df['cash'] = fops_dummy_df['cash'].fillna(0)

### In-kind items and services

In [26]:
# Per-physician number of 'In-kind items and services' payments as column
# 'services'.
services_df = fops_df[fops_df['Form_of_Payment_or_Transfer_of_Value'].eq('In-kind items and services')]
fops_dummy_df = (
    pd.concat([fops_dummy_df, services_df], axis=1, sort=False)
    .rename(columns={"fop_count": "services"})
    .drop(columns=['Form_of_Payment_or_Transfer_of_Value'])
)
# Physicians without a payment of this form get a count of 0.
fops_dummy_df['services'] = fops_dummy_df['services'].fillna(0)

### Stock

In [27]:
# Per-physician number of 'Stock' payments as column 'stock'.
stock_df = fops_df[fops_df['Form_of_Payment_or_Transfer_of_Value'].eq('Stock')]
fops_dummy_df = (
    pd.concat([fops_dummy_df, stock_df], axis=1, sort=False)
    .rename(columns={"fop_count": "stock"})
    .drop(columns=['Form_of_Payment_or_Transfer_of_Value'])
)
# Physicians without a payment of this form get a count of 0.
fops_dummy_df['stock'] = fops_dummy_df['stock'].fillna(0)


In [28]:
#fops_dummy_df.iloc[1713]

### Stock option

In [29]:
# Per-physician number of 'Stock option' payments as column 'stock_opt'.
stock_opt_df = fops_df[fops_df['Form_of_Payment_or_Transfer_of_Value'].eq('Stock option')]
fops_dummy_df = (
    pd.concat([fops_dummy_df, stock_opt_df], axis=1, sort=False)
    .rename(columns={"fop_count": "stock_opt"})
    .drop(columns=['Form_of_Payment_or_Transfer_of_Value'])
)
# Physicians without a payment of this form get a count of 0.
fops_dummy_df['stock_opt'] = fops_dummy_df['stock_opt'].fillna(0)

In [30]:
#fops_dummy_df.iloc[1033]

### Any other ownership interest

In [31]:
# Per-physician number of 'Any other ownership interest' payments as column
# 'any_ownership'.
any_ownership_df = fops_df[fops_df['Form_of_Payment_or_Transfer_of_Value'].eq('Any other ownership interest')]
fops_dummy_df = (
    pd.concat([fops_dummy_df, any_ownership_df], axis=1, sort=False)
    .rename(columns={"fop_count": "any_ownership"})
    .drop(columns=['Form_of_Payment_or_Transfer_of_Value'])
)
# Physicians without a payment of this form get a count of 0.
fops_dummy_df['any_ownership'] = fops_dummy_df['any_ownership'].fillna(0)

In [32]:
#fops_dummy_df.iloc[1058]

### Dividend, profit or other return on investment

In [33]:
# Per-physician number of 'Dividend, profit or other return on investment'
# payments as column 'dividend'.
dividend_df = fops_df[fops_df['Form_of_Payment_or_Transfer_of_Value'].eq('Dividend, profit or other return on investment')]
fops_dummy_df = (
    pd.concat([fops_dummy_df, dividend_df], axis=1, sort=False)
    .rename(columns={"fop_count": "dividend"})
    .drop(columns=['Form_of_Payment_or_Transfer_of_Value'])
)
# Physicians without a payment of this form get a count of 0.
fops_dummy_df['dividend'] = fops_dummy_df['dividend'].fillna(0)

In [34]:
#fops_dummy_df.iloc[1058]

### Stock, stock option, or any other ownership interest

In [35]:
# Per-physician number of 'Stock, stock option, or any other ownership
# interest' payments as column 'stock_or_other'.
stock_or_other_df = fops_df[fops_df['Form_of_Payment_or_Transfer_of_Value'].eq('Stock, stock option, or any other ownership interest')]
fops_dummy_df = (
    pd.concat([fops_dummy_df, stock_or_other_df], axis=1, sort=False)
    .rename(columns={"fop_count": "stock_or_other"})
    .drop(columns=['Form_of_Payment_or_Transfer_of_Value'])
)
# Physicians without a payment of this form get a count of 0.
fops_dummy_df['stock_or_other'] = fops_dummy_df['stock_or_other'].fillna(0)

In [36]:
#fops_dummy_df.iloc[1430]

In [37]:
# Preview the per-form count columns built in the preceding cells.
fops_dummy_df.head()

Unnamed: 0_level_0,cash,services,stock,stock_opt,any_ownership,dividend,stock_or_other
Physician_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4.0,43.0,0.0,0.0,0.0,0.0,0.0
2,72.0,82.0,0.0,0.0,0.0,0.0,0.0
3,39.0,93.0,0.0,0.0,0.0,0.0,0.0
4,131.0,280.0,0.0,0.0,0.0,0.0,0.0
5,16.0,82.0,0.0,0.0,0.0,0.0,0.0


## Get top form of payment by physician

In [38]:
# Reduce fops_df to one row per physician: the form of payment with the most
# records ('top_fop') and its record count.
# Note: fops_df is overwritten here, so the per-form cells above cannot be
# re-run after this cell without rebuilding fops_df first.
fops_df = (
    fops_df
    .reset_index()
    .groupby('Physician_ID')
    # one row per physician: the first row holding the maximum count
    .apply(lambda per_physician: per_physician.nlargest(1, 'fop_count', 'first'))
    .reset_index(drop=True)
    .set_index('Physician_ID')
    .rename(columns={"Form_of_Payment_or_Transfer_of_Value": "top_fop"})
)
fops_df.head()

Unnamed: 0_level_0,top_fop,fop_count
Physician_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,In-kind items and services,43
2,In-kind items and services,82
3,In-kind items and services,93
4,In-kind items and services,280
5,In-kind items and services,82


In [39]:
#fops_df.iloc[232]

In [40]:
# Assemble all per-physician feature tables side by side; every table is
# indexed by Physician_ID, so the rows are aligned on that index.
features = pd.concat(
    objs=[
        # payment volume
        total_payments_df,
        number_payments_df,
        top_nature_df,
        # yearly spread
        range_number_of_payments_df,
        range_of_payments_df,
        # paying companies
        top_companys,
        companys_pay_count_df,
        companys_pay_std_df,
        # related product indicator
        rpi_df,
        # form of payment
        fops_dummy_df,
        fops_df,
    ],
    axis=1,
    sort=False,
)

In [41]:
# List the names of all generated feature columns.
print(features.columns)

Index(['total_payments', 'number_of_payments', 'top_nature',
       'total_of_top_nature', 'range_count', 'range_total', 'top_company',
       'pay_count', 'std', 'top_rpi', 'rpi_count', 'cash', 'services', 'stock',
       'stock_opt', 'any_ownership', 'dividend', 'stock_or_other', 'top_fop',
       'fop_count'],
      dtype='object')


In [42]:
# Of the physician attributes, only State and Primary_Specialty are kept as
# model inputs.
train_physicians_df = train_physicians_df.loc[:, ['State', 'Primary_Specialty']]
test_physicians_df = test_physicians_df.loc[:, ['State', 'Primary_Specialty']]

In [43]:
#train_physicians_df = pd.concat([train_physicians_df,ownership_interest_df], axis=1, sort=False)

#train_physicians_df = pd.concat([train_physicians_df,ownership_interest_df], axis=1, sort=False)

In [44]:
# Attach the generated features to both splits. The inner join keeps only the
# physicians that are present in the features table.
train_physicians_df = pd.concat(
    [train_physicians_df, features],
    axis=1,
    join='inner',
)
test_physicians_df = pd.concat(
    [test_physicians_df, features],
    axis=1,
    join='inner',
)

In [45]:
# Preview the first rows only; rendering the whole training table bloats the
# notebook without adding information.
train_physicians_df.head(10)

Unnamed: 0_level_0,State,Primary_Specialty,total_payments,number_of_payments,top_nature,total_of_top_nature,range_count,range_total,top_company,pay_count,...,rpi_count,cash,services,stock,stock_opt,any_ownership,dividend,stock_or_other,top_fop,fop_count
Physician_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,FL,Obstetrics & Gynecology,952.93,47,Food and Beverage,842.44,8,170.41,550,18,...,26,4.0,43.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,43
2,MD,Anesthesiology,120924.92,154,Compensation for services other than consultin...,45190.70,45,35030.52,601,13,...,88,72.0,82.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,82
3,NY,Podiatric Medicine & Surgery Service Providers,3523.93,132,Food and Beverage,3523.93,12,462.42,340,15,...,75,39.0,93.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,93
4,GA,Internal Medicine,26745.61,411,Compensation for services other than consultin...,18011.81,50,13247.09,18,32,...,231,131.0,280.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,280
5,NM,Orthopaedic Surgery,25411.21,98,Education,8792.00,18,9079.72,442,13,...,41,16.0,82.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,82
6,NY,Dental Providers,2680.80,33,Food and Beverage,1492.83,8,1039.88,42,13,...,16,6.0,27.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,27
7,KS,Obstetrics & Gynecology,5822.78,55,Education,2500.00,18,4355.49,278,7,...,28,3.0,52.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,52
8,NJ,Internal Medicine,2396.00,153,Food and Beverage,2396.00,27,349.58,75,16,...,120,0.0,153.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,153
9,MI,Pediatrics,921.98,28,Food and Beverage,862.08,8,237.20,2431,14,...,16,1.0,27.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,27
10,TX,Orthopaedic Surgery,167075.04,135,Consulting Fee,104436.66,15,68416.48,267,34,...,72,41.0,94.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,94


In [46]:
# Preview the first rows only; rendering the whole test table bloats the
# notebook without adding information.
test_physicians_df.head(10)

Unnamed: 0_level_0,State,Primary_Specialty,total_payments,number_of_payments,top_nature,total_of_top_nature,range_count,range_total,top_company,pay_count,...,rpi_count,cash,services,stock,stock_opt,any_ownership,dividend,stock_or_other,top_fop,fop_count
Physician_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,FL,Pain Medicine,5775.53,539,Food and Beverage,5626.66,96,514.07,197,35,...,360,56.0,483.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,483
18,NY,Internal Medicine,92123.24,584,Consulting Fee,39516.38,68,42697.24,2285,46,...,417,100.0,484.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,484
42,NY,Obstetrics & Gynecology,3159.49,180,Food and Beverage,3066.04,16,256.09,190,27,...,108,8.0,172.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,172
46,MI,,1614.34,64,Food and Beverage,1614.34,15,547.14,897,17,...,49,2.0,62.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,62
47,MD,Internal Medicine,6494.00,296,Food and Beverage,6444.00,96,1557.50,123,31,...,253,21.0,275.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,275
53,IN,Thoracic Surgery (Cardiothoracic Vascular Surg...,89072.96,469,Compensation for services other than consultin...,53500.00,46,22975.57,100,41,...,290,169.0,300.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,300
57,NY,Family Medicine,1045.44,66,Food and Beverage,1045.44,9,144.52,150,23,...,46,2.0,64.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,64
58,SC,Surgery,1468.60,19,Food and Beverage,795.43,4,857.25,700,13,...,12,1.0,18.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,18
62,TX,Internal Medicine,42219.11,107,Compensation for services other than consultin...,24550.00,36,21484.17,2112,14,...,96,35.0,72.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,72
76,MI,Urology,4120.79,107,Food and Beverage,3965.24,22,1071.94,69,28,...,60,9.0,98.0,0.0,0.0,0.0,0.0,0.0,In-kind items and services,98


In [48]:
# Write both feature tables to CSV files in the current working directory.
# The stray second argument `sw` was an undefined name (it would have been
# passed as the separator) and raised a NameError, so it has been removed;
# both files now use the default comma separator.
train_physicians_df.to_csv('train_physicians_df_22_Fs.csv')
test_physicians_df.to_csv('test_physicians_df_22_Fs.csv')