In [None]:
# LOAD AND CLEAN PROVIDER INFO

In [550]:
import pandas as pd
import glob
import os
import copy

# Read files with different encodings
def read_csv_with_encodings(file_path):
    # List of common encodings to try
    encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'utf-16', 'utf-32']
    
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding, low_memory = False)
            print(f"Successfully read the file with {encoding} encoding")
            return df
        except UnicodeDecodeError:
            print(f"Failed to read with {encoding} encoding")
    
    print("Could not read the file with any of the common encodings")
    return None

# Define the directory path
directory = '/Users/apple/Documents/APD/provider_info/original/'

# Get all CSV files matching the pattern
files = glob.glob(os.path.join(directory, 'ProviderInfo_*.csv'))

# Dictionaries to store original DataFrames
raw_provider_info = {}

# Process each file
for file_path in files:
    # Extract the year from the filename
    filename = os.path.basename(file_path)
    year = filename.replace('ProviderInfo_', '').replace('.csv', '')
    
    # Read the CSV file
    df = read_csv_with_encodings(file_path)
    
    if df is not None:
        # Generate year column
        df['year'] = year
        
        # Store the original DataFrame
        raw_provider_info[f'raw_pi_{year}'] = df

# Print out the keys of the dictionary to verify
print("Raw dataframes:", raw_provider_info.keys())

Successfully read the file with utf-8 encoding
Failed to read with utf-8 encoding
Successfully read the file with latin-1 encoding
Successfully read the file with utf-8 encoding
Failed to read with utf-8 encoding
Successfully read the file with latin-1 encoding
Failed to read with utf-8 encoding
Successfully read the file with latin-1 encoding
Successfully read the file with utf-8 encoding
Failed to read with utf-8 encoding
Successfully read the file with latin-1 encoding
Raw dataframes: dict_keys(['raw_pi_2016', 'raw_pi_2017', 'raw_pi_2015', 'raw_pi_2020', 'raw_pi_2021', 'raw_pi_2019', 'raw_pi_2018'])


In [552]:
# lowercase for all column names
for key in raw_provider_info:
    raw_provider_info[key].columns = raw_provider_info[key].columns.str.lower()

In [554]:
# Dictionary to store custom tables
clean_provider_info = {}

# Define the columns for each custom table
def create_provider_info_tables(raw_provider_info):
    tables = {}
    
    for key, df in raw_provider_info.items():
        year = key.split('_')[-1]
        
        # List the columns to keep
        # Adjust these column names to match actual data
        columns_to_keep = [
            'provnum', 'federal provider number',
            'provname', 'provider name',
            'address', 'provider address',
            'city', 'provider city',
            'state', 'provider state',
            'zip', 'provider zip code',
            'phone', 'provider phone number',
            'county_ssa', 'provider ssa county code',
            'county_name', 'provider county name',
            'ownership', 'ownership type',
            'bedcert', 'number of certified beds',
            'restot', 'average number of residents per day',
            'overall_rating', 'overall rating',
            'tot_penlty_cnt', 'total number of penalties',
            'rnhrd', 'reported rn staffing hours per resident per day',
            'totlichrd', 'reported licensed staffing hours per resident per day',
            'tothrd', 'reported total nurse staffing hours per resident per day',
            'pthrd', 'reported physical therapist staffing hours per resident per day',
            'year'
        ]
        
        # Only keep columns that exist in the dataframe
        valid_columns = [col for col in columns_to_keep if col in df.columns]
        
        # Create new table with only the columns you need
        if valid_columns:
            tables[f'provider_basic_{year}'] = df[valid_columns].copy()
    
    return tables

provider_info_tables = create_provider_info_tables(raw_provider_info)
clean_provider_info.update(provider_info_tables)

# Print out the keys of the dictionary to verify
print("Custom Tables:", clean_provider_info.keys())

Custom Tables: dict_keys(['provider_basic_2016', 'provider_basic_2017', 'provider_basic_2015', 'provider_basic_2020', 'provider_basic_2021', 'provider_basic_2019', 'provider_basic_2018'])


In [556]:
# Rename 2020 and 2021 file to standard name
for key, df in clean_provider_info.items():
    if key in ['provider_basic_2020', 'provider_basic_2021']:
        clean_provider_info[key] = df.rename(columns={
            'federal provider number': 'provnum',
            'provider name': 'provname',
            'provider address': 'address',
            'provider city': 'city',
            'provider state': 'state',
            'provider zip code': 'zip',
            'provider phone number': 'phone',
            'provider ssa county code': 'county_ssa',
            'provider county name': 'county_name',
            'ownership type': 'ownership',
            'number of certified beds': 'bedcert',
            'average number of residents per day': 'restot',
            'overall rating': 'overall_rating',
            'total number of penalties': 'tot_penlty_cnt',
            'reported rn staffing hours per resident per day': 'rnhrd',
            'reported licensed staffing hours per resident per day': 'totlichrd',
            'reported total nurse staffing hours per resident per day': 'tothrd',
            'reported physical therapist staffing hours per resident per day': 'pthrd'
        })        

In [558]:
# Union all file
union_provider_info = pd.concat(clean_provider_info.values(), ignore_index=True)
union_provider_info.info()
#union_provider_info.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108637 entries, 0 to 108636
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   provnum         108637 non-null  object 
 1   provname        108637 non-null  object 
 2   address         108637 non-null  object 
 3   city            108637 non-null  object 
 4   state           108637 non-null  object 
 5   zip             108637 non-null  int64  
 6   phone           108637 non-null  int64  
 7   county_ssa      108637 non-null  int64  
 8   county_name     108637 non-null  object 
 9   ownership       108636 non-null  object 
 10  bedcert         108637 non-null  int64  
 11  restot          107914 non-null  float64
 12  overall_rating  106945 non-null  float64
 13  tot_penlty_cnt  108637 non-null  int64  
 14  rnhrd           104754 non-null  float64
 15  totlichrd       104754 non-null  float64
 16  tothrd          104754 non-null  float64
 17  pthrd     

In [560]:
# CLEAN COST REPORT

In [562]:
# Redefine the directory path
directory = '/Users/apple/Documents/APD/cost_report'

# Get all CSV files matching the pattern
files = glob.glob(os.path.join(directory, '*_CostReport.csv'))

# Dictionaries to store original DataFrames
raw_cost_report = {}

# Process each file
for file_path in files:
    # Extract the year from the filename
    filename = os.path.basename(file_path)
    year = filename.split('_')[0]
    
    # Read the CSV file
    df = read_csv_with_encodings(file_path)

    
    if df is not None:
        # Generate year column
        df['year'] = year
        
        # Store the original DataFrame
        raw_cost_report[f'raw_cost_{year}'] = df

# Print out the keys of the dictionary to verify
print("Raw dataframes:", raw_cost_report.keys())

Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Raw dataframes: dict_keys(['raw_cost_2021', 'raw_cost_2020', 'raw_cost_2017', 'raw_cost_2018', 'raw_cost_2019', 'raw_cost_2016', 'raw_cost_2015'])


In [564]:
# Lowercase for all column names
for key in raw_cost_report:
    raw_cost_report[key].columns = raw_cost_report[key].columns.str.lower()

In [566]:
# Dictionary to store your custom tables
clean_cost_report = {}

# Define the columns for each custom table
def create_cost_report_tables(raw_cost_report):
    tables = {}
    
    for key, df in raw_cost_report.items():
        year = key.split('_')[-1]
        
        # List the columns to keep
        # Adjust these column names to match actual data
        columns_to_keep = [
            'provider_ccn', 'provider ccn',
            'rural_versus_urban', 'rural versus urban',
            'gross_revenue', 'gross revenue',
            'inpatient_revenue', 'inpatient revenue',
            'net_income', 'net income',
            'net_patient_revenue', 'net patient revenue',
            'number_of_beds', 'number of beds',
            'total_costs', 'total costs',
            'total_income', 'total income',
            'total_salaries_adjusted', 'total salaries (adjusted)',
            'fiscal_year_begin_date', 'fiscal_year_end_date',
            'fiscal year begin date','fiscal year end date',
            'year'
        ]

        
        # Only keep columns that exist in the dataframe
        valid_columns = [col for col in columns_to_keep if col in df.columns]
        
        # Create new table
        if valid_columns:
            tables[f'cost_report_clean_{year}'] = df[valid_columns].copy()
    
    return tables

cost_report_tables = create_cost_report_tables(raw_cost_report)
clean_cost_report.update(cost_report_tables)

# Print out the keys of the dictionary to verify
print("Custom Tables:", clean_cost_report.keys())

Custom Tables: dict_keys(['cost_report_clean_2021', 'cost_report_clean_2020', 'cost_report_clean_2017', 'cost_report_clean_2018', 'cost_report_clean_2019', 'cost_report_clean_2016', 'cost_report_clean_2015'])


In [568]:
# Rename 2020 and 2021 file to standard name
for key, df in clean_cost_report.items():
    if key in ['cost_report_clean_2020', 'cost_report_clean_2021']:
        clean_cost_report[key] = df.rename(columns={
            'provider ccn': 'provider_ccn',
            'rural versus urban': 'rural_versus_urban',
            'gross revenue': 'gross_revenue',
            'inpatient revenue': 'inpatient_revenue',
            'net income': 'net_income',
            'net patient revenue': 'net_patient_revenue',
            'number of beds': 'number_of_beds',
            'total costs': 'total_costs',
            'total income': 'total_income',
            'total salaries (adjusted)': 'total_salaries_adjusted',
            'fiscal year begin date': 'fiscal_year_begin_date',
            'fiscal year end date': 'fiscal_year_end_date'
        })

In [570]:
# Union all file
union_cost_report = pd.concat(clean_cost_report.values(), ignore_index=True)

union_cost_report.info()
#union_cost_report.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106269 entries, 0 to 106268
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   provider_ccn             106269 non-null  int64  
 1   rural_versus_urban       104082 non-null  object 
 2   gross_revenue            103988 non-null  float64
 3   inpatient_revenue        103985 non-null  float64
 4   net_income               89205 non-null   float64
 5   net_patient_revenue      103997 non-null  float64
 6   number_of_beds           103867 non-null  float64
 7   total_costs              104131 non-null  float64
 8   total_income             103981 non-null  float64
 9   total_salaries_adjusted  104196 non-null  float64
 10  fiscal_year_begin_date   104197 non-null  object 
 11  fiscal_year_end_date     104197 non-null  object 
 12  year                     106269 non-null  object 
dtypes: float64(8), int64(1), object(4)
memory usage: 10.5+ MB


In [572]:
# Change data type

# Columns to convert to str
cols_to_convert = ['provider_ccn', 'year']

# Convert specified columns to float
union_cost_report[cols_to_convert] = union_cost_report[cols_to_convert].astype(str)

#union_cost_report.info()
#union_cost_report.sample(3)

In [574]:
# LOAD AND CLEAN HD

In [576]:
# Redefine the directory path
directory = '/Users/apple/Documents/APD/HD'

# Get all CSV files matching the pattern
files = glob.glob(os.path.join(directory, 'HD*.csv'))

# Dictionaries to store original DataFrames
raw_HD = {}

# Process each file
for file_path in files:
    # Extract the year from the filename
    filename = os.path.basename(file_path)
    year = filename[2:6]
    
    # Read the CSV file
    df = read_csv_with_encodings(file_path)

    
    if df is not None:
        # Generate year column
        df['year'] = year
        
        # Store the original DataFrame
        raw_HD[f'raw_HD{year}'] = df

# Print out the keys of the dictionary to verify
print("Raw dataframes:", raw_HD.keys())

Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Successfully read the file with utf-8 encoding
Raw dataframes: dict_keys(['raw_HD2019', 'raw_HD2018', 'raw_HD2020', 'raw_HD2021', 'raw_HD2016', 'raw_HD2017', 'raw_HD2015'])


In [578]:
# Standardize Column Name
cols_to_convert = ['provnum','year']
for key in raw_HD:
    raw_HD[key].columns = raw_HD[key].columns.str.lower()

for key in raw_HD:
    raw_HD[key] = raw_HD[key].rename(columns = {
        'sum of hd code': 'sum_of_hd_code',
        'count of hd code': 'count_of_hd_code'
    })
    raw_HD[key][cols_to_convert] = raw_HD[key][cols_to_convert].astype(str)

In [580]:
# Union all file
union_HD = pd.concat(raw_HD.values(), ignore_index=True)

union_HD.info()
#union_HD.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107516 entries, 0 to 107515
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   provnum           107516 non-null  object
 1   sum_of_hd_code    107516 non-null  int64 
 2   count_of_hd_code  107516 non-null  int64 
 3   year              107516 non-null  object
dtypes: int64(2), object(2)
memory usage: 3.3+ MB


In [582]:
# MERGE ALL FILE 

In [584]:
# Make sure the data type for provnum is char(6)
union_provider_info['provnum'] = union_provider_info['provnum'].astype(str).str.zfill(6)
union_cost_report['provider_ccn'] = union_cost_report['provider_ccn'].astype(str).str.zfill(6)
union_cost_report['HD'] = union_HD['provnum'].astype(str).str.zfill(6)

In [594]:
nursing_merge = union_provider_info.merge(union_cost_report, 
                                          left_on=['provnum', 'year'], 
                                          right_on=['provider_ccn', 'year'], 
                                          how='right')
nursing_merge = nursing_merge.merge(union_HD, 
                      left_on=['provider_ccn', 'year'], 
                      right_on=['provnum', 'year'], 
                      how='left')
nursing_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106269 entries, 0 to 106268
Data columns (total 35 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   provnum_x                105832 non-null  object 
 1   provname                 105832 non-null  object 
 2   address                  105832 non-null  object 
 3   city                     105832 non-null  object 
 4   state                    105832 non-null  object 
 5   zip                      105832 non-null  float64
 6   phone                    105832 non-null  float64
 7   county_ssa               105832 non-null  float64
 8   county_name              105832 non-null  object 
 9   ownership                105831 non-null  object 
 10  bedcert                  105832 non-null  float64
 11  restot                   105260 non-null  float64
 12  overall_rating           104168 non-null  float64
 13  tot_penlty_cnt           105832 non-null  float64
 14  rnhr

In [596]:
# Remove duplicate rows and columns
nursing_merge = nursing_merge.drop_duplicates()
nursing_merge = nursing_merge.drop(columns = 'provnum_y')
nursing_merge = nursing_merge.rename(columns={'provnum_x':'provnum'})

#nursing_merge.info()

In [598]:
# Clean data type in merging file

cols_to_convert = ['fiscal_year_begin_date', 'fiscal_year_end_date']
nursing_merge[cols_to_convert] = nursing_merge[cols_to_convert].apply(lambda x: pd.to_datetime(x, errors='coerce', format='mixed'))

nursing_merge['phone'] = nursing_merge['phone'].astype(str)

pd.set_option('display.max_columns', None)
nursing_merge.info()
#nursing_merge.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106269 entries, 0 to 106268
Data columns (total 34 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   provnum                  105832 non-null  object        
 1   provname                 105832 non-null  object        
 2   address                  105832 non-null  object        
 3   city                     105832 non-null  object        
 4   state                    105832 non-null  object        
 5   zip                      105832 non-null  float64       
 6   phone                    106269 non-null  object        
 7   county_ssa               105832 non-null  float64       
 8   county_name              105832 non-null  object        
 9   ownership                105831 non-null  object        
 10  bedcert                  105832 non-null  float64       
 11  restot                   105260 non-null  float64       
 12  overall_rating  

In [600]:
# Check category
rvu = nursing_merge['rural_versus_urban'].unique()
print(f'rvu: {rvu}')

state_cat = nursing_merge['state'].unique()
print(f'state: {state_cat}')

ownership_cat = nursing_merge['ownership'].unique()
print(f'ownership: {ownership_cat}')

rvu: ['U' 'R' nan]
state: ['NH' nan 'MO' 'MT' 'OK' 'TX' 'WA' 'LA' 'IL' 'WI' 'FL' 'OH' 'PA' 'KY' 'MI'
 'CO' 'CA' 'KS' 'NC' 'CT' 'IA' 'NJ' 'RI' 'IN' 'TN' 'VA' 'MA' 'MN' 'NE'
 'MD' 'OR' 'ID' 'GA' 'AL' 'WV' 'VT' 'SC' 'NM' 'AR' 'UT' 'ME' 'AZ' 'ND'
 'MS' 'SD' 'AK' 'NV' 'NY' 'HI' 'WY' 'DE' 'PR' 'DC']
ownership: ['For profit - Corporation' nan 'Government - City'
 'For profit - Limited Liability company' 'For profit - Partnership'
 'Government - Hospital district' 'For profit - Individual'
 'Government - State' 'Non profit - Corporation' 'Non profit - Other'
 'Non profit - Church related' 'Government - County'
 'Government - Federal' 'Government - City/county']


In [602]:
import numpy as np

def fill_missing_provider_info(df):
    """
    Fill in missing provider infomation by looking up the same provider_ccn from previous years.
    Args: df (pandas.DataFrame): DataFrame containing provider information with some rows missing data
    Returns: pandas.DataFrame: DataFrame with missing provider information filled in
    """
    # Make a copy of the input df
    filled_nursing = nursing_merge.copy()

    # Convert provider_ccn to string to ensure consistent type
    filled_nursing['provider_ccn'] = filled_nursing['provider_ccn'].astype(str)

    # Create reference dict of provider details
    provider_details = {}

    # Columns to fill
    provider_cols = ['provnum','provname','address','city','state','zip','phone','county_ssa','county_name','ownership']

    # Collect all available provider details
    for idx, row in filled_nursing.iterrows():
        provider_ccn = row['provider_ccn']
        if pd.isna(provider_ccn) or provider_ccn == '':
            continue
        # Create dict with values for provider columns
        details = {col: row[col] for col in provider_cols if col in filled_nursing.columns and not pd.isna(row[col])}
        # Add detilas to provider details dict if it doesn't exist or has fewer non-null values
        if provider_ccn not in provider_details or len(details) > len(provider_details[provider_ccn]):
            provider_details[provider_ccn] = details

    # Fill in missing values
    for idx, row in filled_nursing.iterrows():
        provider_ccn = row['provider_ccn']
        if pd.isna(provider_ccn) or provider_ccn == '':
            continue
        if provider_ccn in provider_details:
            for col in provider_cols:
                if col in filled_nursing.columns and (
                    pd.isna(filled_nursing.at[idx, col])
                    or filled_nursing.at[idx, col] == '' or
                    filled_nursing.at[idx, col] == '' or
                    filled_nursing.at[idx, col] == 'nan' or
                    str(filled_nursing.at[idx, col]).strip() == '' or
                    str(filled_nursing.at[idx, col]).lower() == 'nan'
                ):
                    if col in provider_details[provider_ccn]:
                        filled_nursing.at[idx, col] = provider_details[provider_ccn][col]
    return filled_nursing

In [604]:
nursing_merge = fill_missing_provider_info(nursing_merge)

In [606]:
# Same sample, the fiscal year in each report is different -> take all value/day in fiscal year * 365

# Column to adjust
adjust_cols = ['gross_revenue','inpatient_revenue','net_income','net_patient_revenue','total_costs','total_salaries_adjusted','total_income']

# Adjust value to column
nursing_merge['fiscal_period_days'] = (nursing_merge['fiscal_year_end_date'] - nursing_merge['fiscal_year_begin_date']).dt.days

for col in adjust_cols:
    nursing_merge[col + '_annualized'] = nursing_merge[col] * 365 / nursing_merge['fiscal_period_days']

In [612]:
nursing_merge.sample(1)
#nursing_merge.info()

Unnamed: 0,provnum,provname,address,city,state,zip,phone,county_ssa,county_name,ownership,bedcert,restot,overall_rating,tot_penlty_cnt,rnhrd,totlichrd,tothrd,pthrd,year,provider_ccn,rural_versus_urban,gross_revenue,inpatient_revenue,net_income,net_patient_revenue,number_of_beds,total_costs,total_income,total_salaries_adjusted,fiscal_year_begin_date,fiscal_year_end_date,HD,sum_of_hd_code,count_of_hd_code,fiscal_period_days,gross_revenue_annualized,inpatient_revenue_annualized,net_income_annualized,net_patient_revenue_annualized,total_costs_annualized,total_salaries_adjusted_annualized,total_income_annualized
97184,675182,BROWNFIELD REHABILITATION AND CARE CENTER,510 S FIRST ST,BROWNFIELD,TX,79316.0,8066374307.0,913.0,Terry,For profit - Corporation,54.0,42.0,1.0,1.0,0.29881,1.20833,3.15238,0.28452,2015,675182,R,3896901.0,3896901.0,-184536.0,3095468.0,54.0,625870.0,-184536.0,1293180.0,2015-01-01,2015-12-31,195445,258.0,49.0,364.0,3907607.0,3907607.0,-185042.967033,3103972.0,627589.423077,1296733.0,-185042.967033


In [488]:
# Compute average (using group by) and deal with missing value (fiscal period = 0)

In [614]:
def consolidate_annual_reports(df: pd.DataFrame) -> pd.DataFrame:
    """
    Consolidates multiple annual reports for the same provider in the same year by:
    - Averaging financial/cost report columns
    - Taking the first value for all other columns
    
    Args:
        df: DataFrame containing hospital annual report data
        
    Returns:
        DataFrame with one row per provider per year
    """
    # Define columns to average (financial metrics)
    financial_cols = [
        'gross_revenue', 'inpatient_revenue', 'net_income', 'net_patient_revenue',
        'total_costs', 'total_income', 'total_salaries_adjusted',
        'gross_revenue_annualized', 'inpatient_revenue_annualized', 
        'net_income_annualized', 'net_patient_revenue_annualized',
        'total_costs_annualized', 'total_salaries_adjusted_annualized', 
        'total_income_annualized'
    ]
    
    # Create aggregation dictionary - first filter to only include columns that exist in df
    financial_cols = [col for col in financial_cols if col in df.columns]
    
    # Build aggregation dictionary dynamically
    agg_dict = {}
    for col in df.columns:
        if col in financial_cols:
            agg_dict[col] = 'mean'
        elif col not in ['provider_ccn', 'year']:  # Exclude groupby columns
            agg_dict[col] = 'first'
    
    # Group by provider and year, then apply aggregation
    result_df = df.groupby(['provider_ccn', 'year']).agg(agg_dict).reset_index()
    
    return result_df

In [616]:
annualized_nursing_data = consolidate_annual_reports(nursing_merge)

In [618]:
annualized_nursing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101290 entries, 0 to 101289
Data columns (total 42 columns):
 #   Column                              Non-Null Count   Dtype         
---  ------                              --------------   -----         
 0   provider_ccn                        101290 non-null  object        
 1   year                                101290 non-null  object        
 2   provnum                             101225 non-null  object        
 3   provname                            101225 non-null  object        
 4   address                             101225 non-null  object        
 5   city                                101225 non-null  object        
 6   state                               101225 non-null  object        
 7   zip                                 101225 non-null  float64       
 8   phone                               101290 non-null  object        
 9   county_ssa                          101225 non-null  float64       
 10  county_n

In [620]:
filtered_annualize = annualized_nursing_data.dropna(subset=['fiscal_period_days'])
filtered_annualize = filtered_annualize[filtered_annualize['fiscal_period_days'] > 0]
filtered_annualize = filtered_annualize.drop_duplicates()

In [622]:
filtered_annualize.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99386 entries, 0 to 101289
Data columns (total 42 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   provider_ccn                        99386 non-null  object        
 1   year                                99386 non-null  object        
 2   provnum                             99350 non-null  object        
 3   provname                            99350 non-null  object        
 4   address                             99350 non-null  object        
 5   city                                99350 non-null  object        
 6   state                               99350 non-null  object        
 7   zip                                 99350 non-null  float64       
 8   phone                               99386 non-null  object        
 9   county_ssa                          99350 non-null  float64       
 10  county_name               

In [624]:
# Check for missing provider information 
filtered_annualize['provnum'].isna().sum()

# 36 is relatively small compared to a total 99350 providers, so we can drop them for now

36

In [626]:
# Drop unmatched provnum
nursing_clean = filtered_annualize[filtered_annualize['provnum'].notnull()]
nursing_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99350 entries, 0 to 101289
Data columns (total 42 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   provider_ccn                        99350 non-null  object        
 1   year                                99350 non-null  object        
 2   provnum                             99350 non-null  object        
 3   provname                            99350 non-null  object        
 4   address                             99350 non-null  object        
 5   city                                99350 non-null  object        
 6   state                               99350 non-null  object        
 7   zip                                 99350 non-null  float64       
 8   phone                               99350 non-null  object        
 9   county_ssa                          99350 non-null  float64       
 10  county_name               

In [524]:
# Export to csv
file_path = os.path.join(os.getcwd(), '/Users/apple/Documents/APD/','nursing_clean.csv')
nursing_clean.to_csv(file_path, index=False)