# Data Cleansing for Health Investment

## Set-up

In [None]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_column', None)
pd.set_option('display.float_format', '{:.0f}'.format)

def data_explore(filepath):
    """Explores and prints information about a CSV file."""
    try:
        df = pd.read_csv(filepath, encoding='ISO-8859-1')
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None

    print(f"\n\nData dimensions:\n{df.shape}")
    # print(f"\n\nNull values:\n{df.isnull().sum()}")
    return df

# File paths
base_path = "C:/Users/nhien/Downloads/HI/"
provider_files = [f"ProviderInfo_{year}.csv" for year in range(2015, 2022)]
health_deficiency_files = [f"HealthDeficiencies_{year}.csv" for year in range(2015, 2022)]
quality_files = [f"QualityMsrMDS_{year}.csv" for year in range(2015, 2022)]
penalties_files = [f"Penalties_{year}.csv" for year in range(2015, 2022)]

## Cost report files will be imported separately later


## provider_info tables

In [None]:
def process_provider_csv(df, year):
    """Processes provider information CSV files."""
    required_cols = ["provnum", "provname", "state", "address", "city", "zip", 
                     "ownership", "restot", "overall_rating", 
                     "survey_rating", "quality_rating", "staffing_rating", "adj_total"]
    
    df.columns = df.columns.str.lower()
    
    if year >= 2020:
        column_mapping = {
            "federal provider number": "provnum", 
            "provider name": "provname", 
            "provider state": "state", 
            "provider address": "address", 
            "provider city": "city", 
            "provider zip code": "zip", 
            "ownership type": "ownership", 
            "average number of residents per day": "restot", 
            "overall rating": "overall_rating", 
            "health inspection rating": "survey_rating", 
            "qm rating": "quality_rating", 
            "staffing rating": "staffing_rating", 
            "adjusted total nurse staffing hours per resident per day": "adj_total"
        }
        df.rename(columns=column_mapping, inplace=True)
    
    df = df[required_cols]
    df['year'] = year
    return df


# Process provider data
provider_dfs = []
for year, file in zip(range(2015, 2022), provider_files):
    filepath = os.path.join(base_path, file)
    df = data_explore(filepath)
    if df is not None:
        provider_dfs.append(process_provider_csv(df, year))

# Union
provider_info = pd.concat(provider_dfs, ignore_index=True)

# Ensure provnum is char(6) datatype
provider_info['provnum'] = provider_info['provnum'].astype(str).str.zfill(6)





Data dimensions:
(15661, 79)


Data dimensions:
(15651, 80)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year




Data dimensions:
(15646, 80)


Data dimensions:
(15596, 78)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year




Data dimensions:
(15471, 85)


Data dimensions:
(15348, 87)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year




Data dimensions:
(15264, 88)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year


In [22]:
# # Check data  
# print(provider_info.shape)
# print(provider_info.info())
# print(provider_info.isnull().sum())

(108637, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108637 entries, 0 to 108636
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   provnum          108637 non-null  object 
 1   provname         108637 non-null  object 
 2   state            108637 non-null  object 
 3   address          108637 non-null  object 
 4   city             108637 non-null  object 
 5   zip              108637 non-null  int64  
 6   ownership        108636 non-null  object 
 7   restot           107914 non-null  float64
 8   overall_rating   106945 non-null  float64
 9   survey_rating    107389 non-null  float64
 10  quality_rating   107188 non-null  float64
 11  staffing_rating  104544 non-null  float64
 12  adj_total        104271 non-null  float64
 13  year             108637 non-null  int64  
dtypes: float64(6), int64(2), object(6)
memory usage: 11.6+ MB
None
provnum               0
provname              0
state   

## health_deficiencies table

In [86]:


def process_health_deficiencies_csv(df, year):
    """Processes health deficiencies CSV files."""
    required_cols = ["provnum", "scope"]
    
    df.columns = df.columns.str.lower().str.strip()
    
    if year >= 2020:
        column_mapping = {
            "federal provider number": "provnum", 
            "scope severity code": "scope"
        }
        df.rename(columns=column_mapping, inplace=True)
    
    df = df[required_cols]
    df['year'] = year
    return df


# Process health deficiencies data
health_deficiency_dfs = []
for year, file in zip(range(2015, 2022), health_deficiency_files):
    filepath = os.path.join(base_path, file)
    df = data_explore(filepath)
    if df is not None:
        health_deficiency_dfs.append(process_health_deficiencies_csv(df, year))

# Union
health_deficiency = pd.concat(health_deficiency_dfs, ignore_index=True)

# Add pk for database primary key (provnum & year are foreign keys)
health_deficiency['pk'] = range(1, len(health_deficiency) + 1)

# Encode HD score to number
scope_mapping = {chr(65 + i): i + 1 for i in range(12)}
health_deficiency['scope_num'] = health_deficiency['scope'].map(scope_mapping)
health_deficiency.loc[health_deficiency['year'] == 2021, 'scope_num'] = health_deficiency['scope']

# Ensure provnum is char(6) datatype
health_deficiency['provnum'] = health_deficiency['provnum'].astype(str).str.zfill(6)


  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(479167, 18)


Null values:
provnum                  0
Provname                 0
address                  0
city                     0
state                    0
zip                      0
survey_date_output       0
SurveyType               0
defpref                  0
tag                      0
tag_desc                 0
scope                    0
defstat                  0
statdate              7620
cycle                    0
standard                 0
complaint                0
filedate                 0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479167 entries, 0 to 479166
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   provnum             479167 non-null  object
 1   Provname            479167 non-null  object
 2   address             479167 non-null  object
 3   city                479167 non-null  object
 4   state               4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(329324, 18)


Null values:
provnum                  0
Provname                 0
address                  0
city                     0
state                    0
zip                      0
survey_date_output       0
SurveyType               0
defpref                  0
tag                      0
tag_desc                 0
scope                    0
defstat                  0
statdate              3566
cycle                    0
standard                 0
complaint                0
filedate                 0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329324 entries, 0 to 329323
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   provnum             329324 non-null  object
 1   Provname            329324 non-null  object
 2   address             329324 non-null  object
 3   city                329324 non-null  object
 4   state               3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(338451, 18)


Null values:
provnum                  0
Provname                 0
address                  0
city                     0
state                    0
zip                      0
survey_date_output       0
SurveyType               0
defpref                  0
tag                      0
tag_desc                 0
scope                    0
defstat                  0
statdate              2116
cycle                    0
standard                 0
complaint                0
filedate                 0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338451 entries, 0 to 338450
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   provnum             338451 non-null  object
 1   Provname            338451 non-null  object
 2   address             338451 non-null  object
 3   city                338451 non-null  object
 4   state               3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(350887, 19)


Null values:
PROVNUM                     0
PROVNAME                    0
ADDRESS                     0
CITY                        0
STATE                       0
ZIP                         0
SURVEY_DATE_OUTPUT          0
HLTHSRVY_POST20171128       0
SURVEYTYPE                  0
DEFPREF                     0
TAG                         0
TAG_DESC                    0
SCOPE                       0
DEFSTAT                     0
STATDATE                 1524
cycle                       0
Standard                    0
Complaint                   0
FILEDATE                    0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350887 entries, 0 to 350886
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   PROVNUM                350887 non-null  object
 1   PROVNAME               350887 non-null  object
 2   ADDRESS                

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(362931, 19)


Null values:
PROVNUM                  0
PROVNAME                 0
ADDRESS                  0
CITY                     0
STATE                    0
ZIP                      0
SURVEY_DATE_OUTPUT       0
SURVEYTYPE               0
DEFPREF                  0
CATEGORY                 0
TAG                      0
TAG_DESC                 0
SCOPE                    0
DEFSTAT                  0
STATDATE              1701
cycle                    0
Standard                 0
Complaint                0
FILEDATE                 0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362931 entries, 0 to 362930
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   PROVNUM             362931 non-null  object
 1   PROVNAME            362931 non-null  object
 2   ADDRESS             362931 non-null  object
 3   CITY                362931 non-null  object

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(360443, 20)


Null values:
Federal Provider Number      0
Provider Name                0
Provider Address             0
Provider City                0
Provider State               0
Provider Zip Code            0
Survey Date                  0
Survey Type                  0
Deficiency Prefix            0
Deficiency Category          0
Deficiency Tag Number        0
Deficiency Description       0
Scope Severity Code          0
Deficiency Corrected         0
Correction Date            804
Inspection Cycle             0
Standard Deficiency          0
Complaint Deficiency         0
Location                     0
Processing Date              0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360443 entries, 0 to 360442
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Federal Provider Number  360443 non-null  object
 1   Provider Name      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(370746, 21)


Null values:
Federal Provider Number                       0
Provider Name                                 0
Provider Address                              0
Provider City                                 0
Provider State                                0
Provider Zip Code                             0
Survey Date                                   0
Survey Type                                   0
Deficiency Prefix                             0
Deficiency Category                           0
Deficiency Tag Number                         0
Deficiency Description                        0
Scope Severity Code                           0
Deficiency Corrected                          0
Correction Date                            2001
Inspection Cycle                              0
Standard Deficiency                           0
Complaint Deficiency                          0
Infection Control Inspection Deficiency       0
Location                                 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  health_deficiency.loc[health_deficiency['year'] == 2021, 'scope_num'] = health_deficiency['scope']


In [131]:
health_deficiency['year'].value_counts()


year
2015    479167
2021    370746
2019    362931
2020    360443
2018    350887
2017    338451
2016    329324
Name: count, dtype: int64

In [87]:

print(health_deficiency.shape)
print(provider_info.info())
print(health_deficiency.isnull().sum())

(2591949, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108637 entries, 0 to 108636
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   provnum          108637 non-null  object 
 1   provname         108637 non-null  object 
 2   state            108637 non-null  object 
 3   address          108637 non-null  object 
 4   city             108637 non-null  object 
 5   zip              108637 non-null  int64  
 6   ownership        108636 non-null  object 
 7   restot           107914 non-null  float64
 8   overall_rating   106945 non-null  float64
 9   survey_rating    107389 non-null  float64
 10  quality_rating   107188 non-null  float64
 11  staffing_rating  104544 non-null  float64
 12  adj_total        104271 non-null  float64
 13  year             108637 non-null  int64  
dtypes: float64(6), int64(2), object(6)
memory usage: 11.6+ MB
None
provnum      0
scope        0
year         0
pk         

In [91]:
print(health_deficiency['provnum'].value_counts())

provnum
145371    842
395382    820
555336    808
555566    797
055017    793
         ... 
106127      1
095040      1
045468      1
37E632      1
676499      1
Name: count, Length: 16329, dtype: int64


## quality

In [None]:
def process_qa_csv(df, year):
    """Processes quality measure MDS CSV files."""
    required_cols = ["provnum", "msr_cd", "msr_descr", "avg_score"]
    
    df.columns = df.columns.str.lower().str.strip()
    
    if year < 2020:
        if year == 2015:
            df.rename(columns={"measure_score_3qtr_avg": "avg_score"}, inplace=True)
        else:
            df.rename(columns={"measure_score_4qtr_avg": "avg_score"}, inplace=True)
    else:
        column_mapping = {
            "federal provider number": "provnum", 
            "measure code": "msr_cd",
            "measure description": "msr_descr",
            "four quarter average score": "avg_score"
        }
        df.rename(columns=column_mapping, inplace=True)
    
    df = df[required_cols]
    df['year'] = year
    return df


# Process quality data
quality_dfs = []
for year, file in zip(range(2015, 2022), quality_files):
    filepath = os.path.join(base_path, file)
    df = data_explore(filepath)
    if df is not None:
        quality_dfs.append(process_qa_csv(df, year))

# Union
quality = pd.concat(quality_dfs, ignore_index=True)

# Add pk for database primary key (provnum & year are foreign keys)
quality['pk'] = range(1, len(quality) + 1)

# Ensure provnum is char(6) datatype
quality['provnum'] = quality['provnum'].astype(str).str.zfill(6)



  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(281898, 22)


Null values:
provnum                        0
provname                       0
address                        0
city                           0
state                          0
zip                            0
msr_cd                         0
msr_descr                      0
stay_type                      0
q1_measure_score           56134
q1_measure_fn             225764
q2_measure_score           54979
q2_measure_fn             226919
q3_measure_score           54526
q3_measure_fn             227372
measure_score_3qtr_avg     15637
score3qtr_fn              266261
five_star_msr                  0
q1_quarter                     0
q2_quarter                     0
q3_quarter                     0
filedate                       0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281898 entries, 0 to 281897
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(328671, 25)


Null values:
PROVNUM                        0
PROVNAME                       0
ADDRESS                        0
CITY                           0
STATE                          0
ZIP                            0
MSR_CD                         0
MSR_DESCR                      0
STAY_TYPE                      0
Q1_MEASURE_SCORE           48371
Q1_MEASURE_FN             280300
Q2_MEASURE_SCORE           48069
Q2_MEASURE_FN             280602
Q3_MEASURE_SCORE           47283
Q3_MEASURE_FN             281388
Q4_MEASURE_SCORE           47021
Q4_MEASURE_FN             281650
MEASURE_SCORE_4QTR_AVG     15471
SCORE4QTR_FN              313200
FIVE_STAR_MSR                  0
Q1_QUARTER                     0
Q2_QUARTER                     0
Q3_QUARTER                     0
Q4_QUARTER                     0
FILEDATE                       0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328671 entries, 0 to 328670
Data columns (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(328566, 25)


Null values:
PROVNUM                        0
PROVNAME                       0
ADDRESS                        0
CITY                           0
STATE                          0
ZIP                            0
MSR_CD                         0
MSR_DESCR                      0
STAY_TYPE                      0
Q1_MEASURE_SCORE           47725
Q1_MEASURE_FN             280841
Q2_MEASURE_SCORE           47773
Q2_MEASURE_FN             280793
Q3_MEASURE_SCORE           46730
Q3_MEASURE_FN             281836
Q4_MEASURE_SCORE           46478
Q4_MEASURE_FN             282088
MEASURE_SCORE_4QTR_AVG     15090
SCORE4QTR_FN              313476
FIVE_STAR_MSR                  0
Q1_QUARTER                     0
Q2_QUARTER                     0
Q3_QUARTER                     0
Q4_QUARTER                     0
FILEDATE                       0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328566 entries, 0 to 328565
Data columns (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year




Data dimensions:
(327516, 25)


Null values:
PROVNUM                        0
PROVNAME                       0
ADDRESS                        0
CITY                           0
STATE                          0
ZIP                            0
MSR_CD                         0
MSR_DESCR                      0
STAY_TYPE                      0
Q1_MEASURE_SCORE           47081
Q1_MEASURE_FN             280435
Q2_MEASURE_SCORE           47382
Q2_MEASURE_FN             280134
Q3_MEASURE_SCORE           46317
Q3_MEASURE_FN             281199
Q4_MEASURE_SCORE           46103
Q4_MEASURE_FN             281413
MEASURE_SCORE_4QTR_AVG     14531
SCORE4QTR_FN              312985
FIVE_STAR_MSR                  0
Q1_QUARTER                     0
Q2_QUARTER                     0
Q3_QUARTER                     0
Q4_QUARTER                     0
FILEDATE                       0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327516 entries, 0 to 327515
Data columns (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year




Data dimensions:
(293949, 22)


Null values:
PROVNUM                        0
PROVNAME                       0
ADDRESS                        0
CITY                           0
STATE                          0
ZIP                            0
MSR_CD                         0
MSR_DESCR                      0
STAY_TYPE                      0
Q1_MEASURE_SCORE           53542
Q1_MEASURE_FN             240407
Q2_MEASURE_SCORE           53250
Q2_MEASURE_FN             240699
Q3_MEASURE_SCORE           52949
Q3_MEASURE_FN             241000
Q4_MEASURE_SCORE           52882
Q4_MEASURE_FN             241067
MEASURE_SCORE_4QTR_AVG     13697
SCORE4QTR_FN              280252
FIVE_STAR_MSR                  0
MEASURE_PERIOD                 0
FILEDATE                       0
dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293949 entries, 0 to 293948
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(276264, 23)


Null values:
Federal Provider Number                          0
Provider Name                                    0
Provider Address                                 0
Provider City                                    0
Provider State                                   0
Provider Zip Code                                0
Measure Code                                     0
Measure Description                              0
Resident type                                    0
Q1 Measure Score                             37468
Footnote for Q1 Measure Score               238796
Q2 Measure Score                             37208
Footnote for Q2 Measure Score               239056
Q3 Measure Score                             37112
Footnote for Q3 Measure Score               239152
Q4 Measure Score                             37501
Footnote for Q4 Measure Score               238763
Four Quarter Average Score                   12131
Footnote for Four Quarter Average S

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(274752, 23)


Null values:
Federal Provider Number                          0
Provider Name                                    0
Provider Address                                 0
Provider City                                    0
Provider State                                   0
Provider Zip Code                                0
Measure Code                                     0
Measure Description                              0
Resident type                                    0
Q1 Measure Score                             45997
Footnote for Q1 Measure Score               228755
Q2 Measure Score                             48872
Footnote for Q2 Measure Score               225880
Q3 Measure Score                             48652
Footnote for Q3 Measure Score               226100
Q4 Measure Score                             46448
Footnote for Q4 Measure Score               228304
Four Quarter Average Score                   13084
Footnote for Four Quarter Average S

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year


In [130]:
quality['year'].value_counts()


year
2016    328671
2017    328566
2018    327516
2019    293949
2015    281898
2020    276264
2021    274752
Name: count, dtype: int64

In [95]:
print(quality.shape)
print(quality.info())
print(quality.isnull().sum())

(2111616, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111616 entries, 0 to 2111615
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   provnum    object 
 1   msr_cd     int64  
 2   msr_descr  object 
 3   avg_score  float64
 4   year       int64  
 5   pk         int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 96.7+ MB
None
provnum          0
msr_cd           0
msr_descr        0
avg_score    99641
year             0
pk               0
dtype: int64


## penalty

In [None]:
def process_pe_csv(df, year):
    """Processes penalties CSV files."""
    required_cols = ["provnum", "pnlty_type", "fine_amt", "payden_days"]
    
    df.columns = df.columns.str.lower().str.strip()
    
    if year >= 2020:
        column_mapping = {
            "federal provider number": "provnum", 
            "penalty type": "pnlty_type",
            "fine amount": "fine_amt",
            "payment denial length in days": "payden_days"
        }
        df.rename(columns=column_mapping, inplace=True)
    
    df = df[required_cols]
    df['year'] = year
    return df

# Process penalties data
penalties_dfs = []
for year, file in zip(range(2015, 2022), penalties_files):
    filepath = os.path.join(base_path, file)
    df = data_explore(filepath)
    if df is not None:
        penalties_dfs.append(process_pe_csv(df, year))

# Union
penalties = pd.concat(penalties_dfs, ignore_index=True)

# Add pk for database primary key (provnum & year are foreign keys)
penalties['pk'] = range(1, len(penalties) + 1)

# Ensure provnum is char(6) datatype
penalties['provnum'] = penalties['provnum'].astype(str).str.zfill(6)




Data dimensions:
(6478, 12)


Null values:
provnum              0
provname             0
address              0
city                 0
state                0
zip                  0
pnlty_date           0
pnlty_type           0
fine_amt          1398
payden_strt_dt    5080
payden_days       5080
filedate             0
dtype: int64


Duplicated values:
266
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6478 entries, 0 to 6477
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   provnum         6478 non-null   int64  
 1   provname        6478 non-null   object 
 2   address         6478 non-null   object 
 3   city            6478 non-null   object 
 4   state           6478 non-null   object 
 5   zip             6478 non-null   int64  
 6   pnlty_date      6478 non-null   object 
 7   pnlty_type      6478 non-null   object 
 8   fine_amt        5080 non-null   float64
 9   payden_strt_dt  1398 non-null   ob

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu



Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9067 entries, 0 to 9066
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   provnum         9067 non-null   int64  
 1   provname        9067 non-null   object 
 2   address         9067 non-null   object 
 3   city            9067 non-null   object 
 4   state           9067 non-null   object 
 5   zip             9067 non-null   int64  
 6   pnlty_date      9067 non-null   object 
 7   pnlty_type      9067 non-null   object 
 8   fine_amt        7440 non-null   float64
 9   payden_strt_dt  1627 non-null   object 
 10  payden_days     1627 non-null   float64
 11  filedate        9067 non-null   object 
dtypes: float64(2), int64(2), object(8)
memory usage: 850.2+ KB


Summary:
None


Data dimensions:
(6640, 13)


Null values:
Federal Provider Number             0
Provider Name                       0
Provider Address                    0
P

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = year


In [128]:
penalties['year'].value_counts()

year
2021    26146
2019     9067
2018     8742
2017     7598
2016     6817
2020     6640
2015     6478
Name: count, dtype: int64

In [93]:

print(penalties.shape)
print(penalties.tail())
print(penalties.info())
print(penalties.isnull().sum())

(71488, 6)
      provnum pnlty_type  fine_amt  payden_days  year     pk
71483  676499       Fine      1300          NaN  2021  71484
71484  676499       Fine      1625          NaN  2021  71485
71485  676499       Fine      1950          NaN  2021  71486
71486  676499       Fine      2275          NaN  2021  71487
71487  686124       Fine     13065          NaN  2021  71488
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71488 entries, 0 to 71487
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   provnum      71488 non-null  object 
 1   pnlty_type   71488 non-null  object 
 2   fine_amt     60632 non-null  float64
 3   payden_days  10856 non-null  float64
 4   year         71488 non-null  int64  
 5   pk           71488 non-null  int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 3.3+ MB
None
provnum            0
pnlty_type         0
fine_amt       10856
payden_days    60632
year               0
pk     

## cost_report

### clean & union

In [None]:
CostReport_2015= data_explore("C:/Users/nhien/Downloads/HI/2015_CostReport.csv")
CostReport_2016= data_explore("C:/Users/nhien/Downloads/HI/2016_CostReport.csv")
CostReport_2017= data_explore("C:/Users/nhien/Downloads/HI/2017_CostReport.csv")
CostReport_2018= data_explore("C:/Users/nhien/Downloads/HI/2018_CostReport.csv")
CostReport_2019= data_explore("C:/Users/nhien/Downloads/HI/2019_CostReport.csv")
CostReport_2020= data_explore("C:/Users/nhien/Downloads/HI/2020_CostReport.csv")
CostReport_2021= data_explore("C:/Users/nhien/Downloads/HI/2021_CostReport.csv")

def process_cr(df):
  
    # Ensure provnum is char(6)
    df['Provider_CCN']=df['Provider_CCN'].astype(str).str.zfill(6) 

    # Fiscal year is char so adding a"Date_Difference" columnn to account for later
    df['Fiscal_Year_Begin_Date'] = pd.to_datetime(df['Fiscal_Year_Begin_Date'])
    df['Fiscal_Year_End_Date'] = pd.to_datetime(df['Fiscal_Year_End_Date'])
    # Calculate date difference for each row
    df['fiscal_period'] = (df['Fiscal_Year_End_Date'] - df['Fiscal_Year_Begin_Date']).dt.days

    required_cols = ['provnum',
                     'rural_versus_urban',
                     'accounts_receivable',
                     'cash_on_hand_and_in_banks',
                     'gross_revenue',
                     'inpatient_revenue',
                     'less_total_operating_expense',
                     'net_income',
                     'net_income_from_patients',
                     'net_patient_revenue',
                     'overhead_non_salary_costs',
                     'snf_days_total',
                     'salaries_wages_and_fees_payable',
                     'total_liab_and_fund_balances',
                     'total_salaries_adjusted',
                     'total_fund_balances',
                     'total_liabilities',                     
                     'snf_admissions_total',
                     'fiscal_period']
    
    # Convert column names to lowercase
    df.columns = [col.lower() for col in df.columns]
    
        # Define column mapping
    column_mapping = {
        "provider_ccn": "provnum" 
    }
    
   # Rename columns based on mapping
    df.rename(columns=column_mapping, inplace=True)

    # Keep only the required columns
    df = df[required_cols]
   
    return df


# Process the 2015-2019 files
cr_2015 = process_cr(CostReport_2015)
cr_2016 = process_cr(CostReport_2016)
cr_2017 = process_cr(CostReport_2017)
cr_2018 = process_cr(CostReport_2018)
cr_2019 = process_cr(CostReport_2019)

def process_cr(df):
  


    # Fiscal year is char so adding a"Date_Difference" columnn to account for later
    df['Fiscal Year Begin Date'] = pd.to_datetime(df['Fiscal Year Begin Date'])
    df['Fiscal Year End Date'] = pd.to_datetime(df['Fiscal Year End Date'])
    # Calculate date difference for each row
    df['fiscal_period'] = (df['Fiscal Year End Date'] - df['Fiscal Year Begin Date']).dt.days

    required_cols = ['provnum',
                     'rural_versus_urban',
                     'accounts_receivable',
                     'cash_on_hand_and_in_banks',
                     'gross_revenue',
                     'inpatient_revenue',
                     'less_total_operating_expense',
                     'net_income',
                     'net_income_from_patients',
                     'net_patient_revenue',
                     'overhead_non_salary_costs',
                     'snf_days_total',
                     'salaries_wages_and_fees_payable',
                     'total_liab_and_fund_balances',
                     'total_salaries_adjusted',
                     'total_fund_balances',
                     'total_liabilities',
                     'snf_admissions_total',
                     'fiscal_period']
    
    # Define column mapping
    column_mapping = {
            'provider ccn':'provnum',
            'rural versus urban':'rural_versus_urban',
            'accounts receivable':'accounts_receivable',
            'cash on hand and in banks':'cash_on_hand_and_in_banks',
            'gross revenue':'gross_revenue',
            'inpatient revenue':'inpatient_revenue',
            'less total operating expense':'less_total_operating_expense',
            'net income':'net_income',
            'net income from service to patients':'net_income_from_patients',
            'net patient revenue':'net_patient_revenue',
            'overhead non-salary costs':'overhead_non_salary_costs',
            'snf days total':'snf_days_total',
            'salaries, wages, and fees payable':'salaries_wages_and_fees_payable',
            'total liabilities and fund balances':'total_liab_and_fund_balances',
            'total salaries (adjusted)':'total_salaries_adjusted',
            'total fund balances':'total_fund_balances',
            'total liabilities':'total_liabilities',
            'nf admissions total':'snf_admissions_total'        
    }
    # Convert column names to lowercase
    df.columns = [col.lower() for col in df.columns]
   # Rename columns based on mapping
    df.rename(columns=column_mapping, inplace=True)
    
    # Ensure provnum is char(6)
    df['provnum']=df['provnum'].astype(str).str.zfill(6) 

    # Keep only the required columns
    df = df[required_cols]
   
    return df

cr_2020 = process_cr(CostReport_2020)
cr_2021 = process_cr(CostReport_2021)

# Add a year column to each dataset
cr_2015['year'] = 2015
cr_2016['year'] = 2016
cr_2017['year'] = 2017
cr_2018['year'] = 2018
cr_2019['year'] = 2019
cr_2020['year'] = 2020
cr_2021['year'] = 2021

# Concatenate all datasets into a single dataframe
cost_report = pd.concat([
    cr_2015, 
    cr_2016, 
    cr_2017, 
    cr_2018,
    cr_2019, 
    cr_2020, 
    cr_2021], ignore_index=True)




  df = pd.read_csv(filepath, encoding='ISO-8859-1')




Data dimensions:
(15402, 74)


Null values:
rpt_rec_num                   0
Provider_CCN                  0
Facility_Name                 0
Street_Address                3
City                          0
                           ... 
Total_fund_balances         484
Total_liabilities           617
Total_other_Assets         4727
Wage_related_Costs_core     545
Fixed_equipment            8790
Length: 74, dtype: int64


Duplicated values:
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15402 entries, 0 to 15401
Data columns (total 74 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   rpt_rec_num                      15402 non-null  int64  
 1   Provider_CCN                     15402 non-null  int64  
 2   Facility_Name                    15402 non-null  object 
 3   Street_Address                   15399 non-null  object 
 4   City                             15402 non-null  object 
 5   State_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_2015['year'] = 2015
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_2016['year'] = 2016
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_2017['year'] = 2017
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

provnum                                0
rural_versus_urban                  2187
accounts_receivable                 5038
cash_on_hand_and_in_banks           4567
gross_revenue                       2283
inpatient_revenue                   2284
less_total_operating_expense        2074
net_income                          2360
net_income_from_patients            2081
net_patient_revenue                 2279
overhead_non_salary_costs           2072
snf_days_total                      2092
salaries_wages_and_fees_payable    11260
total_liab_and_fund_balances        2190
total_salaries_adjusted             2073
total_fund_balances                 2434
total_liabilities                   3017
snf_admissions_total               31685
fiscal_period                       2072
year                                   0
dtype: int64

In [None]:
# # check
# cost_report.shape
# cost_report.isnull().sum()
# cost_report['year'].value_counts()
# cost_report.isnull().sum()



year
2017    15433
2015    15402
2019    15182
2018    15142
2016    15104
2021    15057
2020    14949
Name: count, dtype: int64

## Annualize data
Since a provnum can have many cost reports a year, the following code annualize the numbers

In [None]:

import pandas as pd
import numpy as np

def create_annual_cost_report(cost_report: pd.DataFrame) -> pd.DataFrame:
    """Creates an annual cost report with NaN handling during aggregation."""
    columns_to_adjust = cost_report.columns[2:18]
    cost_report = cost_report[cost_report['fiscal_period'] > 0]

    for col in columns_to_adjust:
        cost_report[col] = (cost_report[col] / cost_report['fiscal_period']) * 365

    # Custom aggregation function to handle NaNs correctly
    def nan_aware_mean(series):
        if series.isnull().all():
            return np.nan
        else:
            return series.mean()

    # Group and aggregate, using nan_aware_mean
    agg_dict = {col: nan_aware_mean for col in columns_to_adjust}
    agg_dict['rural_versus_urban'] = 'first'

    annual_report = cost_report.groupby(['provnum', 'year']).agg(agg_dict).reset_index()

    return annual_report


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cost_report[col] = (cost_report[col] / cost_report['fiscal_period']) * 365


(99386, 19)


year
2017    14305
2018    14305
2019    14270
2020    14246
2021    14194
2016    14071
2015    13995
Name: count, dtype: int64

In [None]:
# # check
# annual_report = create_annual_cost_report(cost_report)
# print(annual_report.shape)
# annual_report['year'].value_counts()
# annual_report.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99386 entries, 0 to 99385
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   provnum                          99386 non-null  object 
 1   year                             99386 non-null  int64  
 2   accounts_receivable              96672 non-null  float64
 3   cash_on_hand_and_in_banks        97146 non-null  float64
 4   gross_revenue                    99209 non-null  float64
 5   inpatient_revenue                99209 non-null  float64
 6   less_total_operating_expense     99384 non-null  float64
 7   net_income                       99142 non-null  float64
 8   net_income_from_patients         99380 non-null  float64
 9   net_patient_revenue              99213 non-null  float64
 10  overhead_non_salary_costs        99386 non-null  float64
 11  snf_days_total                   99371 non-null  float64
 12  salaries_wages_and

If fiscal_report is null, all numerical variables of that row is also null. Therefore, it is safe to drop them in the code above:
```
    cost_report = cost_report[cost_report['fiscal_period'] > 0]
```
The following code proves so. Since they are not essential for data cleansing, they are commented out. Uncomment to see the results.

In [145]:
# Define numerical columns (cols 2–16) — adjust this if needed
columns_to_adjust = cost_report.columns[2:18]

# 1. Count nulls before filtering
nulls_before = cost_report[columns_to_adjust].isnull().sum()

# 2. Remove rows with invalid fiscal_period
filtered_cost_report = cost_report.dropna(subset=['fiscal_period'])
filtered_cost_report = filtered_cost_report[filtered_cost_report['fiscal_period'] > 0]

# 3. Count nulls after filtering
nulls_after = filtered_cost_report[columns_to_adjust].isnull().sum()

# 4. Calculate difference
nulls_removed_with_fiscal_period = nulls_before - nulls_after

# Display comparison
comparison_df = pd.DataFrame({
    'Nulls Before': nulls_before,
    'Nulls After': nulls_after,
    'Difference (Removed Nulls)': nulls_removed_with_fiscal_period
})

print(comparison_df)

# Optional: Total number of rows removed
rows_removed = cost_report.shape[0] - filtered_cost_report.shape[0]
print(f"\nTotal rows removed due to invalid fiscal_period: {rows_removed}")


                                 Nulls Before  Nulls After  \
accounts_receivable                      5038         2966   
cash_on_hand_and_in_banks                4567         2495   
gross_revenue                            2283          211   
inpatient_revenue                        2284          212   
less_total_operating_expense             2074            2   
net_income                               2360          288   
net_income_from_patients                 2081            9   
net_patient_revenue                      2279          207   
overhead_non_salary_costs                2072            0   
snf_days_total                           2092           20   
salaries_wages_and_fees_payable         11260         9188   
total_liab_and_fund_balances             2190          118   
total_salaries_adjusted                  2073            1   
total_fund_balances                      2434          362   
total_liabilities                        3017          945   
snf_admi

The following code further proves that the new number of rows is valid. Uncomment to see. 

In [None]:
# # 1. Unique provnum-year pairs in the original data
# original_pairs = cost_report[['provnum', 'year']]
# original_unique_pairs = original_pairs.dropna().drop_duplicates()
# num_original_pairs = original_unique_pairs.shape[0]
# print(f"Original unique (provnum, year) pairs: {num_original_pairs}")

# # 2. Drop rows with bad fiscal_period and track which ones
# filtered = cost_report.dropna(subset=['fiscal_period', 'year'])
# filtered = filtered[filtered['fiscal_period'] > 0]

# filtered_unique_pairs = filtered[['provnum', 'year']].drop_duplicates()
# num_filtered_pairs = filtered_unique_pairs.shape[0]
# print(f"Remaining unique (provnum, year) pairs after valid fiscal_period filter: {num_filtered_pairs}")

# # 3. Confirm drop count
# num_dropped = num_original_pairs - num_filtered_pairs
# print(f"Number of (provnum, year) combinations dropped: {num_dropped}")

# # 4. Optionally: Show which combinations were dropped
# dropped_pairs = pd.merge(original_unique_pairs, filtered_unique_pairs, 
#                          on=['provnum', 'year'], how='left', indicator=True)
# dropped_pairs = dropped_pairs[dropped_pairs['_merge'] == 'left_only']
# print(f"\nSample dropped pairs due to invalid fiscal_period:\n{dropped_pairs[['provnum', 'year']].head()}")


Original unique (provnum, year) pairs: 101290
Remaining unique (provnum, year) pairs after valid fiscal_period filter: 99386
Number of (provnum, year) combinations dropped: 1904

Sample dropped pairs due to invalid fiscal_period:
    provnum  year
184  525716  2015
185  525718  2015
186  525719  2015
187  175443  2015
191  245578  2015


### join with provider_info & back-fill

In [None]:
def left_join_with_provider_info(annual_report: pd.DataFrame, provider_info: pd.DataFrame) -> pd.DataFrame:
    """Performs a left join with provider_info and counts missing rows."""
    merged = pd.merge(annual_report, provider_info, on=['provnum', 'year'], how='left')
    provider_columns = [col for col in provider_info.columns if col not in ['provnum', 'year']]
    missing_provider_info = merged[provider_columns].isnull().all(axis=1).sum()
    print(f"Number of rows in annual_report without matching provider_info by provnum & year: {missing_provider_info}")
    return merged

merged_data = left_join_with_provider_info(annual_report, provider_info)

def backfill_provider_info(merged_data: pd.DataFrame, provider_info: pd.DataFrame) -> pd.DataFrame:
    """Backfills missing provider_info using a lookup table."""
    provider_columns = [col for col in provider_info.columns if col not in ['provnum', 'year']]
    provnum_lookup = provider_info.sort_values('year').drop_duplicates('provnum')[['provnum'] + provider_columns]
    merged_with_lookup = pd.merge(merged_data, provnum_lookup, on='provnum', how='left', suffixes=('', '_lookup'))
    for col in provider_columns:
        merged_with_lookup[col] = merged_with_lookup[col].combine_first(merged_with_lookup[f"{col}_lookup"])
        merged_with_lookup.drop(columns=[f"{col}_lookup"], inplace=True)
    return merged_with_lookup

merged_filled = backfill_provider_info(merged_data, provider_info)



Number of rows in annual_report without matching provider_info by provnum & year: 306
Total rows still unmatched by provnum: 36


Unnamed: 0,provnum,year,accounts_receivable,cash_on_hand_and_in_banks,gross_revenue,inpatient_revenue,less_total_operating_expense,net_income,net_income_from_patients,net_patient_revenue,overhead_non_salary_costs,snf_days_total,salaries_wages_and_fees_payable,total_liab_and_fund_balances,total_salaries_adjusted,total_fund_balances,total_liabilities,snf_admissions_total,rural_versus_urban,provname,state,address,city,zip,ownership,restot,overall_rating,survey_rating,quality_rating,staffing_rating,adj_total
1467,15462,2015,489631.0,8150.0,4678692,4678692,4430379,-1345434,-1346342,3084037,2258291,9801.0,77411.0,-1382587.0,2172088,-2283504.0,900917.0,139.0,U,,,,,,,,,,,,
1468,15462,2016,439443.0,-15327.0,3589981,3589981,3912985,-1287130,-1287343,2625642,2078666,8086.0,59087.0,-2393267.0,1834319,-3576914.0,1183647.0,97.0,U,,,,,,,,,,,,
2146,35209,2015,1811816.0,1110959.0,2005581,2005581,6153315,-4312947,-4336515,1816800,3826314,1586.0,84340.0,11508249.0,2011792,-10891845.0,22400094.0,,U,,,,,,,,,,,,
4029,55010,2015,3298810.0,-407741.0,5913237,5913237,5514745,-23861,-20653,5494092,3456819,22542.0,354431.0,8647159.0,2057926,7622443.0,1024715.0,293.0,U,,,,,,,,,,,,
7754,56343,2015,1734878.0,-976953.0,2950812,2950812,3350351,-491860,-523057,2827294,2132757,18508.0,538076.0,4299638.0,1217595,-4209483.0,8509122.0,78.0,U,,,,,,,,,,,,
9669,65428,2018,1703602.0,444641.0,4078952,4078952,12720675,-9477095,-9502965,3217710,8658628,6893.0,406419.0,1646912.0,4062047,-13569991.0,15216903.0,334.0,U,,,,,,,,,,,,
16269,106141,2020,324044.0,1456.0,13113586,13113586,8115922,4160229,4158850,12274771,5955413,7806.0,-8668.0,18715126.0,2160509,4160229.0,14554896.0,,U,,,,,,,,,,,,
18526,115775,2021,971847.0,8600740.0,2443422,2427833,12694924,-10250651,-10379156,2315768,9344219,4759.0,179180.0,78873423.0,3350705,27147136.0,51726287.0,,U,,,,,,,,,,,,
29271,165422,2015,1285286.0,293976.0,2321323,2321323,2283342,-161332,-175797,2107545,1217907,14154.0,159752.0,1600721.0,1065435,1130664.0,470057.0,68.0,U,,,,,,,,,,,,
30452,165790,2021,293893.0,1914277.0,3751713,3751713,6584107,-2698095,-3023417,3560691,4309033,4846.0,446377.0,67985069.0,2275075,13773035.0,54212035.0,,U,,,,,,,,,,,,


The rows that are still left unmatched are shown below. Uncomment to see. 
Not necessary to include since they cannot be identifiable. 

In [None]:
def get_unmatched_rows_by_provname(merged_data: pd.DataFrame) -> pd.DataFrame:
    """Returns rows where provname is still missing."""
    unmatched_rows = merged_data[merged_data['provname'].isnull()]
    print(f"Total rows still unmatched by provnum: {len(unmatched_rows)}")
    return unmatched_rows

unmatched_rows = get_unmatched_rows_by_provname(merged_filled)
from IPython.display import display
display(unmatched_rows)

In [None]:
# Final 

cost_report_provider_info= merged_filled.dropna(subset=['provname'])
print(cost_report_provider_info.shape)

(99350, 31)


## Export

Uncomment to save them to local machine. 

In [None]:
## Save to CSV
# health_deficiency.to_csv('health_deficiencies.csv', index=False, encoding='utf-8')
# quality.to_csv('quality.csv', index=False, encoding='utf-8')
# penalties.to_csv('penalties.csv', index=False, encoding='utf-8')
# cost_report_provider_info.to_csv('cost_report_provider_info.csv', index=False, encoding='utf-8')
