In [1]:
# Importing necessary libraries

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Activate seaborn's default theme for any plots drawn afterwards
sns.set()

# to split the data into train and test
from sklearn.model_selection import train_test_split

# to build linear regression_model
from sklearn.linear_model import LinearRegression

# to check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# to build linear regression_model using statsmodels
import statsmodels.api as sm


# Pandas display settings:
#   - display.max_columns = None -> no cap on the number of columns shown
#   - display.max_rows    = 200  -> show at most 200 rows before truncating
display_options = {
    "display.max_columns": None,
    "display.max_rows": 200,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [2]:
# Read the standard-charges CSV from disk into the `data` DataFrame
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is
raw_csv_path = "/home/noman/local-LLM-with-RAG/721563402_Mat-Su_standardcharges.csv"
data = pd.read_csv(raw_csv_path)

In [3]:
# Preview the first five rows of the freshly loaded data
data.head(n=5)

Unnamed: 0,SVCCD,Description,CPT,Gross Charge,As of Date,DISCOUNTED CASH PRICE INPATIENT,DISCOUNTED CASH PRICE OUTPATIENT,DE-IDENTIFIED MIN,DE-IDENTIFIED MAX,Premera Blue Cross of AK - All Plans,Premera Blue Cross of AK - CHS Employees - All Plans,Aetna HMO PPO POS - All Plans,Aetna AK Care (State) HMO PPO POS - All Plans,Pacific Health Coalition PPO - All Plans,United Healthcare PPO POS - All Plans,Cigna PPO - All Plans,MODA PPO - All Plans,Multiplan PPO - All Plans
0,1610122,BONE AGE STUDY DR,77072.0,422.0,2/13/2023,211.0,126.6,193.698,396.258,205.092,193.698,295.822,284.428,238.852,281.474,303.418,315.234,396.258
1,1317090,HLA2 TYP HR 1 AL EA,81383.0,659.0,2/13/2023,329.5,197.7,302.481,618.801,320.274,302.481,461.959,444.166,372.994,439.553,473.821,492.273,618.801
2,1219691,DILTIAZE 120MG SR CP,,11.0,2/13/2023,5.5,3.3,5.049,10.329,5.346,5.049,7.711,7.414,6.226,7.337,7.909,8.217,10.329
3,1317006,BRAF GENE ANALY V600,81210.0,600.0,2/13/2023,300.0,180.0,275.4,563.4,291.6,275.4,420.6,404.4,339.6,400.2,431.4,448.2,563.4
4,1641004,MR-UPPER JOINT W,73222.0,5181.0,2/13/2023,2590.5,1554.3,2378.079,4864.959,2517.966,2378.079,3631.881,3491.994,2932.446,3455.727,3725.139,3870.207,4864.959


In [4]:
# Keep only the columns of interest; every other column present in the
# frame is collected into a drop-list and removed in one call.
columns_to_keep = ['Description', 'CPT', 'Gross Charge']
columns_to_drop = [name for name in data.columns if name not in columns_to_keep]
data = data.drop(columns=columns_to_drop)

# Preview the trimmed dataset
data.head()

Unnamed: 0,Description,CPT,Gross Charge
0,BONE AGE STUDY DR,77072.0,422.0
1,HLA2 TYP HR 1 AL EA,81383.0,659.0
2,DILTIAZE 120MG SR CP,,11.0
3,BRAF GENE ANALY V600,81210.0,600.0
4,MR-UPPER JOINT W,73222.0,5181.0


In [5]:
# # Save the modified DataFrame as a CSV file
# data.to_csv('/home/noman/local-LLM-with-RAG/Research/721563402_Mat-Su_standardcharges_modified_v1.csv', index=False)

In [5]:
# Preview up to the first 20 rows
data.head(n=20)

Unnamed: 0,Description,CPT,Gross Charge
0,BONE AGE STUDY DR,77072,422.0
1,HLA2 TYP HR 1 AL EA,81383,659.0
2,DILTIAZE 120MG SR CP,,11.0
3,BRAF GENE ANALY V600,81210,600.0
4,MR-UPPER JOINT W,73222,5181.0
5,NEEDLES 2,,1088.0
6,BUPIV.5%/EPI PF 30ML,,47.0
7,TRAMADOL 50MG TAB,,11.0
8,HEPATITIS B GLOB 1ML,90371,551.0
9,PACU PHASE 2 IN 15MN,,1103.0


In [6]:
# Discard every row that has a missing value in any of the three kept columns
required_columns = ['Description', 'CPT', 'Gross Charge']
data = data.dropna(subset=required_columns)

# Persist the cleaned DataFrame to a new CSV file (row index is not written)
cleaned_csv_path = '/home/noman/local-LLM-with-RAG/Research/721563402_Mat-Su_standardcharges_modified_v2.csv'
data.to_csv(cleaned_csv_path, index=False)

# Preview up to the first 20 rows of the cleaned dataset
data.head(20)

Unnamed: 0,Description,CPT,Gross Charge
0,BONE AGE STUDY DR,77072,422.0
1,HLA2 TYP HR 1 AL EA,81383,659.0
3,BRAF GENE ANALY V600,81210,600.0
4,MR-UPPER JOINT W,73222,5181.0
8,HEPATITIS B GLOB 1ML,90371,551.0
11,CATH-INFUS PERIF 13,C1751,12486.0
12,IR-ABD AORTA SERIAL,75625,9052.0
13,AMINOPHYLL 250MG INJ,J0280,77.0
15,IR-FEM POP REVAS ATH,SURG,56890.0
16,SP-ENT SPN 2OR3 V DR,72082,561.0


In [7]:
# Inspect 5 randomly chosen rows; random_state is fixed so the same rows come back on re-run
data.sample(5, random_state=1)

Unnamed: 0,Description,CPT,Gross Charge
4138,SKIN DEBRIDEMENT WITHOUT CC/MCC,572,74874.7
2498,MICRODISSECTION MANU,88381,288.0
2365,TL201 THALC PER MCI,A9505,218.0
5018,Njx aa&/strd trigeminal nrv,64400,2161.2
1908,NALOXONE HCL 1MG INJ,J2310,98.0


In [8]:
# Dimensions of the cleaned dataset as (rows, columns)
data.shape

(3900, 3)

In [9]:
# Summarise the index range, per-column non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3900 entries, 0 to 5067
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Description   3900 non-null   object 
 1   CPT           3900 non-null   object 
 2   Gross Charge  3900 non-null   float64
dtypes: float64(1), object(2)
memory usage: 121.9+ KB


In [11]:
# Work on an independent (deep) copy so later changes to `df` do not touch `data`
df = data.copy(deep=True)

In [9]:
import os
from PyPDF2 import PdfReader, PdfWriter

# Source PDF to split, and the directory that receives the per-page files.
# NOTE(review): this notebook only writes the *_modified_v2.csv; the
# *_modified_v2.pdf read here is presumably produced outside the notebook — confirm.
input_pdf_path = "/home/noman/local-LLM-with-RAG/Research/721563402_Mat-Su_standardcharges_modified_v2.pdf"
output_folder = "/home/noman/local-LLM-with-RAG/Research"

# Make sure the destination directory is present (no error if it already exists)
os.makedirs(output_folder, exist_ok=True)

# Open the source document
reader = PdfReader(input_pdf_path)

# Emit each page as its own single-page PDF named page_<n>.pdf (numbering starts at 1)
for page_num, page in enumerate(reader.pages):
    writer = PdfWriter()
    writer.add_page(page)

    output_path = os.path.join(output_folder, f"page_{page_num + 1}.pdf")

    # Binary write of the one-page document
    with open(output_path, "wb") as output_pdf:
        writer.write(output_pdf)

print("PDF pages have been split and saved successfully.")

PDF pages have been split and saved successfully.
