In [1]:
import pandas as pd

In [2]:
# Load the raw admissions extract. index_col=False tells pandas not to use
# the first CSV column as the row index, so a default RangeIndex is created.
raw_admissions_path = "../data/raw/hospital_admission.csv"
admissions = pd.read_csv(raw_admissions_path, index_col=False)
admissions.head()

Unnamed: 0,CaseOrder,Customer_id,Interaction,UID,City,State,County,Zip,Lat,Lng,...,TotalCharge,Additional_charges,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8
0,1,C412403,8cd49b13-f45a-4b47-a2bd-173ffa932c2f,3a83ddb66e2ae73798bdf1d705dc0932,Eva,AL,Morgan,35621,34.3496,-86.72508,...,3726.70286,17939.40342,3,3,2,2,4,3,3,4
1,2,Z919181,d2450b70-0337-4406-bdbb-bc1037f1734c,176354c5eef714957d486009feabf195,Marianna,FL,Jackson,32446,30.84513,-85.22907,...,4193.190458,17612.99812,3,4,3,4,4,4,3,3
2,3,F995323,a2057123-abf5-4a2c-abad-8ffe33512562,e19a0fa00aeda885b8a436757e889bc9,Sioux Falls,SD,Minnehaha,57110,43.54321,-96.63772,...,2434.234222,17505.19246,2,4,4,4,3,4,3,3
3,4,A879973,1dec528d-eb34-4079-adce-0d7a40e82205,cd17d7b6d152cb6f23957346d11c3f07,New Richland,MN,Waseca,56072,43.89744,-93.51479,...,2127.830423,12993.43735,3,5,5,3,4,5,5,5
4,5,C544523,5885f56b-d6da-43a3-8760-83583af94266,d2f0425877b10ed6bb381f3e2579424a,West Point,VA,King William,23181,37.59894,-76.88958,...,2113.073274,3716.525786,2,1,3,3,5,3,4,3


In [3]:
# Summarize the loaded frame: column names, non-null counts, and dtypes
admissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CaseOrder           10000 non-null  int64  
 1   Customer_id         10000 non-null  object 
 2   Interaction         10000 non-null  object 
 3   UID                 10000 non-null  object 
 4   City                10000 non-null  object 
 5   State               10000 non-null  object 
 6   County              10000 non-null  object 
 7   Zip                 10000 non-null  int64  
 8   Lat                 10000 non-null  float64
 9   Lng                 10000 non-null  float64
 10  Population          10000 non-null  int64  
 11  Area                10000 non-null  object 
 12  TimeZone            10000 non-null  object 
 13  Job                 10000 non-null  object 
 14  Children            10000 non-null  int64  
 15  Age                 10000 non-null  int64  
 16  Incom

In [4]:
# Readmissions are costly to hospitals because a high readmission rate may result in fines or penalties.
# To reduce readmissions, hospitals can focus on identifying patients who are at high risk of readmission.
# During this project, I will use machine learning to predict readmissions and identify high-risk patients.



In [5]:
# I will iterate through the data to identify and remove any categorical features that
# have more than five levels. This keeps the number of one-hot encoded columns manageable
# and reduces (though it does not fully eliminate) the risk of multicollinearity.

In [None]:
## Remove object (categorical) variables that have more than 5 levels

# Count distinct values for every object-dtype column in one pass
object_level_counts = admissions.select_dtypes(include=['object']).nunique()

# Columns whose level count exceeds 5, kept in their original column order
cols_to_drop = object_level_counts[object_level_counts > 5].index.tolist()

admissions.drop(columns=cols_to_drop, inplace=True)



In [7]:
# Columns excluded from the feature set before encoding
unused_columns = [
    'Zip', 'CaseOrder', 'Lat', 'Lng', 'Population', 'Children',
    'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8',
]
admissions.drop(columns=unused_columns, inplace=True)

In [8]:
# To prepare the data for machine learning, I will convert the categorical data into numerical data using one-hot encoding.
# This ensures that the machine learning model can use the categorical information, since it works on numeric input.

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Categorical (object-dtype) columns that remain after the earlier drops
cat_columns = admissions.select_dtypes(include='object').columns

# One-hot encode them into a dense array; drop='first' omits the first level
# of each column so every feature is represented by (levels - 1) indicators
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(admissions[cat_columns])
encoded_columns = encoder.get_feature_names_out(cat_columns)

# Non-categorical columns cast to float, followed by the encoded indicators;
# the encoded frame reuses the original index so rows stay aligned
cat_removed = admissions.drop(columns=cat_columns).astype('float')
encoded_admissions = pd.concat(
    [
        cat_removed,
        pd.DataFrame(encoded_features, columns=encoded_columns, index=admissions.index),
    ],
    axis=1,
)


In [11]:
# Inspect the dtypes of the combined frame after the float cast and one-hot encoding
encoded_admissions.dtypes

Age                                    float64
Income                                 float64
VitD_levels                            float64
Doc_visits                             float64
Full_meals_eaten                       float64
vitD_supp                              float64
Initial_days                           float64
TotalCharge                            float64
Additional_charges                     float64
Area_Suburban                          float64
Area_Urban                             float64
Marital_Married                        float64
Marital_Never Married                  float64
Marital_Separated                      float64
Marital_Widowed                        float64
Gender_Male                            float64
Gender_Nonbinary                       float64
ReAdmis_Yes                            float64
Soft_drink_Yes                         float64
Initial_admin_Emergency Admission      float64
Initial_admin_Observation Admission    float64
HighBlood_Yes

In [12]:
# I will now split the data into training and testing sets.
# This will allow me to train the model on one portion of the data and test its accuracy on a different portion.
# This will be an 80/20 split: 80% of the rows for training and 20% for testing. Stratifying on the target ensures that both sets preserve the overall class proportions.


In [13]:
# Split the encoded data into training and testing sets
from sklearn.model_selection import train_test_split

# Separate the target indicator from the predictor columns
target_column = 'ReAdmis_Yes'
y = encoded_admissions[target_column]
X = encoded_admissions.drop(columns=target_column)

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

print(f'Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}')
print(f'Target distribution - Train: \n{y_train.value_counts(normalize=True)}, Test: \n{y_test.value_counts(normalize=True)}')


Training data shape: (8000, 36), Training labels shape: (8000,)
Target distribution - Train: 
ReAdmis_Yes
0.0    0.633125
1.0    0.366875
Name: proportion, dtype: float64, Test: 
ReAdmis_Yes
0.0    0.633
1.0    0.367
Name: proportion, dtype: float64


In [14]:
# Now that the data is split into training and testing sets, I will use the chi-squared test to identify
# and remove any features that are not significant in predicting readmissions.
# I will use a p-value threshold of 0.05 and a minimum chi-squared score of 0.5 to determine significance.

# In addition, SelectKBest will be used to select the top 7 features based on the chi-squared test.
# The results of SelectKBest and the threshold-based filter will be compared to confirm that the selected features are significant.

### Chi2

In [15]:
from sklearn.feature_selection import SelectKBest, chi2

In [16]:
# Keep the 7 features with the highest chi-squared scores.
# The selector is fit on the training split only and then applied to the test split.
kBest = SelectKBest(chi2, k=7)

X_train_selected = kBest.fit_transform(X_train, y_train)
X_test_selected = kBest.transform(X_test)

selected_features = X_train.columns[kBest.get_support()].tolist()
print(f'Selected features: {selected_features}')


def _attach_target(feature_values, target, feature_names):
    """Build a frame from the selected feature values and append the target column."""
    frame = pd.DataFrame(feature_values, columns=feature_names)
    # Reset the target's index so it lines up with the frame's fresh 0..n-1 index
    frame['ReAdmis_Yes'] = target.reset_index(drop=True)
    return frame


# Final train / test frames
train_final = _attach_target(X_train_selected, y_train, selected_features)
test_final = _attach_target(X_test_selected, y_test, selected_features)

print(f'Final training data shape: {train_final.shape}')
print(f'Final testing data shape: {test_final.shape}')

# NOTE(review): these files contain the k=7 SelectKBest columns; the later
# p-value filter keeps fewer features - confirm which set the next notebook expects.
train_final.to_csv('../data/processed/train_admissions_final.csv', index=False)
test_final.to_csv('../data/processed/test_admissions_final.csv', index=False)


Selected features: ['Age', 'Income', 'Initial_days', 'TotalCharge', 'Additional_charges', 'Services_CT Scan', 'Services_Intravenous']
Final training data shape: (8000, 8)
Final testing data shape: (2000, 8)


In [17]:
# Scores & p-values from the fitted selector, one entry per candidate feature
scores, pvalues = kBest.scores_, kBest.pvalues_

# Tabulate them against the feature names, highest chi-squared score first
feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': scores, 'P-Value': pvalues})
    .sort_values(by='Score', ascending=False)
)
print(feature_scores)

                                Feature         Score        P-Value
7                           TotalCharge  5.093640e+06   0.000000e+00
6                          Initial_days  1.162429e+05   0.000000e+00
1                                Income  3.275102e+03   0.000000e+00
8                    Additional_charges  8.430335e+02  2.377389e-185
0                                   Age  7.194535e+00   7.312594e-03
34                 Services_Intravenous  4.651111e+00   3.103346e-02
33                     Services_CT Scan  3.738533e+00   5.317110e-02
5                             vitD_supp  3.329417e+00   6.805099e-02
35                         Services_MRI  1.891870e+00   1.689912e-01
12                Marital_Never Married  1.597102e+00   2.063144e-01
32                           Asthma_Yes  1.263395e+00   2.610095e-01
4                      Full_meals_eaten  1.148041e+00   2.839596e-01
28                         BackPain_Yes  1.142959e+00   2.850279e-01
18    Initial_admin_Emergency Admi

In [18]:
# Minimum chi-squared score; a higher score indicates a stronger association with the target.
# NOTE(review): chi2 scores grow with a feature's magnitude, so large-valued columns
# such as TotalCharge dominate the ranking - treat the score cut-off with caution.
score_threshold = 0.5
# Standard p-value threshold
pvalue_threshold = 0.05

# A feature is kept only when it passes both the score and the p-value criteria
passes_score = feature_scores['Score'] >= score_threshold
passes_pvalue = feature_scores['P-Value'] <= pvalue_threshold
selected_features = feature_scores.loc[passes_score & passes_pvalue, 'Feature'].tolist()

# Restrict the full encoded dataset to the kept features plus the target
final_admissions = encoded_admissions[selected_features + ['ReAdmis_Yes']]

final_admissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TotalCharge           10000 non-null  float64
 1   Initial_days          10000 non-null  float64
 2   Income                10000 non-null  float64
 3   Additional_charges    10000 non-null  float64
 4   Age                   10000 non-null  float64
 5   Services_Intravenous  10000 non-null  float64
 6   ReAdmis_Yes           10000 non-null  float64
dtypes: float64(7)
memory usage: 547.0 KB


In [20]:
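# The review of features from both methods indicates that the features to be used are:
# TotalCharge, Initial_days, Income, Additional_charges, Age, Services_Intravenous.
# Services_CT Scan was also selected by SelectKBest (k=7), but its p-value (0.053) is above the 0.05 threshold, so it is excluded.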

In [19]:
# Random Forest Classifier will be implemented in the next notebook