In [1]:
import pandas as pd
import os
import boto3
import io
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
bucket_name = "lneg-loka"
file_name = "patient_data_raw.csv"
processed_file_name = "patient_data_processed.csv"

In [3]:
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket_name, Key=file_name)
csv_string = obj['Body'].read().decode('utf-8')
df = pd.read_csv(io.StringIO(csv_string))

In [None]:
df

In [None]:
df['chronic_obstructive_pulmonary_disease'].unique()

In [None]:
df['chronic_obstructive_pulmonary_disease'].hist()

In [None]:
df.isna().sum()

In [None]:
df.loc[df['chronic_obstructive_pulmonary_disease']=='A','exercise_frequency'].hist() #Plot class-conditional distribution of exercise_frequency for classes A, B, C and D. Beware, order of bins changes!

By plotting class-conditional distribution of 'exercise_frequency', it can be seen that it should have little predictive power for prediction chronic obstructive pulmonary disease - the distribution is virtually equal for all classes.

In [None]:
df.loc[df['chronic_obstructive_pulmonary_disease']=='D','education_level'].hist() #Plot class-conditional distribution of education_level for classes A, B, C and D. Beware, order of bins changes!

By plotting class-conditional distribution of 'education_level', it can be seen that it should have little predictive power for prediction chronic obstructive pulmonary disease - the distribution is virtually equal for all classes.

Since the distributions of educational_level and exercise_frequency are virtually class-independent, these features should have little predictive power for predicting chronc obstructive pulmonary disease. Because of this, I decided to discard these features instead of other more complex/wasteful solutions to deal with missing/NaN values.

In [None]:
covariate_name = "education_level"
target="D"
bins_temp = df[df["chronic_obstructive_pulmonary_disease"]==target][covariate_name].unique()
bins = np.array([i for i in bins_temp if str(i).lower() != 'nan'])
bins.sort()
df[df["chronic_obstructive_pulmonary_disease"]==target][covariate_name].value_counts().loc[bins].plot.bar() #use for categorical variables
#df[df["chronic_obstructive_pulmonary_disease"]==target][covariate_name].hist(bins=20) #use for continuous real-valued variables

All variables have equal class-conditional distributions. Should not be possible to get meaningful predictions for chronic obstructive pulmonary disease from these features.
There's almost perfect collinearity between certain features, e.g. BMI and alanine_aminotransferase. Is the data of one of these features corrupted? In any case at least one of these should be dropped

In [19]:
df_new = df.copy()

Should make preprocessing a part of model pipeline to avoid having to preprocess test data at inference time

In [None]:
df_new = df_new.drop('exercise_frequency',axis=1)
df_new = df_new.drop('education_level',axis=1)
df_new = df_new.drop('patient_id',axis=1)
df_new = df_new.drop("alanine_aminotransferase",axis=1)

In [21]:
df_new

Unnamed: 0,age,sex,bmi,smoker,diagnosis_code,medication_count,days_hospitalized,readmitted,last_lab_glucose,diet_quality,income_bracket,urban,albumin_globulin_ratio,chronic_obstructive_pulmonary_disease,alanine_aminotransferase
0,69,Male,25.3,No,D1,1,9,1,100.8,Average,Middle,0,0.8934,B,25.4423
1,32,Male,27.0,Yes,D4,1,7,0,106.3,Good,High,1,0.7104,C,27.0529
2,89,Female,30.6,No,D1,2,9,0,138.4,Good,Middle,1,0.1078,B,30.4405
3,78,Male,17.8,Yes,D1,4,5,0,81.8,Average,Low,1,0.3754,C,17.5797
4,38,Female,37.7,No,D3,5,6,0,115.2,Poor,Low,1,0.0994,D,37.7834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,88,Female,29.9,Yes,D1,8,3,1,104.0,Poor,Low,1,0.7400,B,30.0000
9996,43,Male,32.7,No,D5,2,4,0,69.5,Average,Low,1,0.3098,C,32.6781
9997,63,Male,34.1,Yes,D2,4,9,1,78.0,Good,High,1,0.8163,B,34.1534
9998,63,Female,28.0,No,D5,6,7,1,100.8,Average,Low,0,0.0544,A,28.0628


In [24]:
label_encoder = LabelEncoder()
df_new['diet_quality'] = label_encoder.fit_transform(df['diet_quality'])
df_new['income_bracket'] = label_encoder.fit_transform(df['income_bracket'])

In [25]:
multinomial_categories = ['diagnosis_code']
multinomial_df = pd.get_dummies(df_new[multinomial_categories],drop_first=False)
binomial_categories =['sex','smoker']
binomial_df = pd.get_dummies(df_new[binomial_categories],drop_first=True)
df_new = df_new.drop(multinomial_categories,axis=1)
df_new = df_new.drop(binomial_categories,axis=1)
df_new = pd.concat([df_new,binomial_df],axis=1)
df_new = pd.concat([df_new,multinomial_df],axis=1)
df_new

Unnamed: 0,age,bmi,medication_count,days_hospitalized,readmitted,last_lab_glucose,diet_quality,income_bracket,urban,albumin_globulin_ratio,chronic_obstructive_pulmonary_disease,sex_Male,smoker_Yes,diagnosis_code_D1,diagnosis_code_D2,diagnosis_code_D3,diagnosis_code_D4,diagnosis_code_D5
0,69,25.3,1,9,1,100.8,0,2,0,0.8934,B,True,False,True,False,False,False,False
1,32,27.0,1,7,0,106.3,1,0,1,0.7104,C,True,True,False,False,False,True,False
2,89,30.6,2,9,0,138.4,1,2,1,0.1078,B,False,False,True,False,False,False,False
3,78,17.8,4,5,0,81.8,0,1,1,0.3754,C,True,True,True,False,False,False,False
4,38,37.7,5,6,0,115.2,2,1,1,0.0994,D,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,88,29.9,8,3,1,104.0,2,1,1,0.7400,B,False,True,True,False,False,False,False
9996,43,32.7,2,4,0,69.5,0,1,1,0.3098,C,True,False,False,False,False,False,True
9997,63,34.1,4,9,1,78.0,1,0,1,0.8163,B,True,True,False,True,False,False,False
9998,63,28.0,6,7,1,100.8,0,1,0,0.0544,A,False,False,False,False,False,False,True


In [26]:
buffer = io.StringIO()
df_new.to_csv(buffer)
s3.put_object(Bucket = bucket_name, Key = processed_file_name, Body = buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'RZAJMS4AQE0N9PYK',
  'HostId': 'mxCCZvn5SWbxdyXgeFfdZJce5qUNJY21LwIVTRbQNEfPlVZvi4kl1I3XE5XpbE+J1CkcVtwyhJ4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'mxCCZvn5SWbxdyXgeFfdZJce5qUNJY21LwIVTRbQNEfPlVZvi4kl1I3XE5XpbE+J1CkcVtwyhJ4=',
   'x-amz-request-id': 'RZAJMS4AQE0N9PYK',
   'date': 'Mon, 13 Oct 2025 12:59:35 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"f5144294b0304c4d00ed375b0f8b66fd"',
   'x-amz-checksum-crc32': 'Egj9qg==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"f5144294b0304c4d00ed375b0f8b66fd"',
 'ChecksumCRC32': 'Egj9qg==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}