In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Read the raw CSV into `df`.
# NOTE(review): absolute path -- presumably a Colab upload location; confirm before running elsewhere.
df=pd.read_csv('/content/fake_medical_data_with_issues.csv')

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Independent copy of the frame as loaded; a later cell reads
# data['Doctor_Notes'] from it after `df` has been modified.
data=df.copy()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
df['Age'].fillna(df['Age'].mean(),inplace=True)  #fill missing value

In [None]:
df['Age'].isnull().sum()

In [None]:
df['Gender'].info()

In [None]:
df['Gender'].value_counts()

In [None]:
null_indices = df[df['Gender'].isna()].index
fill_values = ['Female'] * 65 + ['Male'] * 10
np.random.shuffle(fill_values)
df.loc[null_indices, 'Gender'] = fill_values

In [None]:
df['Height_cm'].fillna(df['Height_cm'].mean(),inplace=True)
df['Weight_kg'].fillna(df['Weight_kg'].mean(),inplace=True)    # replace missing values

In [None]:
df['Height_cm'].isnull().sum()

In [None]:
df['Weight_kg'].isnull().sum()

In [None]:
df['BMI'].value_counts()

In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(x='BMI', data=df, palette='Set2')
plt.title('BMI Category Distribution')
plt.xlabel('BMI Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
df['BMI'].fillna(df['BMI'].mode()[0], inplace=True)

In [None]:
df['BMI'].isnull().sum()

In [None]:
df['Blood_Pressure'].value_counts()

In [None]:
if 'Blood_Pressure' in df.columns:
    split_cols = df['Blood_Pressure'].str.split('/', expand=True)
    df['Systolic'] = pd.to_numeric(split_cols[0], errors='coerce')
    df['Diastolic'] = pd.to_numeric(split_cols[1], errors='coerce')

In [None]:
df.drop('Patient_ID',axis=1,inplace=True)
df.drop('Blood_Pressure',axis=1,inplace=True)

In [None]:
df['Heart_Rate'].isnull().sum()

In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(x='Heart_Rate', data=df, palette='Set2')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
df['Heart_Rate'].fillna(df['BMI'].mode()[0], inplace=True)

In [None]:
df['Heart_Rate'].isnull().sum()

In [None]:
df['Cholesterol'].info()

In [None]:
null_indices = df[df['Cholesterol'].isna()].index
n_nulls = len(null_indices)
fill_values = ['Normal'] * 40 + ['High'] * 45 + ['Low'] * 42
total_known = len(fill_values)
p_normal = 40 / total_known
p_high = 45 / total_known
p_low = 42 / total_known
n_normal = int(p_normal * n_nulls)
n_high = int(p_high * n_nulls)
n_low = n_nulls - n_normal - n_high
fill_nulls = ['normal'] * n_normal + ['high'] * n_high + ['low'] * n_low
np.random.shuffle(fill_nulls)
df.loc[null_indices, 'Cholesterol'] = fill_nulls

In [None]:
df['Diabetes'].value_counts()

In [None]:
null_indices = df[df['Diabetes'].isna()].index
count_y = 45
count_no = 43
count_yes = 30
total_known = count_y + count_no + count_yes
p_y = count_y / total_known
p_no = count_no / total_known
p_yes = count_yes / total_known
n_nulls = len(null_indices)
n_y = int(p_y * n_nulls)
n_no = int(p_no * n_nulls)
n_yes = n_nulls - n_y - n_no
fill_values = ['Y'] * n_y + ['No'] * n_no + ['Yes'] * n_yes
np.random.shuffle(fill_values)
df.loc[null_indices, 'Diabetes'] = fill_values


In [None]:
df['Diabetes'].info()

In [None]:
# Non-null count and dtype of the Smoker column.
df['Smoker'].info()

In [None]:
# Remove the Smoker column from `df` in place; none of the later cells
# impute or encode it.
df.drop('Smoker',axis=1,inplace=True)

In [None]:
df['Exercise_Freq'].value_counts()

In [None]:
null_indices = df[df['Exercise_Freq'].isna()].index
count_daily = 49
count_never = 40
count_rarely = 36
count_weekly = 34
total_known = count_daily + count_never + count_rarely + count_weekly
p_daily = count_daily / total_known
p_never = count_never / total_known
p_rarely = count_rarely / total_known
p_weekly = count_weekly / total_known
n_nulls = len(null_indices)
n_daily = int(p_daily * n_nulls)
n_never = int(p_never * n_nulls)
n_rarely = int(p_rarely * n_nulls)
n_weekly = n_nulls - n_daily - n_never - n_rarely
fill_values = (
    ['Daily'] * n_daily +
    ['Never'] * n_never +
    ['Rarely'] * n_rarely +
    ['Weekly'] * n_weekly
)
np.random.shuffle(fill_values)
df.loc[null_indices, 'Exercise_Freq'] = fill_values

In [None]:
df['Exercise_Freq'].info()

In [None]:
df['Medication'].value_counts()

In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(x='Medication', data=df, palette='Set2')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
df['Medication'].info()

In [None]:
null_indices = df[df['Medication'].isna()].index
count_druga = 45
count_drugb = 39
count_unknown = 35
total_known = count_druga + count_drugb + count_unknown
p_druga = count_druga / total_known
p_drugb = count_drugb / total_known
p_unknown = count_unknown / total_known
n_nulls = len(null_indices)
n_druga = int(p_druga * n_nulls)
n_drugb = int(p_drugb * n_nulls)
n_unknown = n_nulls - n_druga - n_drugb
fill_values = (
    ['DrugA'] * n_druga +
    ['DrugB'] * n_drugb +
    ['Unknown'] * n_unknown
)
np.random.shuffle(fill_values)
df.loc[null_indices, 'Medication'] = fill_values

In [None]:
df['Medication'].info()

In [None]:
df['Diagnosis'].value_counts()

In [None]:
df['Diagnosis'].isnull().sum()

In [None]:
null_indices = df[df['Diagnosis'].isna()].index
count_diabetes = 48
count_hypertension = 38
total_known = count_diabetes + count_hypertension
p_diabetes = count_diabetes / total_known
p_hypertension = count_hypertension / total_known
n_nulls = len(null_indices)
n_diabetes = int(p_diabetes * n_nulls)
n_hypertension = n_nulls - n_diabetes
fill_values = (
    ['Diabetes'] * n_diabetes +
    ['Hypertension'] * n_hypertension
)
np.random.shuffle(fill_values)
df.loc[null_indices, 'Diagnosis'] = fill_values

In [None]:
df['Diagnosis'].info()

In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(x='Diagnosis', data=df, palette='Set2')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
df['Visit_Date'].value_counts()

In [None]:
df['Visit_Date'] = pd.to_datetime(df['Visit_Date'], errors='coerce')
null_indices = df[df['Visit_Date'].isna()].index
n_nulls = len(null_indices)
date1 = pd.Timestamp('2025-01-10')
date2 = pd.Timestamp('2025-02-20')
count_date1 = 47
count_date2 = 43
total_known = count_date1 + count_date2
p_date1 = count_date1 / total_known
p_date2 = count_date2 / total_known
n_date1 = int(p_date1 * n_nulls)
n_date2 = n_nulls - n_date1
fill_values = [date1] * n_date1 + [date2] * n_date2
np.random.shuffle(fill_values)
df.loc[null_indices, 'Visit_Date'] = fill_values

In [None]:
df['Year'] = df['Visit_Date'].dt.year
df['Month'] = df['Visit_Date'].dt.month
df['Day'] = df['Visit_Date'].dt.day
df['Weekday'] = df['Visit_Date'].dt.weekday

In [None]:
df['Visit_Date'].info()

In [None]:
df['Follow_Up_Days'].value_counts()

In [None]:
df['Follow_Up_Days'].isnull().sum()

In [None]:
df['Follow_Up_Days'].fillna(df['Follow_Up_Days'].mean(), inplace=True)

In [None]:
df['Follow_Up_Days'].isnull().sum()

In [None]:
df.head()

In [None]:
df['Doctor_Notes']=data['Doctor_Notes']

In [None]:
df['Doctor_Notes'].info()

In [None]:
df['Doctor_Notes'].value_counts()

In [None]:
null_indices = df[df['Doctor_Notes'].isna()].index
n_nulls = len(null_indices)
p_check = 56 / (56 + 46)
p_follow = 46 / (56 + 46)
n_check = int(p_check * n_nulls)
n_follow = n_nulls - n_check
fill_values = ['Check BP'] * n_check + ['Follow-up required'] * n_follow
np.random.shuffle(fill_values)
df.loc[null_indices, 'Doctor_Notes'] = fill_values

In [None]:
df['Doctor_Notes'].info()

In [None]:
df.drop('Insurance_Status',axis=1,inplace=True)

In [None]:
df.drop('Visit_Date',axis=1,inplace=True)

In [None]:
df['Hospital'].fillna(df['Hospital'].mode()[0], inplace=True)

In [None]:
df['Hospital'].info()

In [None]:
df['Alcohol_Intake'].value_counts()

In [None]:
df.drop('Alcohol_Intake',axis=1,inplace=True)

In [None]:
df['Systolic'].fillna(df['Systolic'].mean(), inplace=True)
df['Diastolic'].fillna(df['Diastolic'].mean(), inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
numerical_features = df.select_dtypes(include=np.number).columns

for feature in numerical_features:
  plt.figure(figsize=(8, 6))
  plt.boxplot(df[feature].dropna())
  plt.title(f"Box Plot of {feature}")
  plt.ylabel(feature)
  plt.show()

In [None]:
df = df[df['Height_cm'] != df['Height_cm'].min()]
df = df[df['Height_cm'] != df['Height_cm'].max()]

In [None]:
df = df[df['Weight_kg'] != df['Weight_kg'].min()]

In [None]:
df = df[df['BMI'] != df['BMI'].max()]

In [None]:
df = df[df['Systolic'] != df['Systolic'].max()]

In [None]:
df = df[df['Diastolic'] != df['Diastolic'].max()]

In [None]:
numerical_features = df.select_dtypes(include=np.number).columns

for feature in numerical_features:
  plt.figure(figsize=(8, 6))
  plt.boxplot(df[feature].dropna())
  plt.title(f"Box Plot of {feature}")
  plt.ylabel(feature)
  plt.show()

In [None]:
dummies = pd.get_dummies(df['Gender'])
df= pd.concat([df, dummies], axis=1)

In [None]:
df.drop('Gender',axis=1,inplace=True)

In [None]:
df['Cholesterol_encoded'], uniques = pd.factorize(df['Cholesterol'])

In [None]:
for col in ['Cholesterol', 'Diabetes', 'Exercise_Freq', 'Medication', 'Diagnosis']:
    df[col + '_encoded'], _ = pd.factorize(df[col])

In [None]:
df.drop('Diagnosis',axis=1,inplace=True)
df.drop('Medication',axis=1,inplace=True)
df.drop('Exercise_Freq',axis=1,inplace=True)
df.drop('Cholesterol',axis=1,inplace=True)
df.drop('Diabetes',axis=1,inplace=True)

In [None]:
for col in ['Doctor_Notes' ,	'Hospital']:
    df[col + '_encoded'], _ = pd.factorize(df[col])

In [None]:
df.drop('Doctor_Notes',axis=1,inplace=True)
df.drop('Hospital',axis=1,inplace=True)

In [None]:
df.head()