In [225]:
import pandas as pd
import numpy as np

In [226]:
df = pd.read_excel("blood test data last.xlsx")

In [227]:
df.head()

Unnamed: 0,Patient ID,Age,Gender,BMI,BMI Category,HbA1c,HbA1c Category,Systolic BP,Diastolic BP,Cholesterol (mg/dL),HDL,LDL,Triglyceride
0,1,69,Male,24.2,Normal Weight,6.6,Diabetic,109,91,277.0,50.0,128.0,169.0
1,2,32,Female,36.1,Obese,6.4,PreDiabetic,137,74,231.0,48.0,102.0,152.0
2,3,78,Female,21.1,Normal Weight,7.5,Diabetic,172,103,265.0,54.0,120.0,168.0
3,4,38,Female,35.2,Obese,5.3,Normal,176,76,241.0,59.0,102.0,178.0
4,5,41,Male,44.1,Morbidly Obese,7.5,Diabetic,113,100,136.0,60.0,119.0,184.0


In [228]:
import numpy as np

def extractNumericColumn(df):
    numerical_columns = []
    for col in df.columns:
        if "Category" in col or "ID" in col:
            continue

        # the type of the first element
        col_type = type(df[col].iloc[0])

        # check if the type is int or float
        if col_type in [np.int64, np.float64]:
            numerical_columns.append(col)
    return numerical_columns


In [229]:
def extractCategoricalColumn(df):
    categorical_columns = []
    for col in df.columns:
      if "Category" in col or "ID" in col:
        continue
      # the type of the first element
      col_type = type(df[col].iloc[0])

      # check if the type is string
      if col_type in [str]:
          categorical_columns.append(col)
    return categorical_columns

In [230]:
import statistics
def handleMissingNumericalValues(df):
  for col in df.columns:
    if df[col].isna().any():
      print(f"column name is: {col}")
      # Fill the missing values with the mean of the column
      # Convert the column to numeric, coercing errors to NaN
      df[col] = pd.to_numeric(df[col], errors='coerce')
      # Calculate the mean of the column, ignoring NaNs
      mean_value = np.nanmean(df[col])
      df[col] = df[col].fillna(mean_value)
      print(f"Now filled the values with {df[col].mean()}")
  return df

def handleMissingCategoricalValues(df):
  for col in df.columns:
    if df[col].isna().any():
      # Fill the missing values with the most common category used in that column
      most_common = df[col].mode().iloc[0]  # Use .mode() correctly
      df[col] = df[col].fillna(most_common)
    return df

In [231]:
numerical_columns = extractNumericColumn(df)
categorical_columns = extractCategoricalColumn(df)

print(numerical_columns)
print(categorical_columns)

print(df.head())

df[numerical_columns] = handleMissingNumericalValues(df[numerical_columns])
df[categorical_columns] = handleMissingCategoricalValues(df[categorical_columns])

X = df[numerical_columns + categorical_columns]
y = df["BMI Category"] + " " + df["HbA1c Category"]

['Age', 'BMI', 'HbA1c', 'Systolic BP', 'Diastolic BP', 'Cholesterol (mg/dL)', 'HDL', 'LDL ', 'Triglyceride']
['Gender']
   Patient ID  Age  Gender   BMI    BMI Category  HbA1c HbA1c Category  \
0           1   69    Male  24.2   Normal Weight    6.6       Diabetic   
1           2   32  Female  36.1           Obese    6.4    PreDiabetic   
2           3   78  Female  21.1   Normal Weight    7.5       Diabetic   
3           4   38  Female  35.2           Obese    5.3         Normal   
4           5   41    Male  44.1  Morbidly Obese    7.5       Diabetic   

   Systolic BP  Diastolic BP  Cholesterol (mg/dL)   HDL   LDL   Triglyceride  
0          109            91                277.0  50.0  128.0         169.0  
1          137            74                231.0  48.0  102.0         152.0  
2          172           103                265.0  54.0  120.0         168.0  
3          176            76                241.0  59.0  102.0         178.0  
4          113           100            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(mean_value)


In [232]:
from sklearn.preprocessing import normalize, StandardScaler

def processNumerical(numerical_data, type="normalization"):
    # copy the df to protect original data
    numerical_data = numerical_data.copy()

    # normalization is to normalize(min-max, log, mean) the data, for not specially known distributed data or to use in distance based algorithms
    if type == "normalization":
        for col in numerical_data.columns:
            # The whole columns is being transformed in once
            normalized_col = normalize(numerical_data[[col]], norm="l2", axis=0)
            numerical_data[col] = normalized_col.flatten()
    # standardization is to standardize the data, to use in SVM like models or PCA(dimension reduction) algorithms
    elif type == "standardization":
        scaler = StandardScaler()
        for col in numerical_data.columns:
            # The whole columns is being transformed in once
            standardized_col = scaler.fit_transform(numerical_data[[col]])
            numerical_data[col] = standardized_col.flatten()

    return numerical_data


def processCategorical(categorical_data):
    # copy the df to protect original data
    categorical_data = categorical_data.copy()
    for col in categorical_data.columns:
      # Since we have only 2 label male and female we can use label encoder
      from sklearn.preprocessing import LabelEncoder
      le = LabelEncoder()
      categorical_data[col] = le.fit_transform(categorical_data[col])
    return categorical_data


In [233]:
X[numerical_columns] = processNumerical(X[numerical_columns], type="normalization")
X[categorical_columns] = processCategorical(X[categorical_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_columns] = processNumerical(X[numerical_columns], type="normalization")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[categorical_columns] = processCategorical(X[categorical_columns])


In [234]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(pd.Series(y))

In [235]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [236]:
X_train.head()

Unnamed: 0,Age,BMI,HbA1c,Systolic BP,Diastolic BP,Cholesterol (mg/dL),HDL,LDL,Triglyceride,Gender
238,0.025129,0.038571,0.039622,0.021019,0.027112,0.024912,0.034194,0.034903,0.032886,0
533,0.028556,0.015747,0.030348,0.034806,0.037596,0.037728,0.031141,0.031413,0.029864,1
430,0.0257,0.016246,0.03077,0.022375,0.028558,0.03528,0.036026,0.034635,0.03502,1
977,0.047974,0.015747,0.027398,0.030285,0.031089,0.02664,0.025646,0.032755,0.035197,0
301,0.027985,0.020731,0.031613,0.023053,0.022413,0.029088,0.034194,0.031681,0.033064,0


In [237]:
y_train

array([ 7, 15, 15, 15,  3,  7, 13, 17,  0, 15, 11, 15,  7, 11,  0, 16,  4,
        0,  8,  8,  2, 15,  7, 17, 13,  8,  4, 15, 12, 12,  8,  1, 17,  8,
       15,  3,  7,  0,  7, 10,  0, 10,  2, 12,  3, 17, 11,  0,  1,  0,  3,
       15,  7,  7,  3, 15, 12, 11,  7, 10,  7,  0, 15,  8,  0,  7,  5,  0,
        7,  4,  7, 16,  0,  7, 15,  0, 15,  7,  4,  8,  8, 11,  8,  0, 11,
       10, 13,  6,  5, 11,  8,  7,  7,  7, 15,  4,  3, 15, 12,  0,  0,  7,
        8, 15,  0,  3,  8, 11, 15, 11,  0, 11, 11, 11, 11,  3, 11, 11, 15,
       15,  7,  7, 10,  3,  7,  0,  7, 16, 12,  0,  8,  3, 15, 16,  4, 17,
       10,  4,  4,  3,  1, 10,  3,  1, 11,  3, 13, 11, 16,  5, 15, 11,  5,
        7,  4,  0,  7,  8,  7,  7,  7,  7,  7,  7, 13,  0, 13,  7, 11, 15,
        8,  7,  1,  0, 15,  7,  0, 13,  3,  7,  3,  1,  3,  7, 15,  1, 11,
        7,  7,  7, 10,  3,  7,  8,  1, 16, 10,  0,  7, 11, 15, 11,  7, 16,
       15,  7,  7,  1,  5,  3, 15, 11,  7, 10,  3, 10, 15,  0,  8, 11,  7,
        8,  3,  3,  7, 12

In [238]:
def learn(model, X_train, y_train):
  if model == "svm":
    from sklearn.svm import SVC
    model = SVC()
    model.fit(X_train, y_train)

  elif model == "knn":
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)

  elif model == "decision_tree":
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

  elif model == "random_forest":
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

  elif model == "naive_bayes":
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X_train, y_train)

  elif model == "xgboost":
    from xgboost import XGBClassifier
    model = XGBClassifier()
    model.fit(X_train, y_train)
  return model

In [239]:
model = learn("xgboost", X_train, y_train)

In [240]:
def evaluate(model, X_test, y_test):
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted')
  recall = recall_score(y_test, y_pred, average='weighted')
  f1 = f1_score(y_test, y_pred, average='weighted')
  return accuracy, precision, recall, f1

In [241]:
accuracy, precision, recall, f1 = evaluate(model, X_test, y_test)

In [242]:
print("------------------------------")
print(f"Accuracy: {accuracy}")
print("------------------------------")
print(f"Precision: {precision}")
print("------------------------------")
print(f"Recall: {recall}")
print("------------------------------")
print(f"F1 Score: {f1}")
print("------------------------------")

------------------------------
Accuracy: 0.9565217391304348
------------------------------
Precision: 0.9620681093400617
------------------------------
Recall: 0.9565217391304348
------------------------------
F1 Score: 0.9556108040593884
------------------------------


In [243]:
def compare_models(X_train, y_train, X_test, y_test):
  models = ["svm", "knn", "decision_tree", "random_forest", "naive_bayes", "xgboost"]
  results = []
  for model in models:
    model = learn(model, X_train, y_train)
    accuracy, precision, recall, f1 = evaluate(model, X_test, y_test)
    results.append((model, accuracy, precision, recall, f1))
  return results

In [244]:
results = compare_models(X_train, y_train, X_test, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [245]:
for result in results:
  print("Model:" + str(result[0]).split("(")[0])
  print(f"Accuracy: {result[1]}")
  print(f"Precision: {result[2]}")
  print(f"Recall: {result[3]}")
  print(f"F1 Score: {result[4]}")
  print("------------------------------")

Model:SVC
Accuracy: 0.21739130434782608
Precision: 0.04725897920604915
Recall: 0.21739130434782608
F1 Score: 0.07763975155279504
------------------------------
Model:KNeighborsClassifier
Accuracy: 0.5072463768115942
Precision: 0.47933453075881866
Recall: 0.5072463768115942
F1 Score: 0.4770575490303869
------------------------------
Model:DecisionTreeClassifier
Accuracy: 0.9323671497584541
Precision: 0.9379846401585533
Recall: 0.9323671497584541
F1 Score: 0.934793310273872
------------------------------
Model:RandomForestClassifier
Accuracy: 0.9565217391304348
Precision: 0.9602913189869713
Recall: 0.9565217391304348
F1 Score: 0.9555924471803489
------------------------------
Model:GaussianNB
Accuracy: 0.8405797101449275
Precision: 0.8549172656876699
Recall: 0.8405797101449275
F1 Score: 0.8342248240871298
------------------------------
Model:XGBClassifier
Accuracy: 0.9565217391304348
Precision: 0.9620681093400617
Recall: 0.9565217391304348
F1 Score: 0.9556108040593884
-------------------