In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import duckdb

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

In [3]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [4]:
import lightgbm as lgb
from sklearn.dummy import DummyClassifier

In [5]:
from sklearn.metrics import make_scorer, fbeta_score, confusion_matrix, classification_report

In [6]:
import pickle

In [7]:
import warnings
warnings.filterwarnings("ignore")

# Import data

In [8]:
df = pd.read_csv("./dataset/Training.csv")

diabetes

In [9]:
df_diabetes = duckdb.query("""
select *
, case when prognosis = 'Diabetes' then 1 else 0 end diabetes_alert
from df
""").to_df()

In [10]:
df_diabetes = df_diabetes.drop(columns='prognosis')

typhoid

In [11]:
df_typhoid = duckdb.query("""
select *
, case when prognosis = 'Typhoid' then 1 else 0 end typhoid_alert
from df
""").to_df()

In [12]:
df_typhoid = df_typhoid.drop(columns='prognosis')

# Lasso (Top 5 Feature)

diabetes

In [13]:
X = df_diabetes.drop('diabetes_alert', axis=1)
y = df_diabetes['diabetes_alert']

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_scaled, y)

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso.coef_,
    'Abs_Coefficient': np.abs(lasso.coef_)
})

top_5_features = feature_importance.sort_values(by='Abs_Coefficient', ascending=False).head(5)
top_5_features_diabetes = top_5_features['Feature'].to_list()

In [15]:
top_5_features_diabetes

['itching',
 'skin_rash',
 'nodal_skin_eruptions',
 'continuous_sneezing',
 'shivering']

typhoid

In [16]:
X = df_typhoid.drop('typhoid_alert', axis=1)
y = df_typhoid['typhoid_alert']

In [17]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_scaled, y)

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso.coef_,
    'Abs_Coefficient': np.abs(lasso.coef_)
})

top_5_features = feature_importance.sort_values(by='Abs_Coefficient', ascending=False).head(5)
top_5_features_typhoid = top_5_features['Feature'].to_list()

In [18]:
top_5_features_typhoid

['toxic_look_(typhos)',
 'belly_pain',
 'itching',
 'skin_rash',
 'continuous_sneezing']

# Training

diabetes

In [19]:
top_5_features_diabetes

['itching',
 'skin_rash',
 'nodal_skin_eruptions',
 'continuous_sneezing',
 'shivering']

In [20]:
X = df_diabetes[top_5_features_diabetes]
y = df_diabetes[['diabetes_alert']]

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
lgb_model = lgb.LGBMClassifier(random_state=42)

In [23]:
lgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [20, 31, 40, 50],
    'reg_alpha': [0, 1, 10],
    'reg_lambda': [0, 1, 10]
}

In [24]:
lgb_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=lgb_param_grid,
    n_iter=5,
    scoring='recall',
    cv=3,
    random_state=42,
    n_jobs=-1
)

In [25]:
lgb_search.fit(X_train, Y_train.values)

[LightGBM] [Info] Number of positive: 0, number of negative: 3936
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 3936, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776


In [26]:
lgb_model = lgb_search.best_estimator_

In [27]:
with open('./trained_model/lgb_model_diabetes.pkl', 'wb') as file:
    pickle.dump(lgb_model, file)

typhoid

In [28]:
top_5_features_typhoid

['toxic_look_(typhos)',
 'belly_pain',
 'itching',
 'skin_rash',
 'continuous_sneezing']

In [29]:
X = df_typhoid[top_5_features_typhoid]
y = df_typhoid[['typhoid_alert']]

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
lgb_model = lgb.LGBMClassifier(random_state=42)

In [32]:
lgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [20, 31, 40, 50],
    'reg_alpha': [0, 1, 10],
    'reg_lambda': [0, 1, 10]
}

In [33]:
lgb_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=lgb_param_grid,
    n_iter=5,
    scoring='recall',
    cv=3,
    random_state=42,
    n_jobs=-1
)

In [34]:
lgb_search.fit(X_train, Y_train.values)

[LightGBM] [Info] Number of positive: 101, number of negative: 3835
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 3936, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.025661 -> initscore=-3.636804
[LightGBM] [Info] Start training from score -3.636804


In [35]:
lgb_model = lgb_search.best_estimator_

In [36]:
with open('./trained_model/lgb_model_typhoid.pkl', 'wb') as file:
    pickle.dump(lgb_model, file)

# Test

diabetes

In [37]:
with open('./trained_model/lgb_model_diabetes.pkl', 'rb') as file:
    model_diabetes = pickle.load(file)

In [38]:
array_example = np.array([1,0,1,0,0]).reshape(1, -1)

In [39]:
model_diabetes.predict_proba(array_example)[0][0]

np.float64(0.999999999999999)

In [40]:
model_diabetes.predict_proba(array_example)[0][1]

np.float64(1.0000000036274914e-15)

typhoid

In [41]:
with open('./trained_model/lgb_model_typhoid.pkl', 'rb') as file:
    model_typhoid = pickle.load(file)

In [42]:
array_example = np.array([1,0,1,0,0]).reshape(1, -1)

In [43]:
model_typhoid.predict_proba(array_example)[0][0]

np.float64(0.2253655745655524)

In [44]:
model_typhoid.predict_proba(array_example)[0][1]

np.float64(0.7746344254344476)