<a href="https://colab.research.google.com/github/mikakia/Project-in-HealthCare/blob/FirstStep/Healthcare_Risk_Factors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import  missingno as msno

import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, cross_val_score,cross_val_predict


# Exploring the Dataset

In [157]:
# Read the raw CSV directly from the project's GitHub repository and preview it.
url = (
    "https://raw.githubusercontent.com/mikakia/Project-in-HealthCare/"
    "main/dirty_v3_path.csv"
)
df = pd.read_csv(filepath_or_buffer=url)
df.head()

Unnamed: 0,Age,Gender,Medical Condition,Glucose,Blood Pressure,BMI,Oxygen Saturation,LengthOfStay,Cholesterol,Triglycerides,HbA1c,Smoking,Alcohol,Physical Activity,Diet Score,Family History,Stress Level,Sleep Hours,random_notes,noise_col
0,46.0,Male,Diabetes,137.04,135.27,28.9,96.04,6,231.88,210.56,7.61,0,0,-0.2,3.54,0,5.07,6.05,lorem,-137.057211
1,22.0,Male,Healthy,71.58,113.27,26.29,97.54,2,165.57,129.41,4.91,0,0,8.12,5.9,0,5.87,7.72,ipsum,-11.23061
2,50.0,,Asthma,95.24,,22.53,90.31,2,214.94,165.35,5.6,0,0,5.01,4.65,1,3.09,4.82,ipsum,98.331195
3,57.0,,Obesity,,130.53,38.47,96.6,5,197.71,182.13,6.92,0,0,3.16,3.37,0,3.01,5.33,lorem,44.187175
4,66.0,Female,Hypertension,95.15,178.17,31.12,94.9,4,259.53,115.85,5.98,0,1,3.56,3.4,0,6.38,6.64,lorem,44.831426


In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

In [None]:
# Positional lookup of two individual records (row positions 0 and 1337).
# Alternative slice kept for reference: df.iloc[0:3, 2:4]
df.iloc[[0, 1337]]
# Alternative slice kept for reference: df.iloc[:, [0]]

In [None]:
# Select two columns by their original CSV header names; this cell has to run
# before the columns are renamed to snake_case further down.
df[['Gender', 'BMI']]
# Attribute-style access to a single column also works: df.BMI

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Per-column non-null counts and dtypes for the raw frame.
df.info()

# Preprocessing

## Checking correlation and missing values

In [None]:
# Replace the CSV headers with lowercase snake_case names.
# The assignment is positional, so the order below must match the file's column order.
df.columns = [
    'age',
    'gender',
    'medical_condition',
    'glucose',
    'blood_pressure',
    'bmi',
    'oxygen_saturation',
    'lengthofstay',
    'cholesterol',
    'triglycerides',
    'hba1c',
    'smoking',
    'alcohol',
    'physical_activity',
    'diet_score',
    'family_history',
    'stress_level',
    'sleep_hours',
    'random_notes',
    'noise_col',
]
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

### Check for outliers in noise_col

In [None]:
# Print the extremes of noise_col to see how wide its value range is.
noise = df['noise_col']
print("Min value:", noise.min())
print("Max value:", noise.max())

In [None]:
# Pairwise correlation matrix restricted to the numeric columns.
df.corr(numeric_only=True)

In [None]:
# Preview the non-numeric columns, i.e. the ones that will need encoding later.
df.select_dtypes(exclude=['number']).head()

## Plots

In [None]:
# Side-by-side histograms (one row, two panels) of BMI and age.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (column, panel title, bar colour) for each panel, left to right
hist_specs = [
    ('bmi', 'BMI Distribution', 'darkcyan'),
    ('age', 'Age Distribution', 'orange'),
]
for ax, (column, title, colour) in zip(axes, hist_specs):
    ax.hist(df[column], bins=10, color=colour, edgecolor='black')
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of a few features, coloured by medical condition,
# with histograms on the diagonal.
numeric_cols = ['age', 'smoking', 'blood_pressure']

sns.pairplot(
    df,
    vars=numeric_cols,
    hue='medical_condition',
    diag_kind='hist',
    palette='Set1',
)
plt.show()

## Find and replace missing values

In [None]:
# Print the non-null counts, then display the number of missing values per column.
df.info()
df.isnull().sum()

In [None]:
# Visualise where the missing values sit in the frame (missingno matrix plot).
msno.matrix(df)

In [None]:
# missingno heatmap of the frame's missing values, to see whether gaps in
# different columns tend to occur together.
msno.heatmap(df)
plt.show()


In [None]:
# Skewness decides how each numeric column with gaps is imputed:
#   skew() < 0.5 -> fill with the mean, otherwise fill with the median.
# Decisions taken from the values shown below:
#   age            -> mean
#   glucose        -> median
#   blood_pressure -> mean
# The categorical columns (gender, medical_condition) are filled with the mode.
#
# All three skew values are computed in one expression because a cell only
# displays its last expression; separate statements would hide the first two.
df[['age', 'glucose', 'blood_pressure']].skew()

In [None]:
# Impute the gaps on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Categorical columns: use the most frequent value of each column.
mode_cols = ['gender', 'medical_condition']
for col in mode_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode().iloc[0])

# Numeric columns: mean or median, as decided from the skewness check.
numeric_fill_values = {
    'age': df_clean['age'].mean(),
    'glucose': df_clean['glucose'].median(),
    'blood_pressure': df_clean['blood_pressure'].mean(),
}
df_clean = df_clean.fillna(value=numeric_fill_values)

In [None]:
# Count the remaining missing values per column after imputation.
print(df_clean.isna().sum())

## Convert categorical values to numerical by labeling

In [None]:
# Replace the gender strings with integer codes.
gender_encoder = LabelEncoder()
df_clean['gender'] = gender_encoder.fit_transform(df_clean['gender'])
df_clean.head()


In [None]:
# Column dtypes after imputation and gender encoding.
df_clean.dtypes

In [None]:
# Distinct values present in random_notes.
df_clean['random_notes'].unique()

In [None]:
# Distinct class names of the target column before it is label-encoded.
df_clean['medical_condition'].unique()

In [None]:

# Label-encode the remaining string columns, each with its own fresh encoder.
for col in ('medical_condition', 'random_notes'):
    df_clean[col] = LabelEncoder().fit_transform(df_clean[col])
df_clean.head()


In [None]:
# Peek at the first few family_history values.
df_clean['family_history'].head()

## Standardize the numeric values

In [None]:
# Standardization: rescale the continuous columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test rows contribute to the scaling statistics.
scale_cols = [
    'age', 'glucose', 'blood_pressure', 'bmi', 'oxygen_saturation',
    'cholesterol', 'triglycerides', 'hba1c', 'diet_score',
    'stress_level', 'sleep_hours', 'noise_col',
]

scaler = StandardScaler()
standard_scaled_data = scaler.fit_transform(df_clean[scale_cols])

df_scaled = pd.DataFrame(standard_scaled_data, columns=scale_cols)
df_scaled

In [None]:
# Re-applies the same snake_case column names that were assigned to `df` earlier.
df.columns = [
    'age', 'gender', 'medical_condition', 'glucose', 'blood_pressure',
    'bmi', 'oxygen_saturation', 'lengthofstay', 'cholesterol', 'triglycerides',
    'hba1c', 'smoking', 'alcohol', 'physical_activity', 'diet_score',
    'family_history', 'stress_level', 'sleep_hours', 'random_notes', 'noise_col',
]

In [None]:
# Build the modelling frame: start from the cleaned data and overwrite the
# continuous columns with their standardized versions (aligned on the row index).
scaled_names = [
    'age', 'glucose', 'blood_pressure', 'bmi', 'oxygen_saturation',
    'cholesterol', 'triglycerides', 'hba1c', 'diet_score',
    'stress_level', 'sleep_hours', 'noise_col',
]

df_complete = df_clean.copy()
df_complete[scaled_names] = df_scaled[scaled_names]

df_complete.head()


In [None]:
# Dimensions of the combined (scaled + unscaled columns) frame.
df_complete.shape

# Training and Test Phase

In [None]:
# Feature matrix and target taken from the scaled frame.
X = df_complete.drop('medical_condition', axis=1)  # every column except the target (noise_col is still included)
y = df_complete['medical_condition']               # target: label-encoded medical condition

In [None]:
# 80/20 split of the scaled data, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Same feature/target separation, taken from the cleaned frame without scaling.
y_notscaled = df_clean['medical_condition']
X_notscaled = df_clean.drop(columns='medical_condition')

In [None]:
# 80/20 stratified split of the unscaled data, using the same seed as the scaled split.
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(
    X_notscaled,
    y_notscaled,
    test_size=0.2,
    random_state=42,
    stratify=y_notscaled,
)

## SVM



### SVC with rbf kernel, removing one column
Accuracy: 82%

In [None]:
# Variant of the scaled frame without noise_col (dropped because of its outliers).
df_rm1c = df_complete.drop(columns=['noise_col'])
df_rm1c.head()

In [None]:
# Features and target for the frame that no longer contains noise_col.
y1 = df_rm1c['medical_condition']
X1 = df_rm1c.drop(columns='medical_condition')

In [None]:
# 80/20 stratified split of the data without noise_col.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
    stratify=y1,
)


In [None]:
# RBF-kernel SVC trained on the features without noise_col.
svm_model_1 = SVC(
    kernel='rbf',
    C=1.0,
    probability=True,
    random_state=42,
)
svm_model_1.fit(X_train_1, y_train_1)

In [None]:
# Evaluate the noise_col-free SVC on its held-out test split.
y_pred1s = svm_model_1.predict(X_test_1)
report_1 = classification_report(y_test_1, y_pred1s)

print("Classification Report:")
print(report_1)

### SVC with rbf kernel
Accuracy: 82%

In [None]:
# RBF-kernel SVC trained on the full scaled feature set (noise_col included).
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    probability=True,
    random_state=42,
)
svm_model.fit(X_train, y_train)

In [None]:
# Predict on the scaled test split and print the per-class metrics.
y_pred = svm_model.predict(X_test)
report_rbf = classification_report(y_test, y_pred)
print("Classification Report:")
print(report_rbf)

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test split itself.
pca = PCA(n_components=2)
X_test_2d = pca.fit_transform(X_test)

# One scatter series per true class, so each class gets its own colour and legend entry.
plt.figure(figsize=(8,6))
for class_label in np.unique(y_test):
    in_class = y_test==class_label
    plt.scatter(
        X_test_2d[in_class, 0],
        X_test_2d[in_class, 1],
        label=f"Class {class_label}",
        alpha=0.6,
    )

plt.title("SVM Test Samples")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.show()


In [None]:
# Map the fitted SVC's support vectors into the same 2-D PCA space and
# draw them as hollow black circles.
sv = svm_model.support_vectors_
sv_2d = pca.transform(sv)
plt.scatter(
    sv_2d[:,0],
    sv_2d[:,1],
    s=100,
    facecolors='none',
    edgecolors='k',
    label='Support Vectors',
)
plt.legend()
plt.show()


In [None]:
# Confusion matrix of the RBF SVC on the test split, labelled with the encoded class ids.
cm = confusion_matrix(y_test, y_pred)
class_ids = np.unique(y_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_ids)

disp.plot(cmap='PiYG')
plt.title("Confusion Matrix")
plt.show()


In [None]:
# Print the first 25 test samples: true encoded label next to the SVC prediction.
# Both sequences are sliced with the same count; zip() stops at the shorter
# input, so mismatched slice lengths would silently drop rows.
n_show = 25
for true, pred in zip(y_test[:n_show], y_pred[:n_show]):
    print(f"True: {true} → Predicted: {pred}")


In [None]:
# Rebuild the mapping from integer class codes back to the original condition names.
# The encoder is fitted on the raw labels with missing entries dropped, so that
# a missing value cannot become a class of its own. Mode imputation adds no new
# class names, so this matches the encoding applied to df_clean.
# The result is deliberately NOT assigned to `y`: that name holds the modelling
# target and is still needed by later cells (e.g. the k-fold cross-validation).
original_labels = df['medical_condition'].dropna()

le = LabelEncoder()
le.fit(original_labels)

# Position i in classes_ is the name behind encoded label i.
print(le.classes_)




In [None]:
# Translate encoded labels back to condition names with the fitted encoder `le`.
y_pred_named = le.inverse_transform(y_pred)
y_test_named = le.inverse_transform(y_test)

# Print the first 30 actual/predicted pairs. Both arrays are sliced with the
# same count; zip() stops at the shorter input, so unequal slices would
# silently show fewer rows than intended.
n_show = 30
for true, pred in zip(y_test_named[:n_show], y_pred_named[:n_show]):
    print(f"Actual: {true} → Predicted: {pred}")


### SVC with linear kernel
Accuracy: 79%

In [None]:
# Linear-kernel SVC on the scaled training split.
svm_model2 = SVC(
    kernel='linear',
    C=1.0,
    probability=True,
    random_state=42,
)
svm_model2.fit(X_train, y_train)

In [None]:
# Evaluate the linear-kernel SVC on the scaled test split.
y_pred2 = svm_model2.predict(X_test)
report_linear = classification_report(y_test, y_pred2)

print("Classification Report:")
print(report_linear)

### SVC with poly kernel

Accuracy: 80%

In [None]:
# Polynomial-kernel SVC on the scaled training split.
svm_model3 = SVC(
    kernel='poly',
    C=1.0,
    probability=True,
    random_state=42,
)
svm_model3.fit(X_train, y_train)

In [None]:
# Evaluate the polynomial-kernel SVC on the scaled test split.
y_pred3 = svm_model3.predict(X_test)
report_poly = classification_report(y_test, y_pred3)

print("Classification Report:")
print(report_poly)

### NuSVC
Accuracy: 51%

In [None]:
# Nu-SVC with a polynomial kernel on the scaled training split.
# random_state is fixed, as in the SVC cells, so that the probability
# estimates enabled by probability=True are reproducible across runs.
model4 = NuSVC(nu=0.05, kernel='poly', probability=True, random_state=42)
model4.fit(X_train, y_train)

In [None]:
# Evaluate the Nu-SVC on the scaled test split.
y_pred4 = model4.predict(X_test)
report_nu = classification_report(y_test, y_pred4)

print("Classification Report:")
print(report_nu)

### LinearSVC

#### hinge
Accuracy: 78%

In [None]:
# LinearSVC with the hinge loss on the scaled training split.
# random_state is fixed because the dual coordinate-descent solver shuffles
# the data; without a seed the fitted model can differ from run to run.
model_linearSVC = LinearSVC(loss='hinge', C=1.2, max_iter=2000, random_state=42)
model_linearSVC.fit(X_train, y_train)

In [None]:
# Evaluate the hinge-loss LinearSVC on the scaled test split.
y_pred5 = model_linearSVC.predict(X_test)
report_hinge = classification_report(y_test, y_pred5)

print("Classification Report:")
print(report_hinge)

#### squared_hinge

Accuracy: 78%

In [None]:
# LinearSVC with the squared hinge loss on the scaled training split.
# random_state is fixed so the fit is reproducible whenever the solver
# shuffles the data (dual formulation).
model_linearSVC2 = LinearSVC(loss='squared_hinge', C=3, max_iter=1000, random_state=42)
model_linearSVC2.fit(X_train, y_train)

In [None]:
# Evaluate the squared-hinge LinearSVC: overall accuracy, then per-class metrics.
y_pred6 = model_linearSVC2.predict(X_test)
accuracy_sq_hinge = accuracy_score(y_test, y_pred6)
report_sq_hinge = classification_report(y_test, y_pred6)

print("Accuracy:", accuracy_sq_hinge)
print("Classification Report:")
print(report_sq_hinge)

## Logistic Regression
Accuracy: 78%

In [None]:
# Logistic regression on the scaled training split; max_iter is raised to 1000.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X_train, y_train)

In [None]:
# Evaluate logistic regression: accuracy, per-class metrics and raw confusion matrix.
y_pred_log = log_reg.predict(X_test)

accuracy_log = accuracy_score(y_test, y_pred_log)
report_log = classification_report(y_test, y_pred_log)
matrix_log = confusion_matrix(y_test, y_pred_log)

print("Accuracy:", accuracy_log)
print("\nClassification Report:\n", report_log)
print("\nConfusion Matrix:\n", matrix_log)

## Random Forest
Accuracy: 82%

Changing `n_estimators` did not significantly change the accuracy.

In [None]:
# Random forest trained on the unscaled training split.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
)
rf_model.fit(X_train_ns, y_train_ns)

In [None]:
# Evaluate the random forest on the unscaled test split.
y_pred_rf = rf_model.predict(X_test_ns)

accuracy_rf = accuracy_score(y_test_ns, y_pred_rf)
report_rf = classification_report(y_test_ns, y_pred_rf)

print("Accuracy:", accuracy_rf)
print("\nClassification Report:\n", report_rf)

In [None]:
# Confusion matrix of the random forest, labelled with the classes the model was fitted on.
cm = confusion_matrix(y_test_ns, y_pred_rf)
rf_class_ids = rf_model.classes_

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_class_ids)
disp.plot(cmap='BuPu_r')
plt.title("Random Forest Confusion Matrix")
plt.show()

## GradientBoostingClassifier

Accuracy: 81%

In [None]:
# Gradient boosting trained on the unscaled training split.
gb_model = GradientBoostingClassifier(
    n_estimators=60,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb_model.fit(X_train_ns, y_train_ns)

In [None]:
# Evaluate gradient boosting on the unscaled test split.
y_pred_gb = gb_model.predict(X_test_ns)

accuracy_gb = accuracy_score(y_test_ns, y_pred_gb)
report_gb = classification_report(y_test_ns, y_pred_gb)

print("Accuracy:", accuracy_gb)
print("Classification Report:\n", report_gb)

In [None]:
# Annotated heatmap of the gradient-boosting confusion matrix (integer counts).
cm = confusion_matrix(y_test_ns, y_pred_gb)
sns.heatmap(
    data=cm,
    annot=True,
    fmt='d',
    cmap='Blues',
)
plt.title("Gradient Boosting Confusion Matrix")
plt.show()

## XGBoost Classifier
Accuracy: 81%

In [None]:
# XGBoost classifier trained on the unscaled training split, with
# multi-class log-loss as the evaluation metric.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    eval_metric='mlogloss',
    random_state=42,
)

xgb_model.fit(X_train_ns, y_train_ns)

In [None]:
# Evaluate XGBoost on the unscaled test split.
y_predxgb = xgb_model.predict(X_test_ns)

accuracy_xgb = accuracy_score(y_test_ns, y_predxgb)
report_xgb = classification_report(y_test_ns, y_predxgb)

print("Accuracy:", accuracy_xgb)
print("Classification Report:\n", report_xgb)

In [None]:
# Annotated heatmap of the XGBoost confusion matrix (integer counts).
# The predictions were made on X_test_ns, so they are scored against the
# labels of that same split (y_test_ns), not the labels of the scaled split.
cm = confusion_matrix(y_test_ns, y_predxgb)
sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges')
plt.title("XGBoost Confusion Matrix")
plt.show()

## Neural Network
Accuracy: 82%

In [None]:

# Multi-layer perceptron with two hidden layers (16 and 8 units) on the scaled training split.
nn_model = MLPClassifier(
    hidden_layer_sizes=(16,8),
    activation='relu',
    max_iter=500,
    random_state=42,
)
nn_model.fit(X_train, y_train)

In [None]:
# Evaluate the neural network on the scaled test split.
y_pred_nn = nn_model.predict(X_test)

accuracy_nn = accuracy_score(y_test, y_pred_nn)
report_nn = classification_report(y_test, y_pred_nn)

print("Accuracy:", accuracy_nn)
print("Classification Report:\n", report_nn)

## KNN
Accuracy: 80%

In [None]:
# k-nearest-neighbours classifier (k=15, Minkowski metric) on the scaled training split.
knn_model = KNeighborsClassifier(
    n_neighbors=15,
    metric='minkowski',
)
knn_model.fit(X_train, y_train)

In [None]:
# Evaluate KNN: accuracy, per-class metrics and raw confusion matrix.
y_pred_knn = knn_model.predict(X_test)

accuracy_knn = accuracy_score(y_test, y_pred_knn)
report_knn = classification_report(y_test, y_pred_knn)
matrix_knn = confusion_matrix(y_test, y_pred_knn)

print("Accuracy:", accuracy_knn)
print("Classification Report:\n", report_knn)
print("Confusion Matric\n",matrix_knn)

### KNN with k folds
Accuracy: 80%

In [None]:
# 8-fold splitter with shuffling and a fixed seed.
kf = KFold(
    n_splits=8,
    shuffle=True,
    random_state=42,
)

In [None]:
# Cross-validated accuracy of the KNN model over the whole scaled dataset,
# using the 8-fold splitter defined above.
scores = cross_val_score(
    estimator=knn_model,
    X=X,
    y=y,
    cv=kf,
    scoring='accuracy',
)

mean_accuracy = scores.mean()
print("Accuracy for each fold:", scores)
print("Mean accuracy:", mean_accuracy)