In [4]:
import os
import pandas as pd
from natsort import natsorted
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import joblib

# Get the current working directory
current_dir = os.getcwd()

# Define folder names
normal_folder = 'NORMAL'
abnormal_folder = 'AHB'
mi_folder = 'MI'
pmi_folder = 'PM'

# Define relative paths for each folder
normal_path = os.path.join(current_dir, normal_folder)
abnormal_path = os.path.join(current_dir, abnormal_folder)
mi_path = os.path.join(current_dir, mi_folder)
pmi_path = os.path.join(current_dir, pmi_folder)

# Create lists to store file names
NORMAL_ = []
MI_ = []
PMI_ = []
HB_ = []

# Store paths and their corresponding types
Types_ECG = {'normal': normal_path, 'Abnormal_hear_beat': abnormal_path, 'MI': mi_path, 'History_MI': pmi_path}

# Populate lists with file names
for types, folder in Types_ECG.items():
    for files in os.listdir(folder):
        if types == 'normal':
            NORMAL_.append(files)
        elif types == 'Abnormal_hear_beat':
            HB_.append(files)
        elif types == 'MI':
            MI_.append(files)
        elif types == 'History_MI':
            PMI_.append(files)

# Sort file names in natural order
NORMAL_ = natsorted(NORMAL_)
MI_ = natsorted(MI_)
PMI_ = natsorted(PMI_)
HB_ = natsorted(HB_)

# Loop over and create combined csv files for each lead
for x in range(len(MI_)):
    df1 = pd.read_csv(os.path.join(normal_path, NORMAL_[x]))
    df2 = pd.read_csv(os.path.join(abnormal_path, HB_[x]))
    df3 = pd.read_csv(os.path.join(mi_path, MI_[x]))
    df4 = pd.read_csv(os.path.join(pmi_path, PMI_[x]))
    final_df = pd.concat([df1, df2, df3, df4], ignore_index=True)
    final_df.to_csv(os.path.join(current_dir, f'Combined_IDLead_{x + 1}.csv'))

# Now reading just lead1
df = pd.read_csv(os.path.join(current_dir, 'Combined1d_csv', 'Combined_IDLead_1.csv'))
df['Target'].unique()

df.drop(columns=['Unnamed: 0'], inplace=True)

# Convert Target column values as Numeric using ngroups
encode_target_label = df.groupby('Target').ngroup().rename("target").to_frame()
test_final = df.merge(encode_target_label, left_index=True, right_index=True)
test_final.drop(columns=['Target'], inplace=True)

# Perform Dimensionality reduction (PCA) on that Dataframe and check
pca = PCA(n_components=100)
x_pca = pca.fit_transform(test_final.iloc[:, 0:-1])
x_pca = pd.DataFrame(x_pca)

# Calculate the variance explained by principal components
explained_variance = pca.explained_variance_ratio_
print('Variance of each component:', pca.explained_variance_ratio_)
print('\nTotal Variance Explained:', round(sum(list(pca.explained_variance_ratio_)) * 100, 2))

# Store the new pca generated dimensions in a dataframe
pca_df = pd.DataFrame(data=x_pca)
target = pd.Series(test_final['target'], name='target')
result_df = pd.concat([pca_df, target], axis=1)

# K Nearest Neighbors Classifier
steps_knn = [('knn', KNeighborsClassifier())]
pipeline_knn = Pipeline(steps_knn)
k_range = list(range(1, 9))
parameters_knn = dict(knn__n_neighbors=k_range)
X = result_df.iloc[:, :-1]
y = result_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
cv_knn = GridSearchCV(pipeline_knn, parameters_knn, cv=2)
cv_knn.fit(X_train, y_train)
y_pred_knn = cv_knn.predict(X_test)
knn_accuracy = cv_knn.score(X_test, y_test)
print("K Nearest Neighbors Classifier Accuracy:", knn_accuracy)
print(classification_report(y_test, y_pred_knn))
print("Tuned Model Parameters:", cv_knn.best_params_)

# Save the KNN model
joblib.dump(cv_knn, os.path.join(current_dir, 'knn_model.pkl'))

# Logistic Regression
lr_pipeline = Pipeline([('scaler', StandardScaler()),
                        ('lr', LogisticRegression(max_iter=1000))])
param_grid_lr = {'lr__C': [0.001, 0.01, 0.1, 1, 10, 100],
                 'lr__penalty': ['l2']}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
lr_grid = GridSearchCV(lr_pipeline, param_grid_lr, cv=5, n_jobs=-1)
lr_grid.fit(X_train, y_train)
y_pred_lr = lr_grid.predict(X_test)
lr_accuracy = lr_grid.score(X_test, y_test)
print("\nLogistic Regression Model Evaluation:")
print("Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))
print("Best Parameters:", lr_grid.best_params_)

# Save the Logistic Regression model
joblib.dump(lr_grid, os.path.join(current_dir, 'logistic_regression_model.pkl'))

# Support Vector Machine
steps_svm = [('SVM', SVC())]
pipeline_svm = Pipeline(steps_svm)
param_grid_svm = {'SVM__C': [0.1, 1, 10, 100],
                  'SVM__gamma': [1, 0.1, 0.01, 0.001]}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=21)
cv_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=3)
cv_svm.fit(X_train, y_train)
y_pred_svm = cv_svm.predict(X_test)
svm_accuracy = cv_svm.score(X_test, y_test)
print("\nSupport Vector Machine Classifier Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm))
print("Tuned Model Parameters:", cv_svm.best_params_)

# Save the Support Vector Machine model
joblib.dump(cv_svm, os.path.join(current_dir, 'final_test.pkl'))


Variance of each component: [1.76145888e-01 9.50265614e-02 6.99060614e-02 6.15960001e-02
 5.34876630e-02 4.23664893e-02 3.68320213e-02 3.38541791e-02
 3.00884979e-02 2.90396728e-02 2.64962509e-02 2.42272738e-02
 2.10221030e-02 1.99751559e-02 1.77321042e-02 1.63016802e-02
 1.53898622e-02 1.48412074e-02 1.33644825e-02 1.19674074e-02
 1.16813409e-02 1.05807650e-02 9.68875480e-03 9.47385060e-03
 8.65347748e-03 8.47506998e-03 7.93382172e-03 7.30163338e-03
 6.76380665e-03 6.36886390e-03 6.02004791e-03 5.46823032e-03
 5.31229911e-03 4.97821789e-03 4.74686092e-03 4.46081684e-03
 4.21254684e-03 4.01200243e-03 3.87246476e-03 3.52519084e-03
 3.37596894e-03 3.26978336e-03 3.08241145e-03 2.96423495e-03
 2.73419816e-03 2.50965698e-03 2.35335480e-03 2.25665349e-03
 2.20141761e-03 1.96782025e-03 1.74343954e-03 1.70982830e-03
 1.57456047e-03 1.53704487e-03 1.36768435e-03 1.33167096e-03
 1.26444173e-03 1.20053330e-03 1.18738749e-03 1.08864087e-03
 1.02824532e-03 9.11484783e-04 7.89962329e-04 7.59785111e

['C:\\Users\\pawan\\Downloads\\ultron test\\final_test.pkl']

In [5]:
pip install xgboost


Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/24/ec/ad387100fa3cc2b9b81af0829b5ecfe75ec5bb19dd7c19d4fea06fb81802/xgboost-2.0.3-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 653.6 kB/s eta 0:02:33
   ---------------------------------------- 0.2/99.8 MB 2.0 MB/s eta 0:00:51
   ---------------------------------------- 0.9/99.8 MB 6.5 MB/s eta 0:00:16
    --------------------------------------- 1.8/99.8 MB 9.3 MB/s eta 0:00:11
    --------------------------------------- 2.0/99.8 MB 8.7 MB/s eta 0:00:12
    ---------------------------------------

In [6]:
import xgboost as xgb


In [7]:
# XGBoost Classifier
xgb_model = xgb.XGBClassifier()
param_grid_xgb = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.01, 0.1, 0.2],
                  'max_depth': [3, 5, 7]}

cv_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=3)
cv_xgb.fit(X_train, y_train)
y_pred_xgb = cv_xgb.predict(X_test)
xgb_accuracy = cv_xgb.score(X_test, y_test)
print("\nXGBoost Classifier Accuracy:", xgb_accuracy)
print(classification_report(y_test, y_pred_xgb))
print("Tuned Model Parameters:", cv_xgb.best_params_)

# Save the XGBoost model
joblib.dump(cv_xgb, os.path.join(current_dir, 'xgboost_model.pkl'))



XGBoost Classifier Accuracy: 0.8225806451612904
              precision    recall  f1-score   support

           0       0.76      0.70      0.73        93
           1       0.96      1.00      0.98        99
           2       0.79      0.85      0.82       117
           3       0.73      0.68      0.70        63

    accuracy                           0.82       372
   macro avg       0.81      0.81      0.81       372
weighted avg       0.82      0.82      0.82       372

Tuned Model Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


['C:\\Users\\pawan\\Downloads\\ultron test\\xgboost_model.pkl']