In [15]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
import joblib
import pandas as pd

# Column labels to assign to the CSV, in file order; the last one ('target') is
# the column later used as the prediction label.
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'target',
]

# header=None: the file is read as having no header row, so the first line is
# treated as data and the labels above are applied instead.
data = pd.read_csv(
    'cleveland.csv',
    names=column_names,
    header=None,
)

# Report how many values are missing in each column
missing_counts = data.isnull().sum()
print("Missing values in each column:")
print(missing_counts)

# Replace every missing value with the mean of its column.
# Note: the means are computed over the whole dataset, i.e. before the
# train/test split that follows.
# (Alternative strategy: discard incomplete rows with data.dropna() instead.)
column_means = data.mean()
data = data.fillna(column_means)

# Preview the first rows and list the column labels
print(data.head())
print("Column names in the dataset:", data.columns)

# Separate the label column from the feature columns
target_column = 'target'  # must match the label column name in `data`
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the models.
#
# Every estimator whose fitting involves randomness receives a fixed seed, so
# that a fresh "Restart & Run All" reproduces the same fitted (and saved) models:
#   - RandomForestClassifier: bootstrap sampling and per-split feature sampling
#   - DecisionTreeClassifier: features are randomly permuted at each split
#   - SVC(probability=True):  data is shuffled for the probability estimates
# LogisticRegression (default solver), KNeighborsClassifier and GaussianNB are
# left at their defaults.
MODEL_SEED = 42  # same value as the random_state used for the train/test split

logistic_model = LogisticRegression()
knn_model = KNeighborsClassifier()
random_forest_model = RandomForestClassifier(random_state=MODEL_SEED)
# probability=True enables predict_proba, consistent with the other classifiers
svm_model = SVC(probability=True, random_state=MODEL_SEED)
decision_tree_model = DecisionTreeClassifier(random_state=MODEL_SEED)
naive_bayes_model = GaussianNB()

# Fit every classifier, in turn, on the standardized training features
for estimator in (
    logistic_model,
    knn_model,
    random_forest_model,
    svm_model,
    decision_tree_model,
    naive_bayes_model,
):
    estimator.fit(X_train_scaled, y_train)

# Persist each fitted classifier to its own pickle file
model_files = [
    (logistic_model, 'logistic_model.pkl'),
    (knn_model, 'knn_model.pkl'),
    (random_forest_model, 'random_forest_model.pkl'),
    (svm_model, 'svm_model.pkl'),
    (decision_tree_model, 'decision_tree_model.pkl'),
    (naive_bayes_model, 'naive_bayes_model.pkl'),
]
for fitted_model, file_name in model_files:
    joblib.dump(fitted_model, file_name)

# Persist the fitted scaler as well, so the same standardization can be
# re-applied to new inputs before they are passed to a saved model
joblib.dump(scaler, 'scaler.pkl')


Missing values in each column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

    ca  thal  target  
0  0.0   6.0       0  
1  3.0   3.0       2  
2  2.0   7.0       1  
3  0.0   3.0       0  
4  0.0   3.0       0  
Column names in the dataset: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'sl

['scaler.pkl']