In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = 'data.csv'
heart_data = pd.read_csv(file_path)

# Drop 'id' and 'dataset' columns as they are not useful for prediction.
# Reassigning (instead of inplace=True) keeps the step chainable and explicit.
heart_data = heart_data.drop(columns=['id', 'dataset'])

# Column groups used by the preprocessing steps below
label_cols = ['sex', 'cp', 'restecg', 'exang', 'slope', 'thal', 'fbs']
num_cols = ['trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
# Every label-encoded column is treated as categorical for imputation
cat_cols = list(label_cols)

# Handle missing values BEFORE encoding.
# Casting to str first would turn NaN into the literal string 'nan', which the
# encoder would then treat as a real category and the imputer would never see.
imputer_median = SimpleImputer(strategy='median')
imputer_mode = SimpleImputer(strategy='most_frequent')

# Impute median for numerical columns
heart_data[num_cols] = imputer_median.fit_transform(heart_data[num_cols])

# Impute most frequent value for categorical columns (still in their raw form)
heart_data[cat_cols] = imputer_mode.fit_transform(heart_data[cat_cols])

# Handle categorical columns with Label Encoding.
# One encoder per column is kept so the code -> label mapping can be inspected
# or inverted later via label_encoders[col].classes_ / inverse_transform.
label_encoders = {}
for col in label_cols:
    label_encoder = LabelEncoder()
    # Cast to str so mixed-type columns (e.g. booleans read as objects) encode consistently
    heart_data[col] = label_encoder.fit_transform(heart_data[col].astype(str))
    label_encoders[col] = label_encoder

# Separate the features from the target column
X = heart_data.drop(columns=['num'])
y = heart_data['num']

# Sanity check: remaining missing values per feature (expected to be all zeros
# for the imputed columns)
X.isna().sum()


In [None]:
# Split the data into training and testing sets FIRST.
# Resampling before the split leaks information: synthetic points interpolated
# from rows that end up in the test set would be used for training, and the test
# set itself would contain synthetic rows, inflating the reported scores.
# stratify=y keeps the class proportions the same in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes on the training fold only.
# NOTE(review): SMOTE interpolates between neighbours, so label-encoded
# categorical features can receive fractional codes; consider SMOTENC if that
# matters. SMOTE also needs more training rows per class than its k_neighbors
# setting - confirm the rarest class is large enough.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train the RandomForestClassifier on the balanced training data
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)

# Make predictions on the untouched (real, non-synthetic) test fold
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


In [None]:
import numpy as np

# Function to take user input for prediction
def predict_heart_disease():
    """Interactively collect one patient's feature values and print the prediction.

    Prompts on standard input for each feature, in the same order the values are
    passed to the model, then calls the module-level ``clf`` and prints either
    "No heart disease" (predicted class 0) or the predicted severity level.

    A reply that cannot be parsed as the expected number is rejected and the
    same question is asked again instead of aborting with ``ValueError``.

    Returns:
        None. The result is printed.
    """

    def _ask(prompt, cast):
        """Prompt until the reply can be converted with ``cast`` (int or float)."""
        while True:
            raw = input(prompt)
            try:
                return cast(raw)
            except ValueError:
                print(f"Invalid value {raw!r}; please enter a number.")

    print("Please enter the following values to predict heart disease:")

    # Collecting inputs for each feature
    age = _ask("Age: ", float)
    sex = _ask("Sex (0 = female, 1 = male): ", int)
    cp = _ask("Chest Pain Type (0, 1, 2, or 3): ", int)
    trestbps = _ask("Resting Blood Pressure: ", float)
    chol = _ask("Serum Cholesterol: ", float)
    fbs = _ask("Fasting Blood Sugar (1 = true, 0 = false): ", int)
    restecg = _ask("Resting Electrocardiographic Results (0, 1, or 2): ", int)
    thalch = _ask("Max Heart Rate Achieved: ", float)
    exang = _ask("Exercise Induced Angina (1 = yes, 0 = no): ", int)
    oldpeak = _ask("ST Depression: ", float)
    slope = _ask("Slope of the Peak Exercise ST Segment (0, 1, or 2): ", int)
    ca = _ask("Number of Major Vessels Colored by Fluoroscopy (0-3): ", float)
    thal = _ask("Thalassemia (0, 1, 2, or 3): ", int)

    values = [age, sex, cp, trestbps, chol, fbs, restecg, thalch, exang, oldpeak, slope, ca, thal]

    # When the model recorded its training column names, pass a DataFrame with
    # those names so scikit-learn can validate them (and does not warn about
    # missing feature names). Otherwise fall back to a plain array.
    # NOTE(review): this assumes the prompt order above matches the training
    # column order - confirm against the training frame.
    feature_names = getattr(clf, "feature_names_in_", None)
    if feature_names is not None and len(feature_names) == len(values):
        input_data = pd.DataFrame([values], columns=list(feature_names))
    else:
        input_data = np.array([values])

    # Make a prediction using the trained model
    prediction = clf.predict(input_data)

    # Output the result
    if prediction[0] == 0:
        print("Prediction: No heart disease")
    else:
        print(f"Prediction: Heart disease severity level {prediction[0]}")

# Example call to the prediction function.
# NOTE: the function reads its values with input(), so this cell waits for
# interactive answers and will stall an unattended "Run All".
predict_heart_disease()


In [None]:
import joblib

# Persist the fitted classifier to disk so it can be reloaded without retraining
model_filename = 'heart_disease_model.pkl'
joblib.dump(value=clf, filename=model_filename)

# Report where the model was written (nothing is downloaded here)
print(f"Model saved as {model_filename}")


In [None]:
# Peek at five randomly chosen rows of the processed frame
heart_data.sample(n=5)

In [None]:
# Frequency of each distinct value in the 'slope' column
heart_data.loc[:, 'slope'].value_counts()

In [None]:
# Peek at three randomly chosen rows of the feature matrix
X.sample(n=3)

In [None]:
# prompt: create a sample from X with 2 rows for every class and get their y

# Rows to draw for each target class
ROWS_PER_CLASS = 2
# Seeded generator so the same rows are drawn on every run
sample_rng = np.random.default_rng(42)

# Lists that collect the sampled feature rows and their target values
sampled_rows = []
sampled_y = []

# Iterate through the unique classes in y
for class_label in y.unique():
    # Index labels of the rows belonging to the current class
    class_indices = y.index[y == class_label]

    # Draw ROWS_PER_CLASS distinct rows, or every row when the class is smaller than that
    n_take = min(ROWS_PER_CLASS, len(class_indices))
    sampled_indices = sample_rng.choice(class_indices, size=n_take, replace=False)

    # Append the sampled rows and their target values to the lists
    for index in sampled_indices:
        sampled_rows.append(X.loc[index].tolist())
        sampled_y.append(y.loc[index])

# Convert the list of sampled rows to a DataFrame (fresh 0..n-1 index)
sampled_X = pd.DataFrame(sampled_rows, columns=X.columns)

print(sampled_y)



In [None]:
# Display the sampled feature rows as this cell's output
sampled_X

In [None]:
# prompt: I want to put sampled_X and sampled_y in a DataFrame
# assign() returns a new frame, so sampled_X itself never gains a 'num' column
# (wrapping it with pd.DataFrame(...) and inserting a column can alias the
# original on some pandas versions).
sampled_df = sampled_X.assign(num=sampled_y)
print(sampled_df)


In [None]:
# prompt: concatenate sampled_X and sampled_y into a DataFrame

# Wrap the labels in a Series named 'num' and join it column-wise onto the features
target_column = pd.Series(sampled_y, name='num')
sampled_df = pd.concat([sampled_X, target_column], axis=1)
sampled_df