 # Machine learning (ML) changes how computers solve problems. Traditional systems rely on predefined rules programmed by humans. This approach struggles with complexity and does not adapt to new information. In contrast, ML enables computers to learn patterns directly from data, much as humans learn from experience.

## Traditional Code


In [1]:
def heart_disease_risk_rule_based(age, overweight, diabetic):
    """
    Assesses heart disease risk based on a set of predefined rules.

    The rules are checked in order and the first match wins:
        1. age > 50, overweight and diabetic      -> "High Risk"
        2. age > 60, overweight or diabetic       -> "High Risk"
        3. age > 40, overweight and not diabetic  -> "Moderate Risk"
        4. anything else                          -> "Low Risk"

    Args:
        age: Age of the individual (int).
        overweight: True if overweight, False otherwise (bool).
        diabetic: True if diabetic, False otherwise (bool).

    Returns:
        "High Risk", "Moderate Risk" or "Low Risk" (str).
    """
    # Both high-risk rules share one outcome, so they are evaluated together.
    if (age > 50 and overweight and diabetic) or (age > 60 and (overweight or diabetic)):
        return "High Risk"
    if age > 40 and overweight and not diabetic:
        return "Moderate Risk"
    return "Low Risk"

In [2]:
 # Examples: (age, overweight, diabetic) inputs, each annotated with the expected result
 example_cases = [
     (55, True, True),    # Output: High Risk
     (45, False, False),  # Output: Low Risk
     (65, False, True),   # Output: High Risk
     (45, True, False),   # Output: Moderate Risk
 ]
 for case in example_cases:
     print(heart_disease_risk_rule_based(*case))

High Risk
Low Risk
High Risk
Moderate Risk


## Machine Learning (Data-Driven Approach)

In [7]:
# get the data
import random
import pandas as pd

def _heart_disease_weights(age, overweight, diabetic):
    """Return the ('Yes', 'No') sampling weights for one synthetic individual."""
    if age > 60 and (overweight or diabetic):
        return (0.8, 0.2)  # Higher chance of Yes
    if age > 50 and overweight and diabetic:
        return (0.7, 0.3)
    if age > 40 and overweight and not diabetic:
        return (0.3, 0.7)  # Lower chance of Yes
    return (0.1, 0.9)  # Low chance of Yes


def generate_heart_disease_data(num_records=50, seed=None):
    """
    Generates synthetic data for heart disease risk assessment.

    Each record gets a random age (30-80 inclusive) and random
    overweight / diabetic flags. The 'Heart Disease' label is then sampled
    as 'Yes' or 'No' with weights that depend on those three factors, so
    the labels are noisy rather than a deterministic function of the inputs.

    Args:
        num_records: The number of data records to generate.
        seed: Optional seed for reproducible output. When None (the
            default) the module-level `random` generator is used, so
            calling `random.seed(...)` beforehand still works. When given,
            a private generator is used and global random state is left
            untouched.

    Returns:
        A pandas DataFrame with the columns 'Age', 'Overweight',
        'Diabetic' and 'Heart Disease', one row per record.
    """
    # The `random` module exposes the same randint/choice/choices API as a
    # `random.Random` instance, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    ages = []
    overweight_flags = []
    diabetic_flags = []
    labels = []

    for _ in range(num_records):
        # Draw order (age, overweight, diabetic, label) is kept fixed so a
        # given seed always yields the same dataset.
        age = rng.randint(30, 80)  # Assuming age range of 30-80
        overweight = rng.choice([True, False])
        diabetic = rng.choice([True, False])

        weights = _heart_disease_weights(age, overweight, diabetic)
        heart_disease = rng.choices(['Yes', 'No'], weights=weights)[0]

        ages.append(age)
        overweight_flags.append(overweight)
        diabetic_flags.append(diabetic)
        labels.append(heart_disease)

    return pd.DataFrame({
        'Age': ages,
        'Overweight': overweight_flags,
        'Diabetic': diabetic_flags,
        'Heart Disease': labels,
    })


# Build a 100-record synthetic dataset and display it
# (pandas abbreviates long frames when printing)
data = generate_heart_disease_data(num_records=100)
print(data.head(100))

    Age  Overweight  Diabetic Heart Disease
0    69       False      True           Yes
1    43        True      True            No
2    72       False     False            No
3    58        True     False           Yes
4    70       False     False           Yes
..  ...         ...       ...           ...
95   43       False     False            No
96   31        True      True            No
97   45       False      True            No
98   48       False     False            No
99   70        True     False           Yes

[100 rows x 4 columns]


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# `data` is the DataFrame produced by generate_heart_disease_data above;
# it is re-wrapped here under the name `df` used by the rest of this cell.
df = pd.DataFrame(data)

# Prepare the data
X = df[['Age', 'Overweight', 'Diabetic']]  # Features (model inputs)
y = df['Heart Disease']  # Target ('Yes' / 'No' labels the model learns to predict)

# Split data into training and testing sets:
# - X holds the features, y holds the target values
# - the *_train parts are used to fit the model, the *_test parts to evaluate it
# - test_size=0.2 keeps 20% of the rows for testing and 80% for training
# - random_state=42 seeds the shuffle so the same rows land in each part every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier.
# The forest is seeded as well: otherwise repeated runs on the same data
# give slightly different models and accuracy scores.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model: fraction of test rows whose label was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy}")

# Rough rules of thumb for reading the accuracy:
# 70% - 80%: Often considered a reasonable starting point for many classification problems.
# 80% - 90%: Good performance for many applications.
# 90% - 95%: Very good performance. Often challenging to achieve, but possible for well-behaved problems with good data.
# > 95%: Excellent performance, potentially approaching the limits of what's possible for the problem. Be careful of overfitting if you're achieving very high accuracy.
# 100%: Usually a sign of overfitting.

Accuracy of the model: 0.75


In [11]:
# Predict for a single unseen individual: 55 years old, overweight and diabetic.
# The column names match the features the model was trained on.
new_record = {'Age': 55, 'Overweight': True, 'Diabetic': True}
new_data = pd.DataFrame([new_record])

prediction = model.predict(new_data)
print(f"Prediction on the new data {prediction}")

Prediction on the new data ['Yes']
