In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [30]:
df=pd.read_csv(r'C:\Users\b84266591\Desktop\Tech\MJ TINGs\cardio_train.csv', sep=';')
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


<h3> Update Feature Engineering </h3>

In [40]:
df['height']=df['height']*0.01
df['bmi'] = (df['weight'] / (df['height'] ** 2)).astype('int')
df['age'] = (df['age'] / 365).round().astype('int')

# Define BMI categories
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Apply the function to create a new feature
df['bmi_category'] = df['bmi'].apply(categorize_bmi)

# Define age groups
def categorize_age(age):
    if age < 40:
        return 'Young'
    elif 40 <= age < 60:
        return 'Middle-aged'
    else:
        return 'Senior'

# Apply the function to create a new feature
df['age_group'] = df['age'].apply(categorize_age)

# Define blood pressure categories
def categorize_bp(ap_hi, ap_lo):
    if ap_hi < 120 and ap_lo < 80:
        return 'Normal'
    elif ap_hi >= 140 or ap_lo >= 90:
        return 'Hypertension'
    else:
        return 'High-Normal'

# Apply the function to create a new feature
df['bp_category'] = df.apply(lambda row: categorize_bp(row['ap_hi'], row['ap_lo']), axis=1)

In [7]:
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi,bmi_category,age_group,bp_category
0,0,50,2,1.68,62.0,110,80,1,1,0,0,1,0,21,Normal,Middle-aged,High-Normal
1,1,55,1,1.56,85.0,140,90,3,1,0,0,1,1,34,Obese,Middle-aged,Hypertension
2,2,52,1,1.65,64.0,130,70,3,1,0,0,0,1,23,Normal,Middle-aged,High-Normal
3,3,48,2,1.69,82.0,150,100,1,1,0,0,1,1,28,Overweight,Middle-aged,Hypertension
4,4,48,1,1.56,56.0,100,60,1,1,0,0,0,0,23,Normal,Middle-aged,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,2,1.68,76.0,120,80,1,1,1,0,1,0,26,Overweight,Middle-aged,High-Normal
69996,99995,62,1,1.58,126.0,140,90,2,2,0,0,1,1,50,Obese,Senior,Hypertension
69997,99996,52,2,1.83,105.0,180,90,3,1,0,1,0,1,31,Obese,Middle-aged,Hypertension
69998,99998,61,1,1.63,72.0,135,80,1,2,0,0,0,1,27,Overweight,Senior,High-Normal


In [None]:






# Define which features are numerical and which are categorical
numerical_features = ['age', 'bmi', ...]  # List the names of numerical columns
categorical_features = ['gender', 'bmi_category', 'bp_category', ...]  # List the names of categorical columns

# Create transformers for feature scaling and one-hot encoding
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Use a ColumnTransformer to apply the transformers to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Create a pipeline that combines preprocessing and the classifier
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', rf_classifier)])

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(confusion)
print('Classification Report:')
print(report)


<h2> Building the Model </h2>

In [8]:
# Importing the necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [31]:
df = df.drop(columns=['id'])

In [32]:
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [41]:
# Define features (X) and target (y)
features = df.drop(columns=['cardio'])  # Exclude the target variable
target = df['cardio']

In [42]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [43]:
df.columns

Index(['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio', 'bmi', 'bmi_category',
       'age_group', 'bp_category'],
      dtype='object')

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           70000 non-null  int32  
 1   gender        70000 non-null  int64  
 2   height        70000 non-null  float64
 3   weight        70000 non-null  float64
 4   ap_hi         70000 non-null  int64  
 5   ap_lo         70000 non-null  int64  
 6   cholesterol   70000 non-null  int64  
 7   gluc          70000 non-null  int64  
 8   smoke         70000 non-null  int64  
 9   alco          70000 non-null  int64  
 10  active        70000 non-null  int64  
 11  cardio        70000 non-null  int64  
 12  bmi           70000 non-null  int32  
 13  bmi_category  70000 non-null  object 
 14  age_group     70000 non-null  object 
 15  bp_category   70000 non-null  object 
dtypes: float64(2), int32(2), int64(9), object(3)
memory usage: 8.0+ MB


In [21]:
# Define which features are numerical and which are categorical
numerical_features = ['age', 'bmi', 'height', 'weight', 'ap_hi', 'ap_ho']  # List the names of numerical columns
categorical_features = ['age_group', 'bmi_category', 'bp_category']  # List the names of categorical columns

In [22]:
# Create transformers for feature scaling and one-hot encoding
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [24]:
# Use a ColumnTransformer to apply the transformers to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [26]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [27]:
# Create a pipeline that combines preprocessing and the classifier
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', rf_classifier)])

In [28]:
# Train the model
clf.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe

In [29]:
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi,bmi_category,age_group,bp_category
0,0,50,2,1.68,62.0,110,80,1,1,0,0,1,0,21,Normal,Middle-aged,High-Normal
1,1,55,1,1.56,85.0,140,90,3,1,0,0,1,1,34,Obese,Middle-aged,Hypertension
2,2,52,1,1.65,64.0,130,70,3,1,0,0,0,1,23,Normal,Middle-aged,High-Normal
3,3,48,2,1.69,82.0,150,100,1,1,0,0,1,1,28,Overweight,Middle-aged,Hypertension
4,4,48,1,1.56,56.0,100,60,1,1,0,0,0,0,23,Normal,Middle-aged,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,2,1.68,76.0,120,80,1,1,1,0,1,0,26,Overweight,Middle-aged,High-Normal
69996,99995,62,1,1.58,126.0,140,90,2,2,0,0,1,1,50,Obese,Senior,Hypertension
69997,99996,52,2,1.83,105.0,180,90,3,1,0,1,0,1,31,Obese,Middle-aged,Hypertension
69998,99998,61,1,1.63,72.0,135,80,1,2,0,0,0,1,27,Overweight,Senior,High-Normal
