Importing dependencies

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from scipy import stats

# Switch matplotlib to the Tk GUI backend.
# NOTE(review): inside a Jupyter kernel this replaces the inline backend, so
# figures open in external windows - confirm that this is intended.
matplotlib.use('TkAgg')

# Load the dataset CSV into the frame that the later cells read from.
DATASET_CSV = 'cardio_data_processed.csv'
data = pd.read_csv(DATASET_CSV)


Generating Pandas Data profile

In [None]:
# Profile the raw frame and write the report out as a standalone HTML file.
raw_report_path = 'activity.html'
profile = ProfileReport(data)
profile.to_file(raw_report_path)


In [None]:
# Inspect the rows with weight <= 35 whose blood-pressure category is 'Normal',
# showing only the columns relevant to judging the weight value.
inspect_columns = ['id', 'age', 'height', 'weight', 'bmi', 'bp_category']
filteredWeight = data.query("weight <= 35 & bp_category == 'Normal'")
filteredWeight[inspect_columns]


In [None]:
# Working frame: keep only rows with weight >= 36 (every row below 36,
# including values between 35 and 36, is left out).
# .copy() makes newData an independent frame, so the column assignments done
# in later cells write to it directly instead of to a derived slice of `data`
# (which raises SettingWithCopyWarning).
newData = data.query("weight >= 36").copy()
print(newData)

In [None]:
# Recompute BMI as weight / height^2, dividing height by 100 first.
height_scaled = newData['height'] / 100
newData['New_BMI'] = newData['weight'] / (height_scaled ** 2)

In [None]:
# Histogram (10 bins, with KDE overlay) of age_years in the filtered frame.
age_ax = sns.histplot(newData['age_years'], bins=10, kde=True)
age_ax.set_title('Age Distribution')
plt.show()

In [None]:
# A cell renders only its final expression, so computing the null counts on
# one line and the dtypes on the next showed the dtypes alone. Put both checks
# in a single table: one row per column with its missing-value count and dtype.
pd.DataFrame({
    'null_count': newData.isnull().sum(),
    'dtype': newData.dtypes,
})

In [None]:
# Rows of the raw frame that repeat an earlier row exactly.
is_repeat = data.duplicated()
duplicates = data.loc[is_repeat]
print(duplicates)

# Number of distinct values per column (rendered as the cell output).
data.nunique()

EDA

In [None]:
# Row counts per `active` value, split by the `cardio` flag.
activity_ax = sns.countplot(data=newData, x='active', hue='cardio')
activity_ax.set_title("Cardiovascular Disease vs. Physical Activity")
plt.show()

In [None]:
# Box plot of the dataset's own `bmi` column for each `cardio` value.
bmi_box_ax = sns.boxplot(data=newData, x='cardio', y='bmi')
bmi_box_ax.set_title("BMI vs. Cardiovascular Disease")
plt.show()


In [None]:
# Row counts for each value of the `cholesterol` column.
chol_ax = sns.countplot(x='cholesterol', data=newData)
chol_ax.set_title("No of people in each Cholesterol Levels")
chol_ax.set_xlabel('Cholesterol Levels(1:Normal, 2:Above Normal, 3:Well Above Normal)')
plt.show()

In [None]:
# Age histogram of the unfiltered frame: 28 bins plus a KDE curve.
raw_age_ax = sns.histplot(data['age_years'], bins=28, kde=True)
raw_age_ax.set_xlabel('Age (years)')
raw_age_ax.set_title('Distribution of Age')
plt.show()


In [None]:
# Same age histogram of the unfiltered frame, this time without the KDE curve.
plain_age_ax = sns.histplot(data['age_years'], bins=28)
plain_age_ax.set_xlabel('Age (years)')
plain_age_ax.set_ylabel('Frequency')
plain_age_ax.set_title('Age Distribution')
plt.show()

In [None]:
# Frequency of each distinct value in the `gluc` column of the raw frame.
gluc_levels = data['gluc']
gluc_counts = gluc_levels.value_counts()
print(gluc_counts)

In [None]:
# Summary statistics of the raw `bmi` column, followed by a 30-bin histogram.
bmi_values = data['bmi']
bmi_summary = bmi_values.describe()
print(bmi_summary)

bmi_hist_ax = plt.gca()
bmi_hist_ax.hist(bmi_values, bins=30)
bmi_hist_ax.set_xlabel('BMI')
bmi_hist_ax.set_ylabel('Frequency')
bmi_hist_ax.set_title('BMI Distribution')
plt.show()

In [None]:
# Find the smallest BMI in the raw frame and show every row that attains it.
minimum_bmi = data['bmi'].min()
at_minimum = data['bmi'].eq(minimum_bmi)
row_with_minimum_bmi = data[at_minimum]
print(row_with_minimum_bmi)
print("Minimum BMI:", minimum_bmi)


In [None]:
# Summary statistics of the dataset's own `bmi` column after the weight filter.
newData['bmi'].describe()

In [None]:
# Summary statistics of the recomputed `New_BMI` column, for comparison with `bmi`.
newData['New_BMI'].describe()

In [None]:
# Descriptive statistics (rounded to 2 decimals) and the median of age_years.
ages = newData['age_years']
ageDescribe = ages.describe().round(2)
ageMedian = ages.median()
print(ageDescribe)
print(f'Median age is - {ageMedian}')

In [None]:
# Profile the filtered frame (including any columns added to it so far) and
# write the report to its own HTML file.
filtered_report_path = 'cardio_disease.html'
profile = ProfileReport(newData)
profile.to_file(filtered_report_path)

In [None]:
# Row counts per age_years value, split by the `cardio` flag.
age_cardio_ax = sns.countplot(x='age_years', hue='cardio', data=newData, palette="Set2")
age_cardio_ax.set_xlabel('Age (years)')
age_cardio_ax.set_ylabel('Count')
age_cardio_ax.set_title('Age vs Cardiovascular Disease')
# Replace the default hue labels with No/Yes in the legend.
age_cardio_ax.legend(title='Cardio', labels=['No', 'Yes'])
plt.show()

In [None]:
# Fine-grained (200-bin) histogram of the recomputed BMI with a KDE overlay.
new_bmi_ax = sns.histplot(newData['New_BMI'], bins=200, kde=True, color='green', alpha=0.7)
new_bmi_ax.set_xlabel('BMI')
new_bmi_ax.set_ylabel('Frequency')
new_bmi_ax.set_title('BMI Distribution')
plt.show()

In [None]:
# Box plot of the recomputed BMI for each `cardio` value on an 8x6 figure.
plt.figure(figsize=(8, 6))
new_bmi_box_ax = sns.boxplot(data=newData, x='cardio', y='New_BMI', palette="Set1")
new_bmi_box_ax.set_xlabel('Cardiovascular Disease (0: No, 1: Yes)')
new_bmi_box_ax.set_ylabel('BMI')
new_bmi_box_ax.set_title('BMI vs Cardiovascular Disease (Box Plot)')
plt.show()

In [None]:
# Row counts per blood-pressure category, split by the `cardio` flag.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='bp_category', hue='cardio', data=newData, palette="Set1")
ax.set_xlabel('Blood Pressure Category')
ax.set_ylabel('Count')
ax.set_title('Blood Pressure Category vs Cardiovascular Disease')
ax.legend(title='Cardio', labels=['No', 'Yes'])

# Write each bar's height just above its top edge, centred on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height}',
        (bar_centre_x, bar_height),
        ha='center',
        va='center',
        fontsize=10,
        color='black',
        xytext=(0, 5),
        textcoords='offset points',
    )

plt.show()

In [None]:
# Pairwise correlations between the selected columns, drawn as an annotated heatmap.
corr_columns = [
    'id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
    'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio',
    'age_years', 'New_BMI',
]
corr_matrix = newData[corr_columns].corr()

plt.figure(figsize=(12, 10))
corr_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=.5)
corr_ax.set_title("Correlation Heatmap")
plt.show()

Import Scikit Learn

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the bp_category values with the integer codes produced by a LabelEncoder.
# The original values are overwritten in newData; `le` keeps the fitted encoder.
le = LabelEncoder()
bp_codes = le.fit_transform(newData['bp_category'])
newData['bp_category'] = bp_codes

# Preview the encoded column next to the row id.
newData[['id', 'bp_category']].head()

In [None]:
# No sampling is performed here: sampled_data is a second name bound to the
# same frame object as newData, so a change made through one name is visible
# through the other.
sampled_data = newData
# Summary statistics of the frame (rendered as the cell output).
sampled_data.describe()

In [None]:
# Distribution of the `cholesterol` column.
# The axis label and title previously said "BMI" (copied from the BMI
# histogram cell) although the plotted column is cholesterol; they now
# describe the data that is actually drawn.
sns.histplot(sampled_data['cholesterol'], kde=True, bins=40, color='green', alpha=0.7)
plt.xlabel('Cholesterol Level (1: Normal, 2: Above Normal, 3: Well Above Normal)')
plt.ylabel('Frequency')
plt.title('Cholesterol Distribution')

plt.show()

In [None]:
# Scatter of ap_hi against ap_lo, restricted to the rows where cardio == 1.
# Because of that filter the hue column holds a single value.
cardio_positive = sampled_data['cardio'] == 1
presenceCardio = sampled_data[cardio_positive]

bp_scatter_ax = sns.scatterplot(data=presenceCardio, x='ap_lo', y='ap_hi', hue='cardio')
bp_scatter_ax.set_title('Scatter plot of Cardio vs AP LO and AP HI')
bp_scatter_ax.set_xlabel('AP LO')
bp_scatter_ax.set_ylabel('AP HI')
bp_scatter_ax.legend(title='Cardio')

plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Predictors: both blood-pressure columns, taken from every row of the frame
# (no filtering on cardio). Target: the cardio column.
X = sampled_data[['ap_hi', 'ap_lo']]
Y = sampled_data['cardio']

# Fit a linear regression of the target on the two predictors.
lin_reg = LinearRegression()
lin_reg.fit(X, Y)
print(lin_reg)

# Fitted slope per predictor and the intercept.
coefficients, intercept = lin_reg.coef_, lin_reg.intercept_

print("Coefficients:")
print(coefficients)
print("\nIntercept:")
print(intercept)


In [None]:
# In-sample predictions of the fitted linear model.
Y_pred = lin_reg.predict(X)

# Observed target against each predictor.
for column, colour in (('ap_hi', 'blue'), ('ap_lo', 'red')):
    plt.scatter(X[column], Y, color=colour, label=column)

# Predicted target against each predictor.
# NOTE(review): plt.plot joins the points in row order and the prediction
# depends on both predictors, so this is not a single straight line - confirm
# that this is the intended picture.
for column, colour in (('ap_hi', 'green'), ('ap_lo', 'orange')):
    plt.plot(X[column], Y_pred, color=colour, linewidth=2, label=f'Linear Regression ({column})')

plt.xlabel('ap_hi and ap_lo')
plt.ylabel('cardio')
plt.title('Linear Regression Plot')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Welch's t-test (unequal variances) on ap_hi between rows with and without
# cardiovascular disease.
# The two groups previously selected the 'age_years' column, while the
# variable names and both printed conclusions refer to 'ap_hi'; the test now
# runs on the column that the conclusion talks about.
ap_hi_with_cardio = data[data['cardio'] == 1]['ap_hi']
ap_hi_without_cardio = data[data['cardio'] == 0]['ap_hi']

t_stat, p_value = stats.ttest_ind(ap_hi_with_cardio, ap_hi_without_cardio, equal_var=False)

print("Independent t-test p-value:", p_value)

# Decide at the 5% significance level.
if p_value < 0.05:
    print("Reject Null Hypothesis: There is a significant difference in mean 'ap_hi' between individuals with and without cardiovascular disease.")
else:
    print("Fail to Reject Null Hypothesis: There is no significant difference in mean 'ap_hi' between the groups.")

In [None]:
# ap_hi values of the rows where cardio == 1, shown as a 10-bin histogram.
cardio_present = sampled_data['cardio'] == 1
ap_hi_cardio_1 = sampled_data.loc[cardio_present, 'ap_hi']

plt.figure(figsize=(8, 6))
ap_hi_ax = plt.gca()
ap_hi_ax.hist(ap_hi_cardio_1, bins=10, color='skyblue', edgecolor='black')
ap_hi_ax.set_xlabel('Systolic Blood Pressure (ap_hi) - Cardio Presence')
ap_hi_ax.set_ylabel('Frequency')
ap_hi_ax.set_title('Distribution of Systolic Blood Pressure (ap_hi) for Cardio Presence')
plt.show()

In [None]:
import statsmodels.api as sm

# Logit model of cardio on ap_hi and ap_lo. statsmodels adds no intercept by
# itself, so a constant column is added to the design matrix; X keeps that
# extra column after this cell.
Y = sampled_data['cardio']
X = sm.add_constant(sampled_data[['ap_hi', 'ap_lo']])

logit_model = sm.Logit(Y, X)
logit_result = logit_model.fit()

print(logit_result.summary())

In [None]:
from scipy.stats import ttest_ind

# Split ap_hi by the cardio flag: group1 is cardio == 0, group2 is cardio == 1.
cardio_flag = sampled_data['cardio']
group1 = sampled_data.loc[cardio_flag == 0, 'ap_hi']
group2 = sampled_data.loc[cardio_flag == 1, 'ap_hi']

# Welch's t-test (equal_var=False); the result object is the cell output.
ttest_ind(group1, group2, equal_var=False)

In [None]:
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import pandas as pd

# Refit the logit model on the notebook-level X and Y.
design_matrix = sm.add_constant(X)
logit_model = sm.Logit(Y, design_matrix)
logit_result = logit_model.fit()

# In-sample probabilities, cut at 0.5 to obtain class labels.
threshold = 0.5
predicted = logit_result.predict()
predicted_classes = (predicted > threshold).astype(int)

# Confusion matrix of raw counts, labelled for printing.
conf_matrix = confusion_matrix(Y, predicted_classes)
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)

print("Confusion Matrix:")
print(conf_matrix_df)


In [None]:
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import pandas as pd

# Refit the logit model on the notebook-level X and Y.
design_matrix = sm.add_constant(X)
logit_model = sm.Logit(Y, design_matrix)
logit_result = logit_model.fit()

# In-sample probabilities, cut at 0.5 to obtain class labels.
threshold = 0.5
predicted = logit_result.predict()
predicted_classes = (predicted > threshold).astype(int)

conf_matrix = confusion_matrix(Y, predicted_classes)

# Express every cell as a percentage of all observations.
total = conf_matrix.sum()
conf_matrix_percent = conf_matrix / total * 100

conf_matrix_df = pd.DataFrame(
    conf_matrix_percent,
    index=['Actual 0 (%)', 'Actual 1 (%)'],
    columns=['Predicted 0 (%)', 'Predicted 1 (%)'],
)

print("Confusion Matrix with Percentages:")
print(conf_matrix_df)


In [None]:
# Heatmap of the percentage confusion matrix held in conf_matrix_percent.
plt.figure(figsize=(8, 6))
cm_ax = sns.heatmap(conf_matrix_percent, annot=True, fmt=".2f", cmap='Blues', cbar=False)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build the inputs explicitly instead of reusing the notebook-level X and Y.
# At this point X is the statsmodels design matrix, which carries an extra
# constant column; LogisticRegression fits its own intercept by default, so
# that column is a redundant feature here. Selecting the columns by name also
# makes the cell independent of which earlier cell last assigned X.
bp_features = sampled_data[['ap_hi', 'ap_lo']]
cardio_target = sampled_data['cardio']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    bp_features, cardio_target, test_size=0.2, random_state=42
)

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Predicted labels for the held-out rows.
predictions = logistic_model.predict(X_test)

# Mean accuracy on the held-out rows.
accuracy = logistic_model.score(X_test, y_test)
print("Accuracy:", accuracy)


Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Feature columns for the model; the target is the cardio column.
features = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'New_BMI']
X = sampled_data[features]
y = sampled_data['cardio']

# Hold out 20% of the rows; fixed seeds keep the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Evaluate on the held-out rows.
predictions2 = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions2)
print(f'Accuracy: {accuracy:.2f}')

print(classification_report(y_test, predictions2))

conf_matrix = confusion_matrix(y_test, predictions2)

plt.figure(figsize=(8, 6))
# The matrix holds integer counts, so annotate with integer formatting
# ("d") rather than ".2f", which printed every count with a ".00" suffix.
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


Multilayer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Feature columns for the model; the target is the cardio column.
features = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'New_BMI']
X = sampled_data[features]
y = sampled_data['cardio']

# Hold out 20% of the rows; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to the test rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Two hidden layers of 25 units each.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(25, 25), max_iter=500,
                               activation='relu', solver='adam', random_state=42)

mlp_classifier.fit(X_train, y_train)

# Evaluate on the held-out rows.
predictions3 = mlp_classifier.predict(X_test)

confu_matrix = confusion_matrix(y_test, predictions3)

print('Accuracy:', accuracy_score(y_test, predictions3))
print('Confusion Matrix:\n', confu_matrix)
print('Classification Report:\n', classification_report(y_test, predictions3))

plt.figure(figsize=(4, 6))
# The matrix holds integer counts, so annotate with integer formatting
# ("d") rather than ".2f", which printed every count with a ".00" suffix.
sns.heatmap(confu_matrix, annot=True, fmt="d", cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()