In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from pymongo import MongoClient
#from config import mongo_key

In [2]:
# Create an instance of MongoClient
mongo = MongoClient('mongodb+srv://sorianor:Soriano0688!@cluster0.qg1prit.mongodb.net/')

# Assign the database to a variable name
db = mongo['heart_attack_risk_db']

# Assign the collection to a variable
heart_attack_records = db['heart_attack_data']

# Retrieve data from the collection
data_from_mongo = list(heart_attack_records.find())

In [3]:
# Convert the collection to a Pandas DataFrame
heart_attack_df = pd.DataFrame(data_from_mongo)

# Display the number of rows in the DataFrame
print('Number of rows:', len(heart_attack_df))

# Display the DataFrame
heart_attack_df.head()

Number of rows: 8763


Unnamed: 0,_id,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,65cebe94cf813089ec40a416,BMW7812,67,Male,208,158/88,72,0,0,1,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,65cebe94cf813089ec40a417,CZE1114,21,Male,389,165/93,98,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,65cebe94cf813089ec40a418,BNI9906,21,Female,324,174/99,72,1,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,65cebe94cf813089ec40a419,JLN3497,84,Male,383,163/100,73,1,1,1,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,65cebe94cf813089ec40a41a,GFO8847,66,Male,318,91/88,93,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [4]:
# Preprocess the data
# Select features and target
X = heart_attack_df.drop(['Patient ID', 'Heart Attack Risk', 'Blood Pressure', '_id'], axis=1)
# X = heart_attack_df[['Cholesterol', 'Heart Rate', 'Physical Activity Days Per Week']]
y = heart_attack_df['Heart Attack Risk']



In [5]:
# Handling categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

categorical_features
numerical_features

['Age',
 'Cholesterol',
 'Heart Rate',
 'Diabetes',
 'Family History',
 'Smoking',
 'Obesity',
 'Alcohol Consumption',
 'Exercise Hours Per Week',
 'Previous Heart Problems',
 'Medication Use',
 'Stress Level',
 'Sedentary Hours Per Day',
 'Income',
 'BMI',
 'Triglycerides',
 'Physical Activity Days Per Week',
 'Sleep Hours Per Day']

In [6]:
# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

preprocessor

In [7]:
# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

model_pipeline

In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [10]:
# Predictions
y_pred = model_pipeline.predict(X_test)

In [11]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print(classification_report(y_test, y_pred))

Accuracy: 64.18%
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1125
           1       0.00      0.00      0.00       628

    accuracy                           0.64      1753
   macro avg       0.32      0.50      0.39      1753
weighted avg       0.41      0.64      0.50      1753



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Assuming you have your data in X and y arrays
# X contains your features, and y contains your labels (0 or 1)

# Check the distribution of classes before oversampling
print("Before oversampling:", Counter(y))

# Define the oversampler with 'minority' sampling strategy
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Oversample the minority class (class 1)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Check the distribution of classes after oversampling
print("After oversampling:", Counter(y_resampled))


Before oversampling: Counter({0: 5624, 1: 3139})
After oversampling: Counter({0: 5624, 1: 5624})


In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [14]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [15]:
# Predictions
y_pred = model_pipeline.predict(X_test)

In [16]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print(classification_report(y_test, y_pred))

Accuracy: 49.96%
              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1120
           1       0.50      0.49      0.49      1130

    accuracy                           0.50      2250
   macro avg       0.50      0.50      0.50      2250
weighted avg       0.50      0.50      0.50      2250



RANDOM FOREST MODEL

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Define the Random Forest model pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [19]:
# Train the Random Forest model
pipeline = rf_pipeline.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_pipeline.predict(X_test)

In [20]:
# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf*100:.2f}%")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 79.16%
              precision    recall  f1-score   support

           0       0.74      0.89      0.81      1120
           1       0.86      0.69      0.77      1130

    accuracy                           0.79      2250
   macro avg       0.80      0.79      0.79      2250
weighted avg       0.80      0.79      0.79      2250



In [21]:
import joblib

joblib.dump(pipeline,'rf_model.pkl')

['rf_model.pkl']

In [24]:
X_test

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere
5748,35,Male,290,41,1,0,1,1,0,9.184641,...,9,1.378387,141121,18.714263,411,2,4,China,Asia,Northern Hemisphere
7334,19,Female,359,88,1,0,0,0,1,11.975734,...,1,3.903893,44813,27.314347,391,6,9,Vietnam,Asia,Northern Hemisphere
5931,25,Male,123,43,1,1,1,1,1,8.875226,...,2,11.302907,42029,30.306906,60,3,6,Germany,Europe,Northern Hemisphere
3819,29,Female,153,69,1,1,0,1,0,18.804460,...,9,1.331252,73678,19.887581,380,0,10,Italy,Europe,Southern Hemisphere
6394,47,Female,154,78,1,1,1,1,1,2.204877,...,5,4.028869,179582,27.070979,729,2,6,France,Europe,Northern Hemisphere
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10580,62,Female,188,59,1,0,1,1,0,14.760539,...,5,9.837046,274245,19.794727,444,1,6,Canada,North America,Northern Hemisphere
8009,78,Female,362,42,1,0,1,0,1,4.345560,...,7,4.608067,135065,27.943236,363,6,5,Canada,North America,Northern Hemisphere
2876,70,Female,226,110,0,1,1,0,0,6.637940,...,8,3.334771,31484,39.297030,79,0,8,Argentina,South America,Southern Hemisphere
1310,83,Male,209,49,1,1,1,1,1,2.191937,...,5,10.352066,283830,29.917460,165,3,9,France,Europe,Northern Hemisphere


In [25]:
y_pred_rf

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)