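Before running this notebook, import the dataset into MongoDB by running the following command in a terminal from the directory that contains `heart_attack_risk.csv`: `mongoimport --type csv -d heart_attack_risk_db -c heart_attack_data --headerline --drop heart_attack_risk.csv`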

In [1]:
# Import dependencies
import csv
import json
from pprint import pprint

import pandas as pd
from pymongo import MongoClient

# Import MongoClient Key
from config import mongo_key

In [2]:
# Create an instance of MongoClient, connecting with the `mongo_key`
# value imported from the local config module
mongo = MongoClient(mongo_key)

In [3]:
# Confirm that our new database was created: 'heart_attack_risk_db'
# should appear among the database names reported by the server
print(mongo.list_database_names())

['COVID19_DB', 'CitiBike_DB', 'heart_attack_risk_db', 'admin', 'local']


In [4]:
# Bind the heart-attack-risk database to a short name used by later cells
db = mongo.get_database('heart_attack_risk_db')

In [5]:
# Review the collections in our new database; we expect to see the
# 'heart_attack_data' collection that later cells read from
print(db.list_collection_names())

['heart_attack_data']


In [6]:
# Peek at a single document to see the field names and sample values
pprint(db['heart_attack_data'].find_one())

{'Age': 51,
 'Alcohol Consumption': 0,
 'BMI': 24.66997133,
 'Blood Pressure': '106/106',
 'Cholesterol': 197,
 'Continent': 'South America',
 'Country': 'Argentina',
 'Diabetes': 1,
 'Diet': 'Unhealthy',
 'Exercise Hours Per Week': 14.12351894,
 'Family History': 1,
 'Heart Attack Risk': 0,
 'Heart Rate': 79,
 'Hemisphere': 'Southern Hemisphere',
 'Income': 257061,
 'Medication Use': 0,
 'Obesity': 1,
 'Patient ID': 'QWD3129',
 'Physical Activity Days Per Week': 1,
 'Previous Heart Problems': 0,
 'Sedentary Hours Per Day': 1.539100041,
 'Sex': 'Male',
 'Sleep Hours Per Day': 5,
 'Smoking': 1,
 'Stress Level': 3,
 'Triglycerides': 785,
 '_id': ObjectId('65cebe94cf813089ec40a444')}


In [7]:
# Keep a handle on the collection so later cells can query it directly
heart_attack_records = db.heart_attack_data

In [8]:
# Pull every document in the collection into an in-memory list
data_from_mongo = [document for document in heart_attack_records.find()]

In [9]:
# Build a Pandas DataFrame from the retrieved documents
heart_attack_df = pd.DataFrame(data_from_mongo)

# Report how many rows were loaded
print('Number of rows:', heart_attack_df.shape[0])

# Preview the first few rows
heart_attack_df.head()

Number of rows: 8763


Unnamed: 0,_id,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,65cebe94cf813089ec40a444,QWD3129,51,Male,197,106/106,79,1,1,1,...,1.5391,257061,24.669971,785,1,5,Argentina,South America,Southern Hemisphere,0
1,65cebe94cf813089ec40a45d,DNY3115,46,Male,333,130/94,63,1,1,1,...,11.037187,128868,23.907091,547,5,10,Spain,Europe,Southern Hemisphere,0
2,65cebe94cf813089ec40a418,BNI9906,21,Female,324,174/99,72,1,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,65cebe94cf813089ec40a41f,FTJ5456,43,Female,248,160/70,55,0,1,1,...,4.055115,209703,22.558917,232,7,7,Japan,Asia,Northern Hemisphere,0
4,65cebe94cf813089ec40a443,ENZ9640,33,Male,185,120/63,79,0,1,1,...,11.083843,239725,37.209005,675,6,4,Italy,Europe,Southern Hemisphere,1


In [10]:
# Confirm all columns are available (the Mongo '_id' field is still
# present at this point, alongside the dataset's own columns)
heart_attack_df.columns

Index(['_id', 'Patient ID', 'Age', 'Sex', 'Cholesterol', 'Blood Pressure',
       'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
       'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country',
       'Continent', 'Hemisphere', 'Heart Attack Risk'],
      dtype='object')

In [11]:
# Count the non-null values in the 'Heart Attack Risk' column
Count = heart_attack_df['Heart Attack Risk'].count()
print(f'There are a total of {Count} values')

# Show how many records fall into each 'Heart Attack Risk' class
heart_attack_df['Heart Attack Risk'].value_counts()

There are a total of 8763 values


Heart Attack Risk
0    5624
1    3139
Name: count, dtype: int64

In [12]:
# Drop MongoDB's '_id' identifier column so only the dataset's own
# columns remain.
# errors='ignore' keeps this cell safe to re-run: the result is assigned
# back to heart_attack_df, so a second run would otherwise raise KeyError
# once '_id' is already gone.
heart_attack_df = heart_attack_df.drop(columns='_id', errors='ignore')

In [13]:
# Pretty-print the column names still present in the DataFrame
pprint(list(heart_attack_df.columns))

['Patient ID',
 'Age',
 'Sex',
 'Cholesterol',
 'Blood Pressure',
 'Heart Rate',
 'Diabetes',
 'Family History',
 'Smoking',
 'Obesity',
 'Alcohol Consumption',
 'Exercise Hours Per Week',
 'Diet',
 'Previous Heart Problems',
 'Medication Use',
 'Stress Level',
 'Sedentary Hours Per Day',
 'Income',
 'BMI',
 'Triglycerides',
 'Physical Activity Days Per Week',
 'Sleep Hours Per Day',
 'Country',
 'Continent',
 'Hemisphere',
 'Heart Attack Risk']


In [23]:
# Save dataframe to csv at a path relative to the working directory;
# index=False leaves the DataFrame's row index out of the file
heart_attack_df.to_csv("heart_attack_risk_data.csv", index=False)

In [24]:
# Convert CSV to JSON for dashboard.
# Every value is written as a JSON string, because csv.DictReader yields
# strings for all fields.

# Context managers close both files even if reading or serialising fails.
# newline='' is what the csv module expects from its input file, and the
# explicit UTF-8 encoding matches pandas' to_csv default instead of relying
# on the platform's locale encoding.
with open('heart_attack_risk_data.csv', 'r', newline='', encoding='utf-8') as csvfile:
    # With no explicit fieldnames, DictReader takes the keys from the header
    # row, so there is no hard-coded column list to keep in sync with the
    # CSV and no header row to skip by hand.
    reader = csv.DictReader(csvfile)
    fieldnames = reader.fieldnames

    # Create a list holding one dict per data row
    data = list(reader)

# Use json.dump to write the entire list to the file
with open('heart_attack_risk.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(data, jsonfile, indent=2)