In [2]:
# Import necessary libraries
import boto3
import pandas as pd
import os

# AWS S3 Configuration
s3_bucket = "cardiovale-solutions-datascience-pipeline"
s3_client = boto3.client("s3")

# Define the root directory and correct data folder path
root_directory = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Go to the root project directory
data_folder = os.path.join(root_directory, "data")  # Save data in the root-level data folder

# Ensure the data folder exists
os.makedirs(data_folder, exist_ok=True)

# Files to fetch from S3
s3_files = {
    "cardio_train": "raw-data/cardio_train.csv",
    "quitline_services": "raw-data/Quitline___Services_Available___Medications_-_2010_To_Present_20250306.csv"
}


In [3]:
# -----------------------------
# 1. Manually Fetch Data from S3
# -----------------------------
print("Fetching data from S3...\n")

for dataset, s3_path in s3_files.items():
    local_file_path = os.path.join(data_folder, f"{dataset}.csv")  # Save in the root-level `data/` folder
    
    try:
        s3_client.download_file(s3_bucket, s3_path, local_file_path)
        print(f"{dataset} downloaded successfully! Saved to {local_file_path}")
    except Exception as e:
        print(f"Error fetching {dataset}: {e}")


Fetching data from S3...

Error fetching cardio_train: An error occurred (404) when calling the HeadObject operation: Not Found
Error fetching quitline_services: An error occurred (404) when calling the HeadObject operation: Not Found


In [4]:
# -----------------------------
# 2. Load and Explore Data
# -----------------------------
print("\nLoading Data into Pandas...")

# Load data into Pandas
cardio_df = pd.read_csv(os.path.join(data_folder, "cardio_train.csv"))
quitline_df = pd.read_csv(os.path.join(data_folder, "quitline_services.csv"))


# Display basic dataset info
print("\nCardio Dataset Sample:")
print(cardio_df.head())

print("\nQuitline Dataset Sample:")
print(quitline_df.head())

# Check data types
print("\nCardio Data Types:")
print(cardio_df.dtypes)

print("\nQuitline Data Types:")
print(quitline_df.dtypes)

# Check for missing values
print("\nMissing Values in Cardio Dataset:")
print(cardio_df.isnull().sum())

print("\nMissing Values in Quitline Dataset:")
print(quitline_df.isnull().sum())



Loading Data into Pandas...

Cardio Dataset Sample:
  id;age;gender;height;weight;ap_hi;ap_lo;cholesterol;gluc;smoke;alco;active;cardio
0              0;18393;2;168;62.0;110;80;1;1;0;0;1;0                               
1              1;20228;1;156;85.0;140;90;3;1;0;0;1;1                               
2              2;18857;1;165;64.0;130;70;3;1;0;0;0;1                               
3             3;17623;2;169;82.0;150;100;1;1;0;0;1;1                               
4              4;17474;1;156;56.0;100;60;1;1;0;0;0;0                               

Quitline Dataset Sample:
   Year  Date Date_Ref LocationAbbr LocationDesc TopicType  \
0  2020    12  Jul-Dec           GA      Georgia  Quitline   
1  2020    12  Jul-Dec           MO     Missouri  Quitline   
2  2020    12  Jul-Dec           MT      Montana  Quitline   
3  2020    12  Jul-Dec           NJ   New Jersey  Quitline   
4  2020    12  Jul-Dec           OK     Oklahoma  Quitline   

            TopicDesc  MeasureDesc         S

In [5]:
# -----------------------------
# 3. Identify Key Fields & Bias
# -----------------------------
print("\n Key Fields & Bias Assessment:")

# Key fields expected
key_fields_cardio = ["id", "age", "gender", "height", "weight", "cholesterol", "smoke", "cardio"]
key_fields_quitline = ["state", "year", "medication_available", "medication_type"]

# Bias Analysis
data_bias_concerns = """
- Cardio dataset may have selection bias based on demographic distribution.
- Quitline services dataset may be biased towards urban areas where programs are well-documented.
"""

print(f"Key Fields in Cardio Dataset: {key_fields_cardio}")
print(f"Key Fields in Quitline Dataset: {key_fields_quitline}")
print(f"\nData Bias Concerns: {data_bias_concerns}")



 Key Fields & Bias Assessment:
Key Fields in Cardio Dataset: ['id', 'age', 'gender', 'height', 'weight', 'cholesterol', 'smoke', 'cardio']
Key Fields in Quitline Dataset: ['state', 'year', 'medication_available', 'medication_type']

Data Bias Concerns: 
- Cardio dataset may have selection bias based on demographic distribution.
- Quitline services dataset may be biased towards urban areas where programs are well-documented.



In [6]:
# -----------------------------
# 4. Security & Privacy Checklist
# -----------------------------
print("Security & Privacy Checklist:")

# PHI (Protected Health Information) and PII (Personally Identifiable Information)
contains_phi = "No"
contains_pii = "No"
tracks_user_behavior = "No"
processes_credit_card_data = "No"

# S3 Buckets Read/Write
s3_buckets_used = [s3_files["cardio_train"], s3_files["quitline_services"]]

# Print Security Assessment
print(f"Contains PHI: {contains_phi}")
print(f"Contains PII: {contains_pii}")
print(f"Tracks User Behavior: {tracks_user_behavior}")
print(f"Processes Credit Card Data: {processes_credit_card_data}")
print(f"S3 Buckets Accessed: {s3_buckets_used}")


Security & Privacy Checklist:
Contains PHI: No
Contains PII: No
Tracks User Behavior: No
Processes Credit Card Data: No
S3 Buckets Accessed: ['raw-data/cardio_train.csv', 'raw-data/Quitline___Services_Available___Medications_-_2010_To_Present_20250306.csv']


In [7]:
import os

# Define correct data folder (outside notebooks)
data_folder = "/home/sagemaker-user/ads-508-team/data"

# Ensure the folder exists
os.makedirs(data_folder, exist_ok=True)

# -----------------------------
# 5. Measuring Impact
# -----------------------------
print("Measuring Project Impact:")
metrics = [
    "Reduction in missing data (%)",
    "Increase in data quality scores (%)",
]
print(f"Expected Impact Metrics: {metrics}")

# Store Data Locally for Further Processing
print("\nSaving processed data locally...")

cardio_df.to_csv(os.path.join(data_folder, "processed_cardio_train.csv"), index=False)
quitline_df.to_csv(os.path.join(data_folder, "processed_quitline_services.csv"), index=False)

print("Data Successfully Stored for Further Analysis!")


Measuring Project Impact:
Expected Impact Metrics: ['Reduction in missing data (%)', 'Increase in data quality scores (%)']

Saving processed data locally...
Data Successfully Stored for Further Analysis!


# Realease Resources

In [8]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [9]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>