In [None]:
import pandas as pd
import os
import sys
import boto3
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Put the parent of the current working directory on the import path so the
# project-local `utils` package imported below can be resolved.
# NOTE(review): assumes the kernel is started from a direct sub-folder of the
# project root — confirm.
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, os.pardir))
sys.path.append(parent_dir)

# python-dotenv: load variables from a .env file into the process environment
# before any os.environ lookups are made.
load_dotenv()

from utils.helper_methods import (
    redshift_read_table,
    read_s3_data,
    get_redshift_endpoint,
)

# Connection settings are read from the environment (see load_dotenv() above).
redshift_master_user = os.environ.get("REDSHIFT_MASTER_USER")
redshift_master_password = os.environ.get("REDSHIFT_MASTER_P")
redshift_db_name = os.environ.get("REDSHIFT_DB_NAME")
redshift_cluster_name = os.environ.get("REDSHIFT_CLUSTER_NAME")
redshift_table_name = os.environ.get("REDSHIFT_TABLE_NAME")

bucket_name = os.environ.get("S3_BUCKET_NAME")

# Fail fast on unset variables: os.environ.get() returns None for them, and
# None would otherwise be passed to get_redshift_endpoint() and interpolated
# as the literal text "None" into the connection string below.
# Only the settings this cell actually consumes are treated as required.
_required_settings = {
    "REDSHIFT_MASTER_USER": redshift_master_user,
    "REDSHIFT_MASTER_P": redshift_master_password,
    "REDSHIFT_DB_NAME": redshift_db_name,
    "REDSHIFT_CLUSTER_NAME": redshift_cluster_name,
}
_missing_settings = [
    name for name, value in _required_settings.items() if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in the .env file."
    )

# Resolve the cluster's host and port from its name via the project helper.
cluster_address, cluster_port = get_redshift_endpoint(redshift_cluster_name)

# Space-separated key=value connection string handed to redshift_read_table().
# It embeds the password in clear text — do not print or display it.
connection_string = (
    f"dbname={redshift_db_name} "
    f"user={redshift_master_user} "
    f"password={redshift_master_password} "
    f"host={cluster_address} "
    f"port={cluster_port}"
)

# Verify raw intermediate data in S3 bucket


In [None]:
# Build an S3 client and fetch the raw data through the project helper.
# NOTE(review): "firehose-raw-data" is hard-coded while `bucket_name` is read
# from S3_BUCKET_NAME in the setup cell and never used — confirm whether this
# argument is the bucket name or a key prefix.
s3_client = boto3.client("s3")
bucket_data = read_s3_data(s3_client, "firehose-raw-data")

# Print the number of entries returned, then display the data itself as the
# cell output.
print(len(bucket_data))
bucket_data

In [None]:
# Display only the values held in bucket_data (without their keys).
bucket_data.values()

---


## Connect to and read from Redshift Database


In [None]:
# Read the whole user health table from Redshift.
# NOTE(review): `redshift_table_name` is loaded from REDSHIFT_TABLE_NAME in the
# setup cell but is not used here — confirm the hard-coded name is intentional.
table_name = "user_health_data"
sql_query = "SELECT * FROM " + table_name

df_user_health_data = redshift_read_table(connection_string, sql_query)

# Print the row count, then display the result as the cell output.
print(len(df_user_health_data))
df_user_health_data

In [None]:
# Scatter plot of the novel stress marker against age, one series per user.
fig, ax = plt.subplots(figsize=(10, 6))

# Group by the bare column name rather than a one-element list: with ["name"]
# pandas 2.x yields 1-tuple keys (FutureWarning on 1.5), so the legend would
# show "('Alice',)" instead of "Alice".
for key, grp in df_user_health_data.groupby("name"):
    # Markers only (linestyle=""), so each user's points are not joined by lines.
    ax.plot(grp["age"], grp["novel_stress_marker"], marker="o", linestyle="", label=key)

ax.set_xlabel("Age")
ax.set_ylabel("Novel Stress Marker")
ax.set_title("Novel Stress Marker by Age and User")
ax.legend()
plt.show()