# Queries to use for A2b, A3a, A3b, A3c, A3d, A3e, A3f


In [1]:
# Required libraries
import os
import time
import boto3
import logging
import pandas as pd
from typing import Dict

In [2]:
# Root logging setup. Each record is rendered as:
#   [timestamp] p<process id> {<file>:<line>} LEVEL - message
logging.basicConfig(
    level=logging.INFO,
    format=(
        '[%(asctime)s] p%(process)s '
        '{%(filename)s:%(lineno)d} '
        '%(levelname)s - %(message)s'
    ),
)

# Logger used by every cell in this notebook.
logger = logging.getLogger(__name__)

In [3]:
# --- Configuration ----------------------------------------------------------
# Database name handed to Athena as the query execution context.
# NOTE(review): this looks like a placeholder; the queries below fully qualify
# the table ("AwsDataCatalog"."a05"."a05") -- confirm the intended value.
SCHEMA_NAME = "schema_name"

# Fill in your net id below; it determines the results bucket name.
netid = "imb59"

AWS_REGION = "us-east-1"
S3_OUTPUT_DIRECTORY = "data"

# Athena writes query results to s3://<bucket>/<prefix>/
S3_STAGING_PREFIX = "data/a05"
S3_BUCKET_NAME = "athena-" + netid
S3_STAGING_DIR = "s3://{}/{}/".format(S3_BUCKET_NAME, S3_STAGING_PREFIX)

In [4]:
# Athena client shared by every query cell below, bound to the configured region.
athena_client = boto3.client(
    service_name="athena",
    region_name=AWS_REGION,
)

In [11]:
def download_and_load_query_results(
    client: boto3.client, query_response: Dict
) -> pd.DataFrame:
    """Wait for an Athena query to finish, then load its result CSV from S3.

    Args:
        client: Athena client used to poll ``get_query_results``.
        query_response: Response of ``start_query_execution``; only its
            ``"QueryExecutionId"`` entry is read.

    Returns:
        The query result parsed with ``pd.read_csv``.

    Raises:
        Exception: Any error from ``get_query_results`` whose message does not
            contain "not yet finished" is re-raised unchanged.

    Side effects:
        Downloads ``<S3_STAGING_PREFIX>/<QueryExecutionId>.csv`` from
        ``S3_BUCKET_NAME`` to ``athena_query_results.csv`` in the working
        directory, overwriting any previous download.
    """
    logger.info("download_and_load_query_results, enter")
    entered_at = time.time()

    # get_query_results is used purely as a completion probe: it raises while
    # the query is still running, and its (paged) payload is discarded because
    # the full result set is read from the staged CSV below.
    while True:
        try:
            client.get_query_results(
                QueryExecutionId=query_response["QueryExecutionId"]
            )
            break
        except Exception as err:
            if "not yet finished" not in str(err):
                # Bare raise keeps the original traceback intact.
                raise
            time.sleep(0.5)

    # The calling cells set a module-level `start_time` just before submitting
    # the query. Use it when present so the logged duration is unchanged, but
    # fall back to this function's entry time instead of raising NameError
    # when the function is called from a fresh kernel or another context.
    query_started_at = globals().get("start_time", entered_at)
    logger.info(f"Time to complete query: {time.time() - query_started_at}s")

    temp_file_location: str = "athena_query_results.csv"
    s3_client = boto3.client(
        "s3",
        region_name=AWS_REGION,
    )

    # S3 keys always use "/" separators; os.path.join would produce
    # backslashes on Windows, so join the key parts explicitly.
    result_file_name = f"{query_response['QueryExecutionId']}.csv"
    s3_path = "/".join(
        part for part in (S3_STAGING_PREFIX.rstrip("/"), result_file_name) if part
    )
    logger.info(f"downloading file from S3_BUCKET_NAME={S3_BUCKET_NAME}, s3_path={s3_path}, to local file {temp_file_location}")
    s3_client.download_file(
        S3_BUCKET_NAME,
        s3_path,
        temp_file_location,
    )
    df = pd.read_csv(temp_file_location)
    logger.info(f"results dataframe shape is {df.shape}")
    return df

## Sorting Subreddits Relevant to: 

(A1, B3a, B13), A2b, A3a, A3b

* “Frustrating” or “frustrat” and “cancer” (HINTS A2b)
* “cancer” and “doctors” or “trust” (i.e. does not “need” to contain trust because trust is included in the NRC sentiment analysis) (HINTS A3a)
* “cancer” and “family” or “friends” or “sister” or “brother” or “mother” or “mom” or “father” or “cousin” or “aunt” or “uncle” or “trust” (HINTS A3b)

HINTS Questions: 

* SeekCancerInfo: A1 | Have you ever looked for information about cancer from any source?
* Electronic2_HealthInfo: B3a | In the past 12 months have you used the Internet to look for health or medical information?
* MisleadingHealthInfo: B13 | How much of the health information that you see on social media do you think is false or misleading?


In [12]:
# HINTS A2b: comments whose body mentions both "frustrat" (any ending) and "cancer"
q = '''
SELECT * 
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%frustrat%' AND LOWER("body") LIKE '%cancer%'
'''

# Submit the query; Athena stages the SSE-S3 encrypted result under S3_STAGING_DIR
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext=dict(Database=SCHEMA_NAME),
    ResultConfiguration=dict(
        OutputLocation=S3_STAGING_DIR,
        EncryptionConfiguration=dict(EncryptionOption="SSE_S3"),
    ),
)
logger.info(response)

# Block until the query finishes, then read the staged CSV into a DataFrame
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Keep a local copy of the matches
output_file = "Query_A2b.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check
print(df_filtered_comments.head())


[2024-11-11 19:17:13,311] p124 {1841568608.py:20} INFO - {'QueryExecutionId': 'dfc1c730-8fd5-4d61-8d5b-9435c5c23a66', 'ResponseMetadata': {'RequestId': '5716afc4-ef67-475a-a4d0-fb3163692535', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 19:17:13 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '5716afc4-ef67-475a-a4d0-fb3163692535'}, 'RetryAttempts': 0}}
[2024-11-11 19:17:13,311] p124 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-11 19:17:19,637] p124 {272223420.py:17} INFO - Time to complete query: 6.428727149963379s
[2024-11-11 19:17:19,643] p124 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/dfc1c730-8fd5-4d61-8d5b-9435c5c23a66.csv, to local file athena_query_results.csv
[2024-11-11 19:17:19,744] p124 {272223420.py:31} INFO - results dataframe shape is (25, 17)
[2024-11-11 19:17:19,749] p124 {1841568608.py:26} INFO - R

               author author_flair_css_class author_flair_text  \
0     No-Occasion5142                    NaN               NaN   
1   therealstevielong                    NaN               NaN   
2          Bored_AF18                    NaN               NaN   
3  Honest_Objective67                     ep                SA   
4         cindysinner                    NaN               NaN   

                                                body  controversiality  \
0  I have the same thing lol majority of my frien...                 0   
1  brother the world is unfair. some people are b...                 0   
2  thank you for pointing out how it’s misunderst...                 0   
3  The deaths aren't tracked because it's likely ...                 0   
4  Maybe ask her if she calls a plumber when plum...                 0   

   created_utc  distinguished  edited  gilded       id     link_id  \
0   1719793781            NaN     NaN       0  lb22v68  t3_1dsfd3r   
1   1719798591    

In [63]:
# HINTS A3a: comments mentioning "cancer" together with "doctors" or "trust"
q = '''
SELECT * 
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (LOWER("body") LIKE '%doctors%' OR LOWER("body") LIKE '%trust%')
'''

# Submit the query; Athena stages the SSE-S3 encrypted result under S3_STAGING_DIR
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext=dict(Database=SCHEMA_NAME),
    ResultConfiguration=dict(
        OutputLocation=S3_STAGING_DIR,
        EncryptionConfiguration=dict(EncryptionOption="SSE_S3"),
    ),
)
logger.info(response)

# Block until the query finishes, then read the staged CSV into a DataFrame
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Keep a local copy of the matches
output_file = "Query_A3a.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check
print(df_filtered_comments.head())


[2024-11-11 03:57:45,880] p537 {1344576528.py:21} INFO - {'QueryExecutionId': '51a02f1a-c5dd-48d3-924f-db59f721656d', 'ResponseMetadata': {'RequestId': 'f596f2b4-84b7-4655-b0c8-13fc3521f063', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 03:57:45 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': 'f596f2b4-84b7-4655-b0c8-13fc3521f063'}, 'RetryAttempts': 0}}
[2024-11-11 03:57:45,881] p537 {1553365550.py:3} INFO - Waiting for query to complete...
[2024-11-11 03:57:52,316] p537 {1553365550.py:19} INFO - Downloading results from data/results/51a02f1a-c5dd-48d3-924f-db59f721656d.csv to local file athena_query_results.csv
[2024-11-11 03:57:52,404] p537 {1553365550.py:26} INFO - Results loaded, DataFrame shape: (184, 17)
[2024-11-11 03:57:52,415] p537 {1344576528.py:27} INFO - Results saved to Query_A3a.csv
[2024-11-11 03:57:52,415] p537 {1344576528.py:28} INFO - Data fetched and committed in 6.6717264

                 author author_flair_css_class author_flair_text  \
0         tropical_moss                    NaN               NaN   
1         AutoModerator                    NaN               NaN   
2   Green_DREAM-lizards                    NaN               NaN   
3      Low-Addendum9282                    NaN               NaN   
4  Reinmaindiewithglory                    NaN               NaN   

                                                body  controversiality  \
0  This pisses me off so much. I’m a stage IV can...                 0   
1  We're sorry to hear that you need to visit thi...                 0   
2  No. It is your opinion.  Vegans NEED to supple...                 0   
3  Science has long been our go-to for unraveling...                 0   
4  I set aside half in case of taxes then I move ...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719801439           NaN     NaN       0  lb2l5k0  t3_1drvfz5

In [64]:
# HINTS A3b: comments mentioning "cancer" together with a family/friends term or "trust"
q = '''
SELECT * 
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%family%' 
      OR LOWER("body") LIKE '%friends%' 
      OR LOWER("body") LIKE '%sister%' 
      OR LOWER("body") LIKE '%brother%' 
      OR LOWER("body") LIKE '%mother%' 
      OR LOWER("body") LIKE '%mom%' 
      OR LOWER("body") LIKE '%father%' 
      OR LOWER("body") LIKE '%cousin%' 
      OR LOWER("body") LIKE '%aunt%' 
      OR LOWER("body") LIKE '%uncle%' 
      OR LOWER("body") LIKE '%trust%'
  )
'''

# Submit the query; Athena stages the SSE-S3 encrypted result under S3_STAGING_DIR
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext=dict(Database=SCHEMA_NAME),
    ResultConfiguration=dict(
        OutputLocation=S3_STAGING_DIR,
        EncryptionConfiguration=dict(EncryptionOption="SSE_S3"),
    ),
)
logger.info(response)

# Block until the query finishes, then read the staged CSV into a DataFrame
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Keep a local copy of the matches
output_file = "Query_A3b.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check
print(df_filtered_comments.head())


[2024-11-11 03:58:03,302] p537 {1649558915.py:33} INFO - {'QueryExecutionId': '50520700-e02b-4b38-85f4-fb24b7a368d9', 'ResponseMetadata': {'RequestId': '84624bc4-35a3-4800-9e88-ee68b8039c6e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 03:58:03 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '84624bc4-35a3-4800-9e88-ee68b8039c6e'}, 'RetryAttempts': 0}}
[2024-11-11 03:58:03,303] p537 {1553365550.py:3} INFO - Waiting for query to complete...
[2024-11-11 03:58:09,360] p537 {1553365550.py:19} INFO - Downloading results from data/results/50520700-e02b-4b38-85f4-fb24b7a368d9.csv to local file athena_query_results.csv
[2024-11-11 03:58:09,455] p537 {1553365550.py:26} INFO - Results loaded, DataFrame shape: (884, 17)
[2024-11-11 03:58:09,493] p537 {1649558915.py:39} INFO - Results saved to Query_A3b.csv
[2024-11-11 03:58:09,494] p537 {1649558915.py:40} INFO - Data fetched and committed in 6.2710237

                 author author_flair_css_class author_flair_text  \
0   Spare-Patience-6195                    NaN               NaN   
1  Street_Hedgehog_9595                   cath          Catholic   
2     got_knee_gas_enit                    NaN               NaN   
3  Spiritual-Freedom-71                    NaN               NaN   
4    Infamous-Area-2964                    NaN          DMs open   

                                                body  controversiality  \
0  Oh wow! You might be right. I didn’t think abo...                 0   
1  To win, you need to make it your number one go...                 0   
2  Thanks so much.... that's kind.  Sometimes you...                 0   
3  Thank you so much for your comment and support...                 0   
4  hang yourself n1gger 😛 if that's not working t...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719792014           NaN     NaN       0  lb1ylx2  t3_1dsbt6r

## Sorting Subreddits Relevant to: 

A3c, A3d, A3e, A3f

* “cancer” and government_healthcare_programs = [ "medicare", "medicaid", "children’s health insurance program", "chip", "veterans health administration", "vha", "indian health service", "ihs", "federal employees health benefits program", "fehbp", "affordable care act", "aca", "health insurance marketplace", "public health depart", "local health depart", "national health service corps", "nhsc", "community health centers", "chcs", "national institutes of health", "nih", "nci", "national cancer institute" ] or “trust” (HINTS A3c)

* “cancer” and cancer_charities = [ "american cancer society", "acs", "cancer research institute", "breast cancer research foundation", "bcrf", "leukemia lymphoma society", "lls", "stand up to cancer", "su2c", "susan g. komen for the cure", "st. jude children’s", "national foundation for cancer research", "nfcr", "livestrong", "mesothelioma research foundation", "prostate cancer foundation", "american brain tumor association", "abta", "colon cancer coalition", "the american institute for cancer research", "aicr" ] or “trust” (HINTS A3d)
* “cancer” and charitable_religious_organizations = [ "catholic relief services", "crs", "world vision", "samaritan", "jewish federations of north america", "islamic relief worldwide", "buddhist global relief", "the salvation army", "christian aid", "lutheran world relief", "tzu chi foundation", "care", "cooperative for assistance and relief everywhere", "habitat for humanity", "church world service", "cws", "heifer international" ] or “trust” (HINTS A3e)
* “cancer” and top_cancer_institutes = [ "researcher", "scientist", "physicians", "md anderson cancer center", "memorial sloan kettering cancer center", "msk", "mayo clinic cancer center", "johns hopkins sidney kimmel comprehensive cancer center", "cleveland clinic", "ucla medical center", "massachusetts general hospital cancer center", "duke cancer institute", "stanford cancer institute", "university of california, san francisco medical center", "ucsf", "northwestern medicine feinberg school of medicine", "university of pennsylvania abramson cancer center", "roswell park comprehensive cancer center", "fred hutchinson cancer research center" ] or “trust” (HINTS A3f)


HINTS Questions: 

* CancerTrustGov: A3c. In general, how much would you trust information about cancer from government health agencies?
* CancerTrustCharities: A3d. In general, how much would you trust information about cancer from charitable organizations?
* CancerTrustReligiousOrgs: A3e. In general, how much would you trust information about cancer from religious organizations and leaders?
* CancerTrustScientists: A3f. In general, how much would you trust information about cancer from scientists?

In [13]:
# HINTS A3c: comments containing "cancer" and any government healthcare program (or "trust").
#
# Changes relative to the previous version of this query:
#   * removed a stray double quote in front of "children's health insurance
#     program" that made the pattern unmatchable;
#   * the apostrophe in that term is matched with the LIKE single-character
#     wildcard `_`, so both the straight (') and typographic (’) forms match;
#   * added "chcs", which is in the documented term list but was missing here.
#
# NOTE(review): LIKE '%term%' is a plain substring match, so short acronyms
# such as 'aca', 'nih', 'nci', 'ihs' and 'chip' also hit unrelated words
# (e.g. "academic", "financial"). Consider word-boundary matching if
# precision matters for the downstream analysis.
q = '''
SELECT *
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%'
  AND (
      LOWER("body") LIKE '%medicare%'
      OR LOWER("body") LIKE '%medicaid%'
      OR LOWER("body") LIKE '%children_s health insurance program%'
      OR LOWER("body") LIKE '%chip%'
      OR LOWER("body") LIKE '%veterans health administration%'
      OR LOWER("body") LIKE '%vha%'
      OR LOWER("body") LIKE '%indian health service%'
      OR LOWER("body") LIKE '%ihs%'
      OR LOWER("body") LIKE '%federal employees health benefits program%'
      OR LOWER("body") LIKE '%fehbp%'
      OR LOWER("body") LIKE '%affordable care act%'
      OR LOWER("body") LIKE '%aca%'
      OR LOWER("body") LIKE '%health insurance marketplace%'
      OR LOWER("body") LIKE '%public health depart%'
      OR LOWER("body") LIKE '%local health depart%'
      OR LOWER("body") LIKE '%national health service corps%'
      OR LOWER("body") LIKE '%nhsc%'
      OR LOWER("body") LIKE '%community health centers%'
      OR LOWER("body") LIKE '%chcs%'
      OR LOWER("body") LIKE '%national institutes of health%'
      OR LOWER("body") LIKE '%nih%'
      OR LOWER("body") LIKE '%nci%'
      OR LOWER("body") LIKE '%national cancer institute%'
      OR LOWER("body") LIKE '%trust%'
  )
'''

# Start the query execution; start_time is also read by
# download_and_load_query_results when it logs the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Fetch and load the results
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3c.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-11 19:39:08,804] p124 {269285766.py:45} INFO - {'QueryExecutionId': '3000e576-a987-40d2-aeec-505462905581', 'ResponseMetadata': {'RequestId': '3ea47da2-7210-4e7f-8491-f1dda68b1493', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 19:39:08 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '3ea47da2-7210-4e7f-8491-f1dda68b1493'}, 'RetryAttempts': 0}}
[2024-11-11 19:39:08,805] p124 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-11 19:39:18,855] p124 {272223420.py:17} INFO - Time to complete query: 10.208207130432129s
[2024-11-11 19:39:18,861] p124 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/3000e576-a987-40d2-aeec-505462905581.csv, to local file athena_query_results.csv
[2024-11-11 19:39:18,941] p124 {272223420.py:31} INFO - results dataframe shape is (326, 17)
[2024-11-11 19:39:18,956] p124 {269285766.py:51} INFO - R

              author author_flair_css_class author_flair_text  \
0  Direct_Layer_5233                    NaN               NaN   
1      AutoModerator                    NaN               NaN   
2     fresitachulita                    NaN               NaN   
3       serenerepose                    NaN               NaN   
4    BigRedDoggyDawg                    NaN               NaN   

                                                body  controversiality  \
0  I do for sure! I have for sure cut down on fri...                 0   
1  ^^^^AUTOMOD  ***Thanks for posting! This comme...                 0   
2  It was closer to 4-5 when he could more easily...                 0   
3  Sooooooo, let's talk about that historically!\...                 0   
4  I'll start by saying I upvoted your post. And ...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719812077           NaN     NaN       0  lb35xfy  t3_1drxzq3  t3_1drxzq3   
1 

In [14]:
# HINTS A3d: comments containing "cancer" and any listed cancer charity (or "trust").
#
# Changes relative to the previous version of this query:
#   * removed three government-program terms left over from the A3c cell
#     ("veterans health administration", "ihs", "nhsc"); they are not in the
#     cancer_charities list and pulled unrelated comments into this result;
#   * the apostrophe in "st. jude children's" is matched with the LIKE
#     single-character wildcard `_`, so both the straight (') and
#     typographic (’) forms match.
#
# NOTE(review): LIKE '%term%' is a plain substring match, so short acronyms
# such as 'lls' and 'acs' also hit unrelated words (e.g. "cells", "pills").
# Consider word-boundary matching if precision matters downstream.
q = '''
SELECT *
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%'
  AND (
      LOWER("body") LIKE '%american cancer society%'
      OR LOWER("body") LIKE '%acs%'
      OR LOWER("body") LIKE '%cancer research institute%'
      OR LOWER("body") LIKE '%breast cancer research foundation%'
      OR LOWER("body") LIKE '%bcrf%'
      OR LOWER("body") LIKE '%leukemia lymphoma society%'
      OR LOWER("body") LIKE '%lls%'
      OR LOWER("body") LIKE '%stand up to cancer%'
      OR LOWER("body") LIKE '%su2c%'
      OR LOWER("body") LIKE '%susan g. komen for the cure%'
      OR LOWER("body") LIKE '%st. jude children_s%'
      OR LOWER("body") LIKE '%national foundation for cancer research%'
      OR LOWER("body") LIKE '%nfcr%'
      OR LOWER("body") LIKE '%livestrong%'
      OR LOWER("body") LIKE '%mesothelioma research foundation%'
      OR LOWER("body") LIKE '%prostate cancer foundation%'
      OR LOWER("body") LIKE '%american brain tumor association%'
      OR LOWER("body") LIKE '%abta%'
      OR LOWER("body") LIKE '%colon cancer coalition%'
      OR LOWER("body") LIKE '%the american institute for cancer research%'
      OR LOWER("body") LIKE '%aicr%'
      OR LOWER("body") LIKE '%trust%'
  )
'''

# Start the query execution; start_time is also read by
# download_and_load_query_results when it logs the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Fetch and load the results
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3d.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-11 19:53:22,896] p124 {4134224649.py:47} INFO - {'QueryExecutionId': '565b7bb9-2d39-4311-a904-98d41ee41f75', 'ResponseMetadata': {'RequestId': '3ee0cf80-4ded-4d2c-84d1-ce92dfa93945', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 19:53:22 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '3ee0cf80-4ded-4d2c-84d1-ce92dfa93945'}, 'RetryAttempts': 0}}
[2024-11-11 19:53:22,897] p124 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-11 19:53:32,942] p124 {272223420.py:17} INFO - Time to complete query: 10.201913595199585s
[2024-11-11 19:53:32,948] p124 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/565b7bb9-2d39-4311-a904-98d41ee41f75.csv, to local file athena_query_results.csv
[2024-11-11 19:53:33,024] p124 {272223420.py:31} INFO - results dataframe shape is (353, 17)
[2024-11-11 19:53:33,040] p124 {4134224649.py:53} INFO -

                 author author_flair_css_class author_flair_text  \
0  Street_Hedgehog_9595                   cath          Catholic   
1            wagdog1970                    NaN               NaN   
2     YtterbiusAntimony                    NaN               NaN   
3  Extra_Particular8653                    NaN               NaN   
4          tweetysvoice                    NaN               NaN   

                                                body  controversiality  \
0  To win, you need to make it your number one go...                 0   
1  Ok, now you are just being pedantic because yo...                 0   
2  Uncertainty is scary. Look at conspiracy theor...                 0   
3  Yes I agree the list of side effects from pain...                 0   
4  For an Aquatic Veterinarian, It’s Never ‘Just ...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719792034           NaN     NaN       0  lb1ynpf  t3_1dseiwd

In [15]:
# HINTS A3e: comments containing "cancer" and any listed charitable religious
# organization (or "trust").
#
# Changes relative to the previous version of this query:
#   * removed a stray double quote in front of "tzu chi foundation" that made
#     the pattern unmatchable;
#   * removed "susan g. komen for the cure", a cancer-charity term left over
#     from the A3d cell that is not in the religious-organization list.
#
# NOTE(review): LIKE '%term%' is a plain substring match. '%care%' (the CARE
# acronym) therefore matches any body containing "care", "healthcare",
# "career", etc., which likely dominates this result set; 'crs' and 'cws'
# are similarly loose. Consider word-boundary matching or dropping the bare
# acronym if precision matters downstream.
q = '''
SELECT *
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%'
  AND (
      LOWER("body") LIKE '%catholic relief services%'
      OR LOWER("body") LIKE '%crs%'
      OR LOWER("body") LIKE '%world vision%'
      OR LOWER("body") LIKE '%samaritan%'
      OR LOWER("body") LIKE '%jewish federations of north america%'
      OR LOWER("body") LIKE '%islamic relief worldwide%'
      OR LOWER("body") LIKE '%buddhist global relief%'
      OR LOWER("body") LIKE '%the salvation army%'
      OR LOWER("body") LIKE '%christian aid%'
      OR LOWER("body") LIKE '%lutheran world relief%'
      OR LOWER("body") LIKE '%tzu chi foundation%'
      OR LOWER("body") LIKE '%care%'
      OR LOWER("body") LIKE '%cooperative for assistance and relief everywhere%'
      OR LOWER("body") LIKE '%habitat for humanity%'
      OR LOWER("body") LIKE '%church world service%'
      OR LOWER("body") LIKE '%cws%'
      OR LOWER("body") LIKE '%heifer international%'
      OR LOWER("body") LIKE '%trust%'
  )
'''

# Start the query execution; start_time is also read by
# download_and_load_query_results when it logs the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Fetch and load the results
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3e.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-11 20:03:39,832] p124 {129923019.py:41} INFO - {'QueryExecutionId': '66c13367-6492-4d8d-9a52-87fa592adff4', 'ResponseMetadata': {'RequestId': '7eb9eb4d-daf6-46ec-828f-87ad3d242c8e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 20:03:39 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '7eb9eb4d-daf6-46ec-828f-87ad3d242c8e'}, 'RetryAttempts': 0}}
[2024-11-11 20:03:39,833] p124 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-11 20:03:52,571] p124 {272223420.py:17} INFO - Time to complete query: 12.883120059967041s
[2024-11-11 20:03:52,578] p124 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/66c13367-6492-4d8d-9a52-87fa592adff4.csv, to local file athena_query_results.csv
[2024-11-11 20:03:52,691] p124 {272223420.py:31} INFO - results dataframe shape is (481, 17)
[2024-11-11 20:03:52,711] p124 {129923019.py:47} INFO - R

                 author author_flair_css_class author_flair_text  \
0         Particular507                    NaN               NaN   
1  Serious_Concert_1520                    NaN               NaN   
2        JoylsNotatrick                    NaN               NaN   
3    Ill_Connection1631                    NaN               NaN   
4           Snapper1916                    NaN               NaN   

                                                body  controversiality  \
0  * I understand she doesn't know exactly what h...                 0   
1  Then you’re pretty uneducated because the rest...                 0   
2  Highly suggestive malignancy is scary as hell....                 0   
3  I don’t know who the bigger coward is? The gro...                 0   
4  Thank you for this response. I relate to every...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719801072           NaN     NaN       0  lb2kbe5  t3_1drm1bw

“cancer” and top_cancer_institutes = [ "researcher", "scientist", "physicians", "md anderson cancer center", "memorial sloan kettering cancer center", "msk", "mayo clinic cancer center", "johns hopkins sidney kimmel comprehensive cancer center", "cleveland clinic", "ucla medical center", "massachusetts general hospital cancer center", "duke cancer institute", "stanford cancer institute", "university of california, san francisco medical center", "ucsf", "northwestern medicine feinberg school of medicine", "university of pennsylvania abramson cancer center", "roswell park comprehensive cancer center", "fred hutchinson cancer research center" ] or “trust” (HINTS A3f)

In [16]:
# HINTS A3f: comments containing "cancer" and any listed scientist/institute
# term (or "trust").
#
# Change relative to the previous version of this query: the documented term
# "university of california, san francisco medical center" is a single entry,
# but it had been split at the comma into two much broader patterns
# ('%university of california%' and '%san francisco medical center%').
# It is now matched as one phrase; the "ucsf" acronym is still matched
# separately.
q = '''
SELECT *
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%'
  AND (
      LOWER("body") LIKE '%researcher%'
      OR LOWER("body") LIKE '%scientist%'
      OR LOWER("body") LIKE '%physicians%'
      OR LOWER("body") LIKE '%md anderson cancer center%'
      OR LOWER("body") LIKE '%memorial sloan kettering cancer center%'
      OR LOWER("body") LIKE '%msk%'
      OR LOWER("body") LIKE '%mayo clinic cancer center%'
      OR LOWER("body") LIKE '%johns hopkins sidney kimmel comprehensive cancer center%'
      OR LOWER("body") LIKE '%cleveland clinic%'
      OR LOWER("body") LIKE '%ucla medical center%'
      OR LOWER("body") LIKE '%massachusetts general hospital cancer center%'
      OR LOWER("body") LIKE '%duke cancer institute%'
      OR LOWER("body") LIKE '%stanford cancer institute%'
      OR LOWER("body") LIKE '%university of california, san francisco medical center%'
      OR LOWER("body") LIKE '%ucsf%'
      OR LOWER("body") LIKE '%northwestern medicine feinberg school of medicine%'
      OR LOWER("body") LIKE '%university of pennsylvania abramson cancer center%'
      OR LOWER("body") LIKE '%roswell park comprehensive cancer center%'
      OR LOWER("body") LIKE '%fred hutchinson cancer research center%'
      OR LOWER("body") LIKE '%trust%'
  )
'''

# Start the query execution; start_time is also read by
# download_and_load_query_results when it logs the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Fetch and load the results
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3f.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-11 20:12:54,380] p124 {3663637816.py:43} INFO - {'QueryExecutionId': 'e2aca06e-8170-43e0-999e-692161044895', 'ResponseMetadata': {'RequestId': '4a60bc5c-17f8-441f-bb77-f1fac5c5e39d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 20:12:54 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '4a60bc5c-17f8-441f-bb77-f1fac5c5e39d'}, 'RetryAttempts': 0}}
[2024-11-11 20:12:54,381] p124 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-11 20:13:05,407] p124 {272223420.py:17} INFO - Time to complete query: 11.177748441696167s
[2024-11-11 20:13:05,413] p124 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/e2aca06e-8170-43e0-999e-692161044895.csv, to local file athena_query_results.csv
[2024-11-11 20:13:05,504] p124 {272223420.py:31} INFO - results dataframe shape is (119, 17)
[2024-11-11 20:13:05,511] p124 {3663637816.py:49} INFO -

                 author author_flair_css_class author_flair_text  \
0   Green_DREAM-lizards                    NaN               NaN   
1      Low-Addendum9282                    NaN               NaN   
2  Reinmaindiewithglory                    NaN               NaN   
3        rubyslippers3x                    NaN               NaN   
4           Cr4zy5ant0s                    NaN               NaN   

                                                body  controversiality  \
0  No. It is your opinion.  Vegans NEED to supple...                 0   
1  Science has long been our go-to for unraveling...                 0   
2  I set aside half in case of taxes then I move ...                 0   
3  I did IV, no port. I used ice bags on my hands...                 0   
4  There's likely medical explanation for thst as...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719801632           NaN     NaN       0  lb2llds  t3_1dppdgz