# First Queries


In [34]:
# Required libraries
import boto3
import pandas as pd
import logging
import time

In [35]:
# Root logging: INFO and above, each record stamped with time, process id,
# source file:line and level. The module-level logger below is the one the
# rest of the notebook writes to.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

In [36]:
# --- Athena settings ---
AWS_REGION = "us-east-1"
SCHEMA_NAME = "schema_name"

# --- S3 settings ---
# Fill in your net id here; the bucket name is derived from it.
netid = "ek976"
S3_BUCKET_NAME = f"athena-{netid}"

# Key prefix (inside the bucket) and the full s3:// URI built from it.
S3_STAGING_PREFIX = "data/results"
S3_STAGING_DIR = f"s3://{S3_BUCKET_NAME}/{S3_STAGING_PREFIX}/"

S3_OUTPUT_DIRECTORY = "data"

In [37]:
# Athena client bound to the region chosen in the configuration cell
athena_client = boto3.client(service_name="athena", region_name=AWS_REGION)

In [38]:
def download_and_load_query_results(
    client: boto3.client,
    query_response: dict,
    poll_interval: float = 0.5,
) -> pd.DataFrame:
    """Wait for an Athena query to finish, download its CSV result, and load it.

    The query's status is polled with ``get_query_execution`` until it reaches
    a terminal state. On success the result object
    ``<S3_STAGING_PREFIX>/<QueryExecutionId>.csv`` is downloaded from
    ``S3_BUCKET_NAME`` to the local file ``athena_query_results.csv`` (which is
    overwritten on every call) and parsed with pandas.

    Args:
        client: Athena client used to poll the query status.
        query_response: Response dict from ``start_query_execution``; only its
            ``"QueryExecutionId"`` key is read.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        DataFrame parsed from the downloaded result CSV.

    Raises:
        RuntimeError: If the query ends in the FAILED or CANCELLED state.
    """
    query_id = query_response["QueryExecutionId"]

    logger.info("Waiting for query to complete...")
    while True:
        # Ask Athena for the query state directly instead of requesting result
        # rows and pattern-matching the error text of a "not finished" failure.
        status = client.get_query_execution(QueryExecutionId=query_id)["QueryExecution"]["Status"]
        state = status["State"]
        if state == "SUCCEEDED":
            break
        if state in ("FAILED", "CANCELLED"):
            reason = status.get("StateChangeReason", "no reason reported")
            raise RuntimeError(f"Athena query {query_id} ended in state {state}: {reason}")
        # Still QUEUED or RUNNING: wait before checking again
        time.sleep(poll_interval)

    # Set up the S3 client to download the results
    s3_client = boto3.client("s3", region_name=AWS_REGION)
    temp_file_location = "athena_query_results.csv"
    # Build the key from the shared config constant so it cannot drift from
    # the staging location.
    # NOTE(review): assumes the query was started with
    # OutputLocation=S3_STAGING_DIR -- confirm for any new caller.
    s3_path = f"{S3_STAGING_PREFIX}/{query_id}.csv"
    logger.info(f"Downloading results from {s3_path} to local file {temp_file_location}")

    # Download the file from S3
    s3_client.download_file(S3_BUCKET_NAME, s3_path, temp_file_location)

    # Load CSV into a DataFrame and return it
    df = pd.read_csv(temp_file_location)
    logger.info(f"Results loaded, DataFrame shape: {df.shape}")
    return df

In [39]:
# Start a small sample query here so this cell is self-contained: no cell
# above defines `response`, so on a fresh kernel ("Restart Kernel -> Run All")
# reading it would raise a NameError.
# NOTE(review): the original sample query is not present in the notebook;
# LIMIT 10 was chosen to match the recorded result shape of (10, 17) -- confirm.
sample_query = '''
SELECT *
FROM "AwsDataCatalog"."a05"."a05"
LIMIT 10
'''
response = athena_client.start_query_execution(
    QueryString=sample_query,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# call the function and assign the DataFrame to df
df = download_and_load_query_results(athena_client, response)

# print the first few rows of df
print(df.head())

[2024-11-11 03:34:06,479] p537 {1553365550.py:3} INFO - Waiting for query to complete...
[2024-11-11 03:34:06,677] p537 {1553365550.py:19} INFO - Downloading results from data/results/9ca7e0c0-6f88-4e41-a324-616e72ac8266.csv to local file athena_query_results.csv
[2024-11-11 03:34:06,778] p537 {1553365550.py:26} INFO - Results loaded, DataFrame shape: (10, 17)


              author author_flair_css_class author_flair_text  \
0         Jozimastar                    NaN               NaN   
1        SameCommon3                    NaN               NaN   
2    franklyimstoned                    NaN               NaN   
3        Warmstar219                    NaN               NaN   
4  Massive_Being6115                    NaN               NaN   

                                                body  controversiality  \
0                                      Talebi delamo                 0   
1  True. He is probably moving the XRT basket. Or...                 0   
2  Yea technically. If you want a balanced lifest...                 0   
3                Checks and balances no longer exist                 0   
4  Você pode achar pra compra no eBay ou na Amazo...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719834838           NaN     NaN       0  lb44ayh  t3_1dsnfn3  t1_lb444pf   
1 

In [44]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   author                  10 non-null     object 
 1   author_flair_css_class  1 non-null      object 
 2   author_flair_text       2 non-null      object 
 3   body                    10 non-null     object 
 4   controversiality        10 non-null     int64  
 5   created_utc             10 non-null     int64  
 6   distinguished           1 non-null      object 
 7   edited                  0 non-null      float64
 8   gilded                  10 non-null     int64  
 9   id                      10 non-null     object 
 10  link_id                 10 non-null     object 
 11  parent_id               10 non-null     object 
 12  retrieved_on            10 non-null     int64  
 13  score                   10 non-null     int64  
 14  stickied                10 non-null     bool 

In [51]:
# Fixed random_state so the same five rows are shown on every re-run.
print(df.sample(5, random_state=42))

                 author author_flair_css_class author_flair_text  \
3           Warmstar219                    NaN               NaN   
9  BullShit-AdminReddit                    NaN               NaN   
1           SameCommon3                    NaN               NaN   
2       franklyimstoned                    NaN               NaN   
5               bomiyeo                    NaN           nan’s 🍑   

                                                body  controversiality  \
3                Checks and balances no longer exist                 0   
9  Tới có Chấm điểm công dân như tàu+ thì dân vịt...                 0   
1  True. He is probably moving the XRT basket. Or...                 0   
2  Yea technically. If you want a balanced lifest...                 0   
5  I could see him as a finalist or someone who g...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
3   1719849333           NaN     NaN       0  lb55sy8  t3_1dsueha

In [52]:
# Number of missing values in each column
null_counts = df.isnull().sum()
print(null_counts)

author                     0
author_flair_css_class     9
author_flair_text          8
body                       0
controversiality           0
created_utc                0
distinguished              9
edited                    10
gilded                     0
id                         0
link_id                    0
parent_id                  0
retrieved_on               0
score                      0
stickied                   0
subreddit                  0
subreddit_id               0
dtype: int64


In [53]:
# Summary statistics as produced by DataFrame.describe()
numeric_summary = df.describe()
print(numeric_summary)

       controversiality   created_utc  edited  gilded  retrieved_on      score
count              10.0  1.000000e+01     0.0    10.0  1.000000e+01  10.000000
mean                0.0  1.719831e+09     NaN     0.0  1.719831e+09   3.000000
std                 0.0  1.441007e+04     NaN     0.0  1.440960e+04   2.981424
min                 0.0  1.719796e+09     NaN     0.0  1.719796e+09   1.000000
25%                 0.0  1.719829e+09     NaN     0.0  1.719829e+09   1.000000
50%                 0.0  1.719834e+09     NaN     0.0  1.719834e+09   1.000000
75%                 0.0  1.719838e+09     NaN     0.0  1.719838e+09   4.500000
max                 0.0  1.719849e+09     NaN     0.0  1.719849e+09   9.000000


In [54]:
# How many rows each subreddit contributes
subreddit_counts = df['subreddit'].value_counts()
print(subreddit_counts)

subreddit
AnarchyChess         1
DeepFuckingValue     1
2007scape            1
scotus               1
StardewValleyBR      1
MasterchefAU         1
GoldCoast            1
Steam                1
WifeWantstoPlay      1
TroChuyenLinhTinh    1
Name: count, dtype: int64


In [55]:
# Column labels of the result set
column_index = df.columns
print(column_index)

Index(['author', 'author_flair_css_class', 'author_flair_text', 'body',
       'controversiality', 'created_utc', 'distinguished', 'edited', 'gilded',
       'id', 'link_id', 'parent_id', 'retrieved_on', 'score', 'stickied',
       'subreddit', 'subreddit_id'],
      dtype='object')


## Sorting Subreddits Relevant to: 

(A1, B3a, B13), A2b, A3a, A3b

* (“Frustrating” or any other word containing “frustrat”) and “cancer” (HINTS A2b)
* “cancer” and (“doctors” or “trust”) — a comment does not “need” to contain “trust”, because trust is also captured by the NRC sentiment analysis (HINTS A3a)
* “cancer” and (“family” or “friends” or “sister” or “brother” or “mother” or “mom” or “father” or “cousin” or “aunt” or “uncle” or “trust”) (HINTS A3b)

HINTS Questions: 

* SeekCancerInfo: A1 | Have you ever looked for information about cancer from any source?
* Electronic2_HealthInfo: B3a | In the past 12 months have you used the Internet to look for health or medical information?
* MisleadingHealthInfo: B13 | How much of the health information that you see on social media do you think is false or misleading?


In [62]:
# Athena query: comments whose body mentions both "frustrat" and "cancer"
q = '''
SELECT * 
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%frustrat%' AND LOWER("body") LIKE '%cancer%'
'''
output_file = "Query_A2b.csv"

# Where Athena writes its result file, and the encryption option for it
result_configuration = {
    "OutputLocation": S3_STAGING_DIR,
    "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
}

# Submit the query; timing starts immediately before submission
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration=result_configuration,
)
logger.info(response)

# Wait for the query and pull its result set into pandas
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Keep a local CSV copy of the matching comments
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
elapsed_seconds = time.time() - start_time
logger.info(f"Data fetched and committed in {elapsed_seconds}s")

# Quick look at the first rows
print(df_filtered_comments.head())


[2024-11-11 03:57:08,239] p537 {1841568608.py:20} INFO - {'QueryExecutionId': '3e51bd5f-850a-4666-950a-29d8082cb6da', 'ResponseMetadata': {'RequestId': '804b270c-cba2-4e3a-a045-6b881f95fb36', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 03:57:08 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '804b270c-cba2-4e3a-a045-6b881f95fb36'}, 'RetryAttempts': 0}}
[2024-11-11 03:57:08,240] p537 {1553365550.py:3} INFO - Waiting for query to complete...
[2024-11-11 03:57:14,034] p537 {1553365550.py:19} INFO - Downloading results from data/results/3e51bd5f-850a-4666-950a-29d8082cb6da.csv to local file athena_query_results.csv
[2024-11-11 03:57:14,121] p537 {1553365550.py:26} INFO - Results loaded, DataFrame shape: (25, 17)
[2024-11-11 03:57:14,127] p537 {1841568608.py:26} INFO - Results saved to Query_A2b.csv
[2024-11-11 03:57:14,128] p537 {1841568608.py:27} INFO - Data fetched and committed in 5.97528386

              author author_flair_css_class author_flair_text  \
0    No-Occasion5142                    NaN               NaN   
1  therealstevielong                    NaN               NaN   
2         Bored_AF18                    NaN               NaN   
3          zombieus1                    NaN               NaN   
4        MzOpinion8d         verified-nurse  Registered Nurse   

                                                body  controversiality  \
0  I have the same thing lol majority of my frien...                 0   
1  brother the world is unfair. some people are b...                 0   
2  thank you for pointing out how it’s misunderst...                 0   
3  You are not alone... I am a 22y/o and I had my...                 0   
4  Yeah, could be a stomach bug, could be cancer ...                 0   

   created_utc  distinguished  edited  gilded       id     link_id  \
0   1719793781            NaN     NaN       0  lb22v68  t3_1dsfd3r   
1   1719798591          

In [63]:
# Athena query: comments mentioning "cancer" together with "doctors" or "trust"
q = '''
SELECT * 
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (LOWER("body") LIKE '%doctors%' OR LOWER("body") LIKE '%trust%')
'''
output_file = "Query_A3a.csv"

# Where Athena writes its result file, and the encryption option for it
result_configuration = {
    "OutputLocation": S3_STAGING_DIR,
    "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
}

# Submit the query; timing starts immediately before submission
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration=result_configuration,
)
logger.info(response)

# Wait for the query and pull its result set into pandas
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Keep a local CSV copy of the matching comments
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
elapsed_seconds = time.time() - start_time
logger.info(f"Data fetched and committed in {elapsed_seconds}s")

# Quick look at the first rows
print(df_filtered_comments.head())


[2024-11-11 03:57:45,880] p537 {1344576528.py:21} INFO - {'QueryExecutionId': '51a02f1a-c5dd-48d3-924f-db59f721656d', 'ResponseMetadata': {'RequestId': 'f596f2b4-84b7-4655-b0c8-13fc3521f063', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 03:57:45 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': 'f596f2b4-84b7-4655-b0c8-13fc3521f063'}, 'RetryAttempts': 0}}
[2024-11-11 03:57:45,881] p537 {1553365550.py:3} INFO - Waiting for query to complete...
[2024-11-11 03:57:52,316] p537 {1553365550.py:19} INFO - Downloading results from data/results/51a02f1a-c5dd-48d3-924f-db59f721656d.csv to local file athena_query_results.csv
[2024-11-11 03:57:52,404] p537 {1553365550.py:26} INFO - Results loaded, DataFrame shape: (184, 17)
[2024-11-11 03:57:52,415] p537 {1344576528.py:27} INFO - Results saved to Query_A3a.csv
[2024-11-11 03:57:52,415] p537 {1344576528.py:28} INFO - Data fetched and committed in 6.6717264

                 author author_flair_css_class author_flair_text  \
0         tropical_moss                    NaN               NaN   
1         AutoModerator                    NaN               NaN   
2   Green_DREAM-lizards                    NaN               NaN   
3      Low-Addendum9282                    NaN               NaN   
4  Reinmaindiewithglory                    NaN               NaN   

                                                body  controversiality  \
0  This pisses me off so much. I’m a stage IV can...                 0   
1  We're sorry to hear that you need to visit thi...                 0   
2  No. It is your opinion.  Vegans NEED to supple...                 0   
3  Science has long been our go-to for unraveling...                 0   
4  I set aside half in case of taxes then I move ...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719801439           NaN     NaN       0  lb2l5k0  t3_1drvfz5

In [64]:
# Athena query: comments mentioning "cancer" together with at least one of the
# family/friend terms below, or "trust"
q = '''
SELECT * 
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%family%' 
      OR LOWER("body") LIKE '%friends%' 
      OR LOWER("body") LIKE '%sister%' 
      OR LOWER("body") LIKE '%brother%' 
      OR LOWER("body") LIKE '%mother%' 
      OR LOWER("body") LIKE '%mom%' 
      OR LOWER("body") LIKE '%father%' 
      OR LOWER("body") LIKE '%cousin%' 
      OR LOWER("body") LIKE '%aunt%' 
      OR LOWER("body") LIKE '%uncle%' 
      OR LOWER("body") LIKE '%trust%'
  )
'''
output_file = "Query_A3b.csv"

# Where Athena writes its result file, and the encryption option for it
result_configuration = {
    "OutputLocation": S3_STAGING_DIR,
    "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
}

# Submit the query; timing starts immediately before submission
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration=result_configuration,
)
logger.info(response)

# Wait for the query and pull its result set into pandas
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Keep a local CSV copy of the matching comments
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
elapsed_seconds = time.time() - start_time
logger.info(f"Data fetched and committed in {elapsed_seconds}s")

# Quick look at the first rows
print(df_filtered_comments.head())


[2024-11-11 03:58:03,302] p537 {1649558915.py:33} INFO - {'QueryExecutionId': '50520700-e02b-4b38-85f4-fb24b7a368d9', 'ResponseMetadata': {'RequestId': '84624bc4-35a3-4800-9e88-ee68b8039c6e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 11 Nov 2024 03:58:03 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '84624bc4-35a3-4800-9e88-ee68b8039c6e'}, 'RetryAttempts': 0}}
[2024-11-11 03:58:03,303] p537 {1553365550.py:3} INFO - Waiting for query to complete...
[2024-11-11 03:58:09,360] p537 {1553365550.py:19} INFO - Downloading results from data/results/50520700-e02b-4b38-85f4-fb24b7a368d9.csv to local file athena_query_results.csv
[2024-11-11 03:58:09,455] p537 {1553365550.py:26} INFO - Results loaded, DataFrame shape: (884, 17)
[2024-11-11 03:58:09,493] p537 {1649558915.py:39} INFO - Results saved to Query_A3b.csv
[2024-11-11 03:58:09,494] p537 {1649558915.py:40} INFO - Data fetched and committed in 6.2710237

                 author author_flair_css_class author_flair_text  \
0   Spare-Patience-6195                    NaN               NaN   
1  Street_Hedgehog_9595                   cath          Catholic   
2     got_knee_gas_enit                    NaN               NaN   
3  Spiritual-Freedom-71                    NaN               NaN   
4    Infamous-Area-2964                    NaN          DMs open   

                                                body  controversiality  \
0  Oh wow! You might be right. I didn’t think abo...                 0   
1  To win, you need to make it your number one go...                 0   
2  Thanks so much.... that's kind.  Sometimes you...                 0   
3  Thank you so much for your comment and support...                 0   
4  hang yourself n1gger 😛 if that's not working t...                 0   

   created_utc distinguished  edited  gilded       id     link_id   parent_id  \
0   1719792014           NaN     NaN       0  lb1ylx2  t3_1dsbt6r