In [20]:
import gzip
import json

import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm  # for monitoring progress

In [3]:
# Open the parquet file lazily: rows are streamed in record batches instead of
# being loaded into memory all at once.

parquet_file = pq.ParquetFile('../data/reddit.parquet')

# Pull only the first batch for a quick look at the data. next() with a default
# replaces the for/break idiom and fails loudly on a file with no batches,
# instead of silently leaving `batch_df` undefined (or stale from an earlier
# kernel session).
batch = next(parquet_file.iter_batches(), None)
if batch is None:
    raise ValueError("'../data/reddit.parquet' contains no record batches")
batch_df = batch.to_pandas()

In [4]:
# preview the first rows of the first batch
batch_df.head()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission
0,submission,2023-04-02T13:58:03,129sqka,t3_129sqka,1680458283,MoodyStarGirl,That's it.,/r/starbucks/comments/129sqka/hot_chai_lattes_...,Hot chai lattes shouldn't have water,https://www.reddit.com/r/starbucks/comments/12...,starbucks,,,
1,comment,2023-04-02T14:32:57,jeounwc,t1_jeounwc,1680460377,Lost_Treat_6296,We should make the chai tea latte with the sam...,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t3_129sqka,129sqka
2,comment,2023-04-02T14:48:18,jeowus2,t1_jeowus2,1680461298,MoodyStarGirl,Oh like using the chai tea bags?,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeounwc,129sqka
3,comment,2023-04-02T14:48:49,jeowxe5,t1_jeowxe5,1680461329,Lost_Treat_6296,"No, the whole half water and half milk thing",/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowus2,129sqka
4,comment,2023-04-02T21:59:22,jeqiuw3,t1_jeqiuw3,1680487162,MoodyStarGirl,That's a lot of water :(,/r/starbucks/comments/129sqka/hot_chai_lattes_...,,,starbucks,t3_129sqka,t1_jeowxe5,129sqka


In [5]:
# column names present in the first batch
batch_df.columns

Index(['aware_post_type', 'aware_created_ts', 'reddit_id', 'reddit_name',
       'reddit_created_utc', 'reddit_author', 'reddit_text',
       'reddit_permalink', 'reddit_title', 'reddit_url', 'reddit_subreddit',
       'reddit_link_id', 'reddit_parent_id', 'reddit_submission'],
      dtype='object')

In [7]:
# total number of entries (rows) in the parquet file (there are 5,528,298 entries)
# The row count is stored in the parquet footer metadata, so it is read directly
# instead of decoding every batch just to count its rows.

counter = parquet_file.metadata.num_rows

print(counter)

5528298


In [8]:
# inspect the type of the objects yielded by iter_batches()
type(batch)

pyarrow.lib.RecordBatch

In [11]:
# identifying all the subreddits in the dataset
# Only the 'reddit_subreddit' column is needed, so the other columns are never
# decoded or converted to pandas.

subreddits = set()

for batch in parquet_file.iter_batches(columns=["reddit_subreddit"]):
    pd_batch = batch.to_pandas()
    # update() grows the set in place instead of building a new set per batch
    subreddits.update(pd_batch["reddit_subreddit"])

In [13]:
# number of distinct subreddits collected above (34 in this dataset)
len(subreddits)

34

In [16]:
# display every subreddit name that was found
subreddits

{'BestBuyWorkers',
 'Bestbuy',
 'CVS',
 'Chase',
 'DisneyWorld',
 'Disneyland',
 'DollarTree',
 'FedEmployees',
 'Fedexers',
 'GameStop',
 'GeneralMotors',
 'KrakenSupport',
 'Lowes',
 'McDonaldsEmployees',
 'McLounge',
 'Panera',
 'PaneraEmployees',
 'RiteAid',
 'TalesFromYourBank',
 'Target',
 'TjMaxx',
 'UPSers',
 'WalmartEmployees',
 'WaltDisneyWorld',
 'cabincrewcareers',
 'cybersecurity',
 'disney',
 'fidelityinvestments',
 'nursing',
 'starbucks',
 'starbucksbaristas',
 'sysadmin',
 'walmart',
 'wholefoods'}

In [None]:
# Creating a separate gzip-compressed JSON file for each individual subreddit.
# Files are written to the current working directory as '<subreddit>.json.gz'.
# The parquet file is re-scanned once per subreddit so that only one subreddit's
# rows are held in memory at a time.
# counts[i] is the number of rows of the i-th subreddit, in the iteration order
# of `subreddits`.
counts = []

for subreddit in tqdm(subreddits):
    # Gather the matching slice of every batch and concatenate once at the end;
    # concatenating inside the loop re-copies all accumulated rows on each batch.
    pieces = []
    for batch in parquet_file.iter_batches():
        pd_batch = batch.to_pandas()
        pieces.append(pd_batch[pd_batch["reddit_subreddit"] == subreddit])
    # pd.concat() raises on an empty list, so fall back to an empty frame
    df = pd.concat(pieces) if pieces else pd.DataFrame()
    counts.append(len(df))
    df.to_json(subreddit+'.json.gz', orient='records', compression='gzip')

 32%|█████████████▉                             | 11/34 [01:49<04:24, 11.52s/it]

In [17]:
# let's take a look at the BestBuyWorkers subreddit
# The export cell writes gzip-compressed '<subreddit>.json.gz' files into the
# working directory, so read that artifact; a plain '../data/BestBuyWorkers.json'
# is not produced anywhere in this notebook.

with gzip.open('BestBuyWorkers.json.gz', 'rt', encoding='utf-8') as f:
    BestBuyWorkers = json.load(f)
pd.DataFrame(BestBuyWorkers).head()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission
0,submission,2023-04-16T17:32:45,12opsul,t3_12opsul,1681680765,utaustinresearch,,/r/BestBuyWorkers/comments/12opsul/research_st...,Research Study Recruitment - Managers,https://i.redd.it/nrmkf51rebua1.png,BestBuyWorkers,,,
1,comment,2023-04-16T18:48:06,jgjgy9e,t1_jgjgy9e,1681685286,Not_A_Real_Boy69,![gif](giphy|YmQLj2KxaNz58g7Ofg)\n\n$50?,/r/BestBuyWorkers/comments/12opsul/research_st...,,,BestBuyWorkers,t3_12opsul,t3_12opsul,12opsul
2,comment,2023-04-16T19:55:39,jgjpqmp,t1_jgjpqmp,1681689339,GSAgentsLivesMatter,bullshit on getting $50 its just a coupon to B...,/r/BestBuyWorkers/comments/12opsul/research_st...,,,BestBuyWorkers,t3_12opsul,t3_12opsul,12opsul
3,submission,2023-04-14T11:18:57,12m0ozl,t3_12m0ozl,1681485537,,Good luck y’all,/r/BestBuyWorkers/comments/12m0ozl/sr_role_eli...,Sr role eliminated,https://www.reddit.com/r/BestBuyWorkers/commen...,BestBuyWorkers,,,
4,comment,2023-04-14T14:52:47,jg9its6,t1_jg9its6,1681498367,JiminyWillikerz,I was there almost 5 years before my position ...,/r/BestBuyWorkers/comments/12m0ozl/sr_role_eli...,,,BestBuyWorkers,t3_12m0ozl,t3_12m0ozl,12m0ozl


In [18]:
# let's take a look at the RiteAid subreddit
# The export cell writes gzip-compressed '<subreddit>.json.gz' files into the
# working directory, so read that artifact; a plain '../data/RiteAid.json'
# is not produced anywhere in this notebook.

with gzip.open('RiteAid.json.gz', 'rt', encoding='utf-8') as f:
    RiteAid = json.load(f)
pd.DataFrame(RiteAid).head()

Unnamed: 0,aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission
0,submission,2023-04-07T00:22:46,12e9kvk,t3_12e9kvk,1680841366,CaptThrowaway1,"Hi all, I was hoping you could help me underst...",/r/RiteAid/comments/12e9kvk/rx_refill_process_...,Rx Refill Process question,https://www.reddit.com/r/RiteAid/comments/12e9...,RiteAid,,,
1,comment,2023-04-07T07:28:51,jfazqv2,t1_jfazqv2,1680866931,thedukeofwhalez,"Couple of different ways to answer this, based...",/r/RiteAid/comments/12e9kvk/rx_refill_process_...,,,RiteAid,t3_12e9kvk,t3_12e9kvk,12e9kvk
2,comment,2023-04-07T07:55:24,jfb2ac9,t1_jfb2ac9,1680868524,CaptThrowaway1,"Thank you, this is very helpful",/r/RiteAid/comments/12e9kvk/rx_refill_process_...,,,RiteAid,t3_12e9kvk,t1_jfazqv2,12e9kvk
3,comment,2023-04-10T19:01:39,jfrb56b,t1_jfrb56b,1681167699,StorytellingGiant,Thank you so much for your reply! One point of...,/r/RiteAid/comments/12e9kvk/rx_refill_process_...,,,RiteAid,t3_12e9kvk,t1_jfazqv2,12e9kvk
4,submission,2023-04-06T16:58:23,12dxrz0,t3_12dxrz0,1680814703,AlternativeAfter,"Was at my local Rite Aid in Tinton Falls, New ...",/r/RiteAid/comments/12dxrz0/waiting_for_corpor...,Waiting for corporate to call.,https://www.reddit.com/r/RiteAid/comments/12dx...,RiteAid,,,


In [47]:
# total number of rows in each subreddit, sorted from smallest to largest
# `counts` was appended to while iterating over `subreddits`, so the two are
# paired by position. zip() silently truncates to the shorter input, which would
# hide an interrupted export run, so the lengths are checked first.
if len(counts) != len(subreddits):
    raise ValueError(
        f"expected {len(subreddits)} counts but found {len(counts)}; "
        "re-run the export cell to completion first"
    )
reddit_count = dict(zip(subreddits, counts))
dict(sorted(reddit_count.items(), key=lambda item: item[1]))

{'FedEmployees': 280,
 'PaneraEmployees': 2694,
 'RiteAid': 3970,
 'BestBuyWorkers': 5629,
 'WalmartEmployees': 10752,
 'KrakenSupport': 14533,
 'Chase': 16931,
 'cabincrewcareers': 23408,
 'TalesFromYourBank': 28444,
 'GeneralMotors': 37277,
 'McLounge': 38627,
 'disney': 43954,
 'TjMaxx': 46286,
 'DollarTree': 59745,
 'DisneyWorld': 65549,
 'Panera': 79436,
 'wholefoods': 82052,
 'Bestbuy': 121077,
 'fidelityinvestments': 129423,
 'starbucksbaristas': 132019,
 'GameStop': 137071,
 'Fedexers': 154572,
 'cybersecurity': 161868,
 'McDonaldsEmployees': 174679,
 'CVS': 179598,
 'Lowes': 198805,
 'Disneyland': 231981,
 'UPSers': 262483,
 'Target': 340401,
 'WaltDisneyWorld': 373138,
 'starbucks': 393597,
 'sysadmin': 557558,
 'walmart': 630962,
 'nursing': 789499}