In [1]:
import os
import polars as pl
import pyarrow as pa
from typing import List

# Directory that generated subset parquet files are written to. The trailing
# slash matters: generate_subset_parquet builds output paths by concatenating
# this string with a filename.
TEMP_DATA_DIR = "../temp_data/"
# The complete Reddit dataset, read once at notebook start-up. Later cells
# rely on it having a "reddit_subreddit" column.
RAW_DATA = pl.read_parquet("../reddit_data/reddit.parquet")

In [2]:
# Temporarily raise the table row limit so every subreddit's row count is
# printed. Using pl.Config as a context manager restores the previous display
# settings when the block exits, instead of hard-coding a reset value, and it
# leaves no trailing expression for the notebook to echo as cell output.
with pl.Config(tbl_rows=40):
    print(RAW_DATA["reddit_subreddit"].value_counts())

shape: (34, 2)
┌─────────────────────┬────────┐
│ reddit_subreddit    ┆ count  │
│ ---                 ┆ ---    │
│ str                 ┆ u32    │
╞═════════════════════╪════════╡
│ Disneyland          ┆ 231981 │
│ Panera              ┆ 79436  │
│ wholefoods          ┆ 82052  │
│ GeneralMotors       ┆ 37277  │
│ disney              ┆ 43954  │
│ DollarTree          ┆ 59745  │
│ cybersecurity       ┆ 161868 │
│ WalmartEmployees    ┆ 10752  │
│ Lowes               ┆ 198805 │
│ Target              ┆ 340401 │
│ McLounge            ┆ 38627  │
│ KrakenSupport       ┆ 14533  │
│ nursing             ┆ 789499 │
│ walmart             ┆ 630962 │
│ Chase               ┆ 16931  │
│ McDonaldsEmployees  ┆ 174679 │
│ TalesFromYourBank   ┆ 28444  │
│ TjMaxx              ┆ 46286  │
│ RiteAid             ┆ 3970   │
│ Fedexers            ┆ 154572 │
│ sysadmin            ┆ 557558 │
│ UPSers              ┆ 262483 │
│ cabincrewcareers    ┆ 23408  │
│ starbucks           ┆ 393597 │
│ fidelityinvestments ┆ 1294

polars.config.Config

In [3]:
def generate_subset_parquet(RAW_DATA:pl.DataFrame,
                            subreddits: List[str], 
                            filename: str,
                            overwrite: bool=False)->None:
    '''
        Generate a parquet file containing only the rows of the specified
        subreddits. It is saved at "TEMP_DATA_DIR/[filename]"; the
        destination directory is created if it does not exist yet.

        Parameters:
        -----------
            RAW_DATA: pl.DataFrame
                A dataframe containing all of the raw data. It must have a
                "reddit_subreddit" column. (The parameter intentionally
                shadows the module-level RAW_DATA inside this function.)
            subreddits: List[str]
                A list of subreddits to include in the subset. Names are
                matched exactly, so capitalisation matters.
                E.g.: ["starbucks", "Chase"]
            filename: str
                The filename to save the subset of data to, relative to
                TEMP_DATA_DIR.
                E.g.: "raw_data_gm.parquet"
            overwrite: bool
                If False (the default) and the target file already exists,
                nothing is written and a message is printed instead. Set to
                True to regenerate and replace the existing file.

        Returns:
        --------
            None
    '''
    # os.path.join keeps the path valid whether or not TEMP_DATA_DIR ends
    # with a path separator.
    output_path = os.path.join(TEMP_DATA_DIR, filename)

    # Guard clause: never clobber an existing subset unless explicitly asked.
    if os.path.exists(output_path) and not overwrite:
        print("File already exists. To overwrite, set 'overwrite=True'")
        return

    data_subset = RAW_DATA.filter(
        pl.col("reddit_subreddit").is_in(subreddits))

    # An empty result almost always means a misspelled or wrongly-cased
    # subreddit name; say so rather than silently writing an empty file.
    if data_subset.is_empty():
        print(f"Warning: no rows matched subreddits {subreddits}; "
              "writing an empty parquet file.")

    # Make sure the destination directory exists so a first run on a fresh
    # checkout does not fail on a missing folder.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data_subset.write_parquet(output_path)
    
    

In [4]:
# Subset covering the r/starbucks and r/Chase communities.
generate_subset_parquet(
    RAW_DATA=RAW_DATA,
    subreddits=["starbucks", "Chase"],
    filename="raw_data_subset_starbucks_chase.parquet",
)

In [5]:
# Subset covering only the r/GeneralMotors community.
generate_subset_parquet(
    RAW_DATA=RAW_DATA,
    subreddits=["GeneralMotors"],
    filename="raw_data_subset_gm.parquet",
)

In [6]:
# Subset covering the r/UPSers, r/Fedexers and r/Lowes communities.
generate_subset_parquet(
    RAW_DATA=RAW_DATA,
    subreddits=["UPSers", "Fedexers", "Lowes"],
    filename="raw_data_subset_ups_fedex_lowes.parquet",
)