<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Exporting-the-Training-Set" data-toc-modified-id="Exporting-the-Training-Set-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Exporting the Training Set</a></span></li><li><span><a href="#Exporting-the-Holdout-Set" data-toc-modified-id="Exporting-the-Holdout-Set-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Exporting the Holdout Set</a></span></li></ul></div>

# Export `AllTheNews21_Training` and `AllTheNews21_Holdout` From PostgreSQL DB to CSV

In [14]:
import os
import pandas as pd

# conda install -c conda-forge python-dotenv
from dotenv import load_dotenv

# conda install -c anaconda sqlalchemy
from sqlalchemy import create_engine

In [15]:
# Let pandas show up to 1000 rows of a frame before truncating the display
pd.set_option("display.max_rows", 1000)

In [16]:
# Read the .env file so its entries become visible to the os.getenv() calls
# in the next cell. The call is left as the last expression so its return
# value is shown as the cell output.
load_dotenv() # => True in this run; presumably False when no variable could be loaded (confirm in python-dotenv docs)

True

In [17]:
# Load secrets from the .env file
db_name = os.getenv("db_name")
db_username = os.getenv("db_username")
db_password = os.getenv("db_password")
db_table_schema = os.getenv("db_table_schema")

# os.getenv returns None for an unset variable, which the f-string below would
# silently render as the text "None". Fail early with a readable message.
missing_vars = [
    var_name
    for var_name, var_value in (
        ("db_name", db_name),
        ("db_username", db_username),
        ("db_password", db_password),
    )
    if var_value is None
]
if missing_vars:
    raise RuntimeError(f"Missing environment variable(s): {', '.join(missing_vars)}")

# Use the "postgresql://" scheme: the legacy "postgres://" alias was removed in
# SQLAlchemy 1.4 and makes create_engine raise NoSuchModuleError there.
# NOTE(review): the password is interpolated as-is; characters such as "@" or
# "/" would need URL-escaping.
connection_string = f"postgresql://{db_username}:{db_password}@localhost:5432/{db_name}"
engine = create_engine(connection_string)

## Exporting the Training Set

For the training set, because the dataset is limited in size, we randomly sample at most 3200 articles from each category (categories with fewer eligible articles contribute all of them):

In [18]:
# Distinct categories in the DB, listed alphabetically.
# The training export runs one query per entry.
categories = [
    "arts and entertainment", "automobiles", "business",
    "climate and environment", "energy", "finance and economics",
    "food", "global healthcare", "health and wellness",
    "legal and crimes", "life", "markets and investments",
    "personal finance", "politics", "real estate",
    "science and technology", "sports", "travel and transportation",
    "us", "wealth", "world",
]

In [19]:
# Select random articles per category to use as the training set
# Make sure to apply all the necessary filters

# Collect one frame per category and concatenate once after the loop.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# re-copied every accumulated row on each iteration.
training_frames = []
rows_so_far = 0

for cat in categories:

    # `cat` comes from the hard-coded `categories` list, not from external
    # input, so it is interpolated directly into the SQL text.
    # NOTE(review): ORDER BY RANDOM() is unseeded, so every run exports a
    # different sample.
    q = f"""
    SELECT 
        article,
        category,
        article_length,
        word_count
    FROM public."AllTheNews21"
    WHERE category = '{cat}'
    AND is_good_article = true
    AND is_holdout = false
    AND article_length < 20000
    AND year >= 2018
    ORDER BY RANDOM()
    LIMIT 3200
    """
    category_frame = pd.read_sql(q, con=engine)
    training_frames.append(category_frame)

    # Show the running (rows, columns) total after each category
    rows_so_far += len(category_frame)
    display((rows_so_far, category_frame.shape[1]))

# No ignore_index: row labels restart at 0 for every category, exactly as they
# did with append(), so the exported CSV keeps the same index column.
AllTheNews21_Training = pd.concat(training_frames) if training_frames else pd.DataFrame()

(3200, 4)

(5404, 4)

(8604, 4)

(11804, 4)

(15004, 4)

(18204, 4)

(21404, 4)

(24604, 4)

(27804, 4)

(31004, 4)

(34204, 4)

(37404, 4)

(40544, 4)

(43744, 4)

(46944, 4)

(50144, 4)

(53344, 4)

(56544, 4)

(59744, 4)

(61533, 4)

(64733, 4)

In [20]:
# # AllTheNews21_Training Dataset
# q = """
# SELECT 
#     article,
#     category,
#     article_length,
#     word_count
# FROM public."AllTheNews21"
# WHERE is_holdout = false
# AND year >= 2018
# """

# AllTheNews21_Training = pd.read_sql(q, con=engine)
# display(AllTheNews21_Training.shape)
# display(AllTheNews21_Training.head())

In [21]:
# Write the sampled training rows out as a CSV file
AllTheNews21_Training.to_csv(
    path_or_buf="../clean-datasets/exported-from-db/AllTheNews21_Training.csv"
)

## Exporting the Holdout Set

In [11]:
# AllTheNews21_Holdout Dataset: rows flagged is_holdout and is_good_article,
# with a non-null category, from 2018 onward.
# NOTE(review): unlike the training query there is no article_length filter
# here; confirm that is intended.
q = """
SELECT 
    article,
    category,
    article_length,
    word_count
FROM public."AllTheNews21"
WHERE is_good_article = true
AND is_holdout = true
AND category is not null
AND year >= 2018
"""

# Run the query in a single round trip and report (rows, columns)
AllTheNews21_Holdout = pd.read_sql(sql=q, con=engine)
display(AllTheNews21_Holdout.shape)

(3150, 4)

In [14]:
# Write the holdout rows out as a CSV file
AllTheNews21_Holdout.to_csv(
    path_or_buf="../clean-datasets/exported-from-db/AllTheNews21_Holdout.csv"
)