<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Exporting-the-Training-Set" data-toc-modified-id="Exporting-the-Training-Set-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Exporting the Training Set</a></span></li><li><span><a href="#Exporting-the-Holdout-Set" data-toc-modified-id="Exporting-the-Holdout-Set-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Exporting the Holdout Set</a></span></li></ul></div>

# Export `AllTheNews21_Training` and `AllTheNews21_Holdout` from the PostgreSQL Database to CSV

In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd

# conda install -c conda-forge python-dotenv
from dotenv import load_dotenv

# conda install -c anaconda sqlalchemy
from sqlalchemy import create_engine

In [2]:
# Let pandas render up to 1000 rows before truncating displayed frames.
pd.set_option("display.max_rows", 1000)

In [3]:
# Read the .env file into the process environment so os.getenv can see the secrets.
# The displayed result is True when at least one variable was loaded; False
# (not an exception) means no .env file was found or it was empty.
load_dotenv()

True

In [4]:
# Load secrets from the .env file
db_name = os.getenv("db_name")
db_username = os.getenv("db_username")
db_password = os.getenv("db_password")
db_table_schema = os.getenv("db_table_schema")

# Fail early with a clear message instead of silently building a URL that
# contains the literal text "None" for an unset variable.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("db_name", db_name),
        ("db_username", db_username),
        ("db_password", db_password),
    )
    if setting_value is None
]
if missing_settings:
    raise RuntimeError(
        f"Missing required settings in .env: {', '.join(missing_settings)}"
    )

# SQLAlchemy 1.4+ only recognises the "postgresql" dialect name; the legacy
# "postgres" alias was removed and makes create_engine raise NoSuchModuleError.
# Credentials are percent-encoded so characters such as "@", ":" or "/" in the
# username/password cannot corrupt the URL.
connection_string = (
    f"postgresql://{quote_plus(db_username)}:{quote_plus(db_password)}"
    f"@localhost:5432/{db_name}"
)
engine = create_engine(connection_string)

## Exporting the Training Set

In [5]:
# Training split of AllTheNews21: rows flagged is_good_article, not flagged
# is_holdout, with a non-null category, published in 2018 or later.
q = """
SELECT 
    article,
    category,
    article_length,
    word_count
FROM public."AllTheNews21"
WHERE is_good_article = true
AND is_holdout = false
AND category is not null
AND year >= 2018
"""

AllTheNews21_Training = pd.read_sql(sql=q, con=engine)

# Show the frame's dimensions first, then a preview of its first rows.
display(AllTheNews21_Training.shape, AllTheNews21_Training.head())

(646903, 4)

Unnamed: 0,article,category,article_length,word_count
0,"In Early Works, we talk to artists young and ...",arts and entertainment,4551,822
1,"PEBBLE BEACH, CA. (Reuters) - Gary Woodland sh...",sports,3672,672
2,JERUSALEM (Reuters) - Israel on Tuesday outlaw...,health and wellness,1731,270
3,"ORLANDO, Fla. (Reuters) - The widow of the Pul...",us,3936,649
4,(Reuters) - The Wynn Resorts Ltd (WYNN.O) boar...,business,695,110


In [6]:
# Export AllTheNews21_Training to CSV
training_csv_path = "../clean-datasets/exported-from-db/AllTheNews21_Training.csv"

# DataFrame.to_csv does not create missing parent folders and would raise
# OSError only after the expensive query has already run, so create the
# target directory up front.
os.makedirs(os.path.dirname(training_csv_path), exist_ok=True)

# NOTE: the DataFrame index is written as an unnamed first CSV column
# (pandas default, index=True); the file layout is intentionally unchanged.
AllTheNews21_Training.to_csv(training_csv_path)

## Exporting the Holdout Set

In [7]:
# Holdout split of AllTheNews21: same filters as the training query except
# that only rows flagged is_holdout are selected.
q = """
SELECT 
    article,
    category,
    article_length,
    word_count
FROM public."AllTheNews21"
WHERE is_good_article = true
AND is_holdout = true
AND category is not null
AND year >= 2018
"""

AllTheNews21_Holdout = pd.read_sql(sql=q, con=engine)

# Report how many rows and columns the holdout frame has.
holdout_shape = AllTheNews21_Holdout.shape
display(holdout_shape)

(3150, 4)

In [8]:
# Export AllTheNews21_Holdout to CSV
holdout_csv_path = "../clean-datasets/exported-from-db/AllTheNews21_Holdout.csv"

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(holdout_csv_path), exist_ok=True)

# NOTE: the DataFrame index is written as an unnamed first CSV column
# (pandas default, index=True); the file layout is intentionally unchanged.
AllTheNews21_Holdout.to_csv(holdout_csv_path)