<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Exporting-the-Training-Set" data-toc-modified-id="Exporting-the-Training-Set-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Exporting the Training Set</a></span></li><li><span><a href="#Exporting-the-Holdout-Set" data-toc-modified-id="Exporting-the-Holdout-Set-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Exporting the Holdout Set</a></span></li></ul></div>

# Export `AllTheNews21_Training` and `AllTheNews21_Holdout` from a PostgreSQL Database to CSV

In [1]:
import os
from urllib.parse import quote

import pandas as pd

# conda install -c conda-forge python-dotenv
from dotenv import load_dotenv

# conda install -c anaconda sqlalchemy
from sqlalchemy import create_engine

In [2]:
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 1000)

In [3]:
# Read key/value pairs from a .env file into the process environment.
# The return value shown below is True when at least one variable was set and
# False otherwise (for example when no .env file was found) -- it is not an
# error indicator.
load_dotenv()

True

In [4]:
# Load secrets from the .env file (placed in the environment by load_dotenv()).
db_name = os.getenv("db_name")
db_username = os.getenv("db_username")
db_password = os.getenv("db_password")
db_table_schema = os.getenv("db_table_schema")

# Fail early with a readable message: os.getenv returns None for an unset
# variable, which would otherwise end up as the literal text "None" in the URL.
_missing_settings = [
    key
    for key, value in (
        ("db_name", db_name),
        ("db_username", db_username),
        ("db_password", db_password),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required database settings in .env/environment: "
        + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# password cannot be mistaken for URL delimiters.
_url_username = quote(db_username, safe="")
_url_password = quote(db_password, safe="")

# "postgresql" is the dialect name SQLAlchemy registers. The legacy "postgres"
# alias was removed in SQLAlchemy 1.4, where it raises NoSuchModuleError.
connection_string = (
    f"postgresql://{_url_username}:{_url_password}@localhost:5432/{db_name}"
)
engine = create_engine(connection_string)

## Exporting the Training Set

In [5]:
# Training split: rows of "AllTheNews21" that are not flagged as holdout and
# that have a non-null category.
q = """
SELECT 
    article,
    article_length,
    category  
FROM public."AllTheNews21"
WHERE is_holdout = false
AND category IS NOT NULL
"""

AllTheNews21_Training = pd.read_sql(sql=q, con=engine)

# Sanity check: (rows, columns) followed by a preview of the first rows.
display(AllTheNews21_Training.shape, AllTheNews21_Training.head())

(1392369, 3)

Unnamed: 0,article,article_length,category
0,Feb 10 (Reuters) - ChemoMetec A/S: * H1 revenu...,405,global healthcare
1,(Reuters) - The shooting of a black man and wo...,2131,us
2,The New York Islanders vie for just their firs...,2916,sports
3,Titans hold on to earn first postseason berth ...,4466,sports
4,The relationship between Disney and Netflix ma...,1706,arts and entertainment


In [6]:
# Write the training split to disk as CSV.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column.
# NOTE(review): pass index=False if downstream readers do not expect that column.
AllTheNews21_Training.to_csv(
    path_or_buf="../clean-datasets/exported-from-db/AllTheNews21_Training.csv",
    index=True,
)

## Exporting the Holdout Set

In [7]:
# Holdout split: rows of "AllTheNews21" flagged as holdout that have a
# non-null category.
q = """
SELECT 
    article,
    article_length,
    category  
FROM public."AllTheNews21"
WHERE is_holdout = true
AND category IS NOT NULL
"""

AllTheNews21_Holdout = pd.read_sql(sql=q, con=engine)

# Sanity check: (rows, columns) of the holdout frame.
display(AllTheNews21_Holdout.shape)

(4200, 3)

In [8]:
# Write the holdout split to disk as CSV.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column.
# NOTE(review): pass index=False if downstream readers do not expect that column.
AllTheNews21_Holdout.to_csv(
    path_or_buf="../clean-datasets/exported-from-db/AllTheNews21_Holdout.csv",
    index=True,
)