In [1]:
import pandas as pd
pd.options.display.max_columns = 50
import sqlite3
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from data_utils import (
    clean_transcripts,
    flesch,
    fog,
    smog,
    terms,
    uniqueterms, 
    ttr, 
    mtld, 
    hdd
)

# Open the story database read-only through a SQLite URI. A plain-path
# connect() silently creates an empty database file when the path is wrong,
# which only surfaces later as a confusing "no such table" error; mode=ro
# makes a missing file fail right here and also guards the source data
# against accidental writes (this notebook only reads from it).
con = sqlite3.connect("file:../data/nyt.db?mode=ro", uri=True)

In [2]:
# Load every NYT story, keep rows that have a date and non-blank text,
# clean the text, then attach one score column per metric.
df_nyt = (
    pd.read_sql_query("SELECT * from nyt_stories", con)
    .rename(columns={'publish_date': 'date'})
    # Self-comparison is False for missing values, so undated rows drop out.
    .query('date==date')
    .assign(text=lambda frame: frame['text'].str.strip())
    .query('text!=""')
    # Clean first so that every metric below is computed on the cleaned text.
    .assign(text=lambda frame: frame['text'].apply(clean_transcripts))
    .assign(**{
        # `scorer=scorer` binds each function at definition time; without it
        # every lambda would end up calling the last scorer in the mapping.
        column: (lambda frame, scorer=scorer: frame['text'].apply(scorer))
        for column, scorer in {
            # Readability
            'flesch_reading_ease': flesch,
            'fog': fog,
            'smog': smog,
            'terms': terms,
            # Lexical richness
            'uniqueterms': uniqueterms,
            'ttr': ttr,
            'mtld': mtld,
            'hdd': hdd,
        }.items()
    })
)

df_nyt.head(3)

Unnamed: 0,source,url,date,title,authors,text,flesch_reading_ease,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,NYT,https://www.nytimes.com/2022/06/28/us/politics...,2022-06-28 00:00:00,Trump Aides Watch Testimony and Brace for Damage,"[""Michael C. Bender"", ""Maggie Haberman""]",current and former aides to mr. trump sent one...,35.556422,,21.534483,290,163,0.592727,99.05981,0.826813
1,NYT,https://www.nytimes.com/2022/06/29/us/women-ab...,2022-06-29 00:00:00,"For Many Women, Roe Was About More Than Aborti...","[""Julie Bosman""]",the ruling only deepened the desire for yoland...,55.4075,,16.846154,260,143,0.590909,101.422544,0.862269
2,NYT,https://www.nytimes.com/interactive/2022/06/29...,2022-06-29 00:00:00,Illinois Abortion Clinics Prepare for Rush of ...,"[""Allison Mccann""]","fairview heights, ill. illinois is quickly eme...",46.660288,17.768303,16.663046,2086,652,0.320236,98.950793,0.873779


In [3]:
# Persist the per-story metrics without the (large) article text.
# The destination is relative to the notebook, next to the source database
# under ../data, instead of an absolute path inside one user's home
# directory, so the notebook runs unchanged on another machine.
# NOTE(review): assumes ../data resolves to the project's data/ directory,
# as it does for the nyt.db connection above -- confirm.
# The DataFrame index is still written (to_csv default) so the output schema
# is the same as before.
(df_nyt
 .drop(columns='text')
 .to_csv('../data/nyt_2_textstat.csv.gz', compression="gzip")
)