In [2]:
# --- Imports: standard library first, then third-party ----------------------
import sqlite3
import warnings

import numpy as np
import pandas as pd

# Silence every warning for the rest of the session. The filter is installed
# before the project helpers are imported, so warnings raised while
# `data_utils` loads are hidden as well.
warnings.filterwarnings("ignore")

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

# --- Project helpers ---------------------------------------------------------
# Transcript cleaning plus the readability / lexical-richness scorers that are
# applied to each story's text when the stories are loaded.
from data_utils import (
    clean_transcripts,
    flesch,
    fog,
    hdd,
    mtld,
    smog,
    terms,
    ttr,
    uniqueterms,
)

# Connection to the SQLite database that the stories are read from.
con = sqlite3.connect("../data/npr.db")

In [3]:
# Output column -> scoring function, listed in the order the columns are added.
_text_scorers = {
    # Readability
    "flesch_reading_ease": flesch,
    "fog": fog,
    "smog": smog,
    "terms": terms,
    # Lexical richness
    "uniqueterms": uniqueterms,
    "ttr": ttr,
    "mtld": mtld,
    "hdd": hdd,
}

df_npr = (
    pd.read_sql_query("SELECT * from npr_stories", con)
    .rename(columns={"publish_date": "date"})
    # A missing value never compares equal to itself, so this keeps only the
    # rows that have a date.
    .query("date==date")
    .assign(text=lambda stories: stories["text"].str.strip())
    # Drop stories whose text is empty once surrounding whitespace is removed.
    # NOTE(review): a NULL text presumably survives this filter (a missing
    # value compares unequal to ""), so `clean_transcripts` would receive it;
    # confirm that helper tolerates missing values or that the table has none.
    .query('text!=""')
    # Clean first: every score below is computed on the cleaned text.
    .assign(text=lambda stories: stories["text"].apply(clean_transcripts))
    .assign(
        **{
            # `scorer=scorer` binds the current function to each lambda;
            # without it all lambdas would share the loop's last value.
            column: (lambda stories, scorer=scorer: stories["text"].apply(scorer))
            for column, scorer in _text_scorers.items()
        }
    )
)

df_npr.head(3)

Unnamed: 0,source,url,date,title,authors,text,flesch_reading_ease,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,NPR,https://www.npr.org/2022/06/29/1108806145/emme...,2022-06-29 00:00:00,Emmett Till's family seeks the arrest of a wom...,"[""The Associated Press""]",emmett till's family seeks the arrest of a wom...,51.546339,13.133205,15.458427,903,386,0.434685,106.567093,0.858271
1,NPR,https://www.npr.org/2022/06/29/1108798387/jan-...,2022-06-29 00:00:00,"Jan. 6 committee subpoenas Pat Cipollone, Trum...","[""Npr Washington Desk""]","jan. committee subpoenas pat cipollone, trump'...",41.754071,,19.621229,655,315,0.480916,83.837238,0.845079
2,NPR,https://www.npr.org/2022/06/29/1108698652/oxfo...,2022-06-29 00:00:00,The dealer that sold the gun used in the Oxfor...,"[""Jaclyn Diaz""]",the dealer that sold the gun used in the oxfor...,51.505102,,14.491673,735,319,0.435198,88.878447,0.83517


In [4]:
# Write every column except the story text to a gzip-compressed CSV.
# `index` is left at its default, so the DataFrame index is written as the
# first column of the file.
df_npr.drop(columns="text").to_csv(
    "../data/npr_2_textstat.csv.gz",
    compression="gzip",
)