In [1]:
# Standard library
import sqlite3
import warnings

# Third-party
import numpy as np
import pandas as pd

# Notebook-wide settings. Both are applied before the project helpers are
# imported, so the blanket warning filter also covers that import.
pd.options.display.max_columns = 50
warnings.filterwarnings('ignore')

# Project-local helpers used by the pipeline cell below: one text cleaner
# plus the per-story scoring functions.
from data_utils import (
    clean_transcripts,
    flesch,
    fog,
    smog,
    terms,
    uniqueterms,
    ttr,
    mtld,
    hdd,
)

# Connection to the SQLite database that holds the `cbs_stories` table.
con = sqlite3.connect("../data/cbs.db")

In [2]:
# Load every CBS story, keep only rows that have a publish date and some
# non-blank text, then attach readability and lexical-richness scores.
df_cbs = (
    pd.read_sql_query("SELECT * from cbs_stories", con)
    .rename(columns={'publish_date': 'date'})
    # Explicit replacement for the `date==date` self-comparison trick:
    # drops rows whose date is NULL/NaN.
    # NOTE(review): the saved preview output shows blank `date` values on rows
    # that passed this filter, so missing dates may be stored as empty strings
    # rather than NULL -- confirm against the table.
    .dropna(subset=['date'])
    .assign(text=lambda df: df.text.str.strip())
    # Drop blank AND missing text. A plain `text != ""` test keeps NaN rows
    # (NaN compares unequal to everything), which would then be handed to
    # `clean_transcripts` and the metric functions as floats.
    .loc[lambda df: df.text.notna() & df.text.ne("")]
    .assign(
        # `assign` evaluates keyword arguments in order, so every metric
        # below is computed on the cleaned text, not the raw text.
        text=lambda df: df.text.apply(clean_transcripts),
        # Compute readability
        flesch_reading_ease=lambda df: df.text.apply(flesch),
        fog=lambda df: df.text.apply(fog),
        smog=lambda df: df.text.apply(smog),
        terms=lambda df: df.text.apply(terms),
        # Compute lexical richness
        uniqueterms=lambda df: df.text.apply(uniqueterms),
        ttr=lambda df: df.text.apply(ttr),
        mtld=lambda df: df.text.apply(mtld),
        hdd=lambda df: df.text.apply(hdd),
    )
)

df_cbs.head(3)

Unnamed: 0,source,url,date,title,authors,text,flesch_reading_ease,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,CBS,https://www.cbsnews.com/news/supreme-court-dec...,,"""Extremely dangerous path"": Supreme Court deci...","[""Li Cohen"", ""Li Cohen Is A Social Media Produ...",the supreme court ruled on thursday to limit t...,46.321875,14.068176,16.292708,1536,586,0.394879,112.303583,0.86577
1,CBS,https://www.cbsnews.com/live-updates/supreme-c...,,Supreme Court limits EPA's authority to regula...,"[""Melissa Quinn"", ""Li Cohen""]",rich friends in childhood kansas voters to wei...,,,,61,51,0.87931,134.56,0.898048
2,CBS,https://www.cbsnews.com/news/ben-jerrys-west-b...,,Ben & Jerry's objects to sale of its ice cream...,[],a new agreement in israel will put ben & jerry...,41.073093,15.601566,17.271064,872,367,0.418472,88.264542,0.835029


In [3]:
# Persist the per-story metrics without the article text itself, as a
# gzip-compressed CSV. The DataFrame index is written as the first column.
textstat_path = '../data/cbs_2_textstat.csv.gz'
df_cbs.drop(columns='text').to_csv(textstat_path, compression="gzip")