In [7]:
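# One-time setup if these packages are missing (use %pip so the install
# targets the running kernel's environment):
# %pip install textstat
# %pip install lexicalrichness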

In [1]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd

# Suppress all warnings. This is set ahead of the data_utils import so the
# filter is already active while that module is being loaded.
warnings.filterwarnings('ignore')

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# Project-local helpers; each one is applied to every article body
# in the loading cell.
from data_utils import (
    clean_transcripts,
    flesch, fog, smog,
    terms, uniqueterms,
    ttr, mtld, hdd,
)

# Zip archive handed to pd.read_csv.
# Alternative location: '../../readable_news_lf/nyt.zip'
FP_NYT = 'raw_data/nyt.zip'

In [2]:
%%time
# Columns loaded from the archive.
# A wider selection, kept for reference:
#   ['Body', 'Headline', 'Lead Paragraph', 'News Desk', 'Online Section', 'Publication Year']
_usecols = ['Body', 'News Desk', 'Online Section', 'Publication Date']

df_nyt = (
    pd.read_csv(FP_NYT, usecols=_usecols)
    .rename(columns={'Publication Date': 'date'})
    # Discard rows whose date or body is missing.
    .dropna(subset=['date', 'Body'])
    # Trim surrounding whitespace, then discard bodies that are left empty.
    .assign(Body=lambda frame: frame['Body'].str.strip())
    .loc[lambda frame: frame['Body'].ne('')]
    # Clean the text first so every statistic below is computed on the
    # cleaned body rather than on the raw one.
    .assign(Body=lambda frame: frame['Body'].apply(clean_transcripts))
    .assign(
        # Readability
        flesch_reading_ease=lambda frame: frame['Body'].apply(flesch),
        fog=lambda frame: frame['Body'].apply(fog),
        smog=lambda frame: frame['Body'].apply(smog),
        terms=lambda frame: frame['Body'].apply(terms),
        # Lexical richness
        uniqueterms=lambda frame: frame['Body'].apply(uniqueterms),
        ttr=lambda frame: frame['Body'].apply(ttr),
        mtld=lambda frame: frame['Body'].apply(mtld),
        hdd=lambda frame: frame['Body'].apply(hdd),
    )
)

# Memory footprint in GiB, if needed:
#   df_nyt.memory_usage(deep=True).sum() / 1073741824
df_nyt.head(3)

CPU times: user 3d 1h 7min 41s, sys: 9min 21s, total: 3d 1h 17min 2s
Wall time: 3d 1h 45min 24s


Unnamed: 0,Body,News Desk,Online Section,date,flesch_reading_ease,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,"company reports aar corp qtr to nov sales ,, ,...",Financial Desk,Business,19870101T000000,,,,85.0,37,0.45679,24.536496,0.619748
1,company reports american cytogenetics qtr to o...,Financial Desk,Business,19870101T000000,,,,46.0,17,0.386364,22.622642,0.404435
2,company reports applied power qtr to nov sales...,Financial Desk,Business,19870101T000000,,,,30.0,14,0.5,28.0,


In [3]:
import os

# Destination of the per-article statistics (gzip-compressed CSV).
_out_path = 'data/nyt_1987_2007_textstat.csv.gz'

# DataFrame.to_csv does not create missing parent directories. Create the
# target folder up front so the write cannot fail on a fresh checkout after
# the long-running computation above has finished.
os.makedirs(os.path.dirname(_out_path), exist_ok=True)

# The article text itself is not persisted; only the metadata columns and
# the computed statistics are written.
(df_nyt
 .drop(columns='Body')
 .to_csv(_out_path, compression='gzip')
)