In [1]:
import os
import pandas as pd
from glob import glob
import textstat
from utils import uniqueterms, ttr, mtld, hdd

# Paths to the MSNBC transcript dumps (.csv.gz) that the cells below load with
# pd.read_csv. Paths are relative, so they resolve against the working directory.
# The FP_CNN path is disabled and is not referenced by the cells shown here.
# FP_CNN = '../../readable_news_lf/cnn_cc/'
# NOTE(review): the two file names use different separators after "msnbc"
# ("--" vs "-") -- confirm both match the files on disk.
FP_MSNBC1 = 'raw_data/msnbc/msnbc--2003--2014.csv.gz'  # 2003-2014 per the file name
FP_MSNBC2 = 'raw_data/msnbc/msnbc-2010--2021.csv.gz'   # 2010-2021 per the file name

In [4]:
# Load the file at FP_MSNBC1 and score the `Content` column row by row.
# Every (column, scorer) pair below adds one new column holding
# `scorer(Content)`; the list order determines the column order.
# NOTE(review): the displayed Content values start with "b'", which suggests
# stringified bytes were stored; the scores are computed on that text as-is.
df1 = (
    pd.read_csv(FP_MSNBC1)
    # NaN != NaN, so this self-comparison keeps only rows that have text
    .query('Content==Content')
    .reset_index(drop=True)
    .assign(**{
        # `scorer=scorer` binds the current function to each lambda
        column: (lambda frame, scorer=scorer: frame['Content'].apply(scorer))
        for column, scorer in [
            # Readability of the text
            ('flesch_reading_ease', textstat.flesch_reading_ease),
            ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
            ('fog', textstat.gunning_fog),
            ('smog', textstat.smog_index),
            ('terms', textstat.lexicon_count),
            # Lexical richness of the text (helpers imported from utils)
            ('uniqueterms', uniqueterms),
            ('ttr', ttr),
            ('mtld', mtld),
            ('hdd', hdd),
        ]
    })
)
df1.head(3)

Unnamed: 0,cite,Source,Date,Show,Author,Location(s),Dateline,Section,Index Terms,Record Number,...,Content,flesch_reading_ease,flesch_kincaid_grade,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,"MSNBC , Hardball for September 24, 2003",MSNBC,"Tuesday, September 24, 2002",HARDBALL,Chris MatthewsDavid Shuster,,,"News, International",Terrorism ; Iraq ; Politics ; War ; Elections ...,0FE8AF52AF84A531,...,"b'CHRIS MATTHEWS, HOST: I`m Chris Matthews. Le...",37.94,14.1,11.25,11.6,8740,1619,0.163304,41.606257,0.817531
1,"COUNTDOWN: IRAQ for March 31, 2003",MSNBC,"Monday, March 31, 2003",COUNTDOWN: IRAQ,Keith OlbermannCarl RochelleJohn IrvineKerry S...,,,"News, International",Iraq ; War ; United States ; United Nations ; ...,0FE5A1D810A218B3,...,"b'KEITH OLBERMANN, HOST: 8 p.m. in the East, 4...",30.94,16.8,14.38,13.7,8027,1966,0.221696,66.644927,0.845062
2,"HARDBALL for March 31, 2003",MSNBC,"Monday, March 31, 2003",HARDBALL,Chris MatthewsJohn WardenCarl RochelleJohn Irv...,,,"News, International",War ; Military ; Iraq ; SHOW,0FE5A1CEDF3C105E,...,b'(BEGIN VIDEO CLIP)\n \n\n GEORGE W...,31.35,16.6,13.82,13.1,13316,2328,0.157244,63.714404,0.849839


In [5]:
# Persist the scores without the bulky transcript text: drop `Content`, then
# write a gzip-compressed CSV with no index column.
df1.drop(columns=['Content']).to_csv(
    'data/msnbc2003_2014_textstat.csv.gz',
    index=False,
    compression='gzip',
)

In [9]:
# Load the file at FP_MSNBC2 and score the `text` column row by row.
# Every (column, scorer) pair below adds one new column holding
# `scorer(text)`; the list order determines the column order.
df2 = (
    pd.read_csv(FP_MSNBC2)
    # NaN != NaN, so this self-comparison keeps only rows that have text
    .query('text==text')
    .reset_index(drop=True)
    .assign(**{
        # `scorer=scorer` binds the current function to each lambda
        column: (lambda frame, scorer=scorer: frame['text'].apply(scorer))
        for column, scorer in [
            # Readability of the text
            ('flesch_reading_ease', textstat.flesch_reading_ease),
            ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
            ('fog', textstat.gunning_fog),
            ('smog', textstat.smog_index),
            ('terms', textstat.lexicon_count),
            # Lexical richness of the text (helpers imported from utils)
            ('uniqueterms', uniqueterms),
            ('ttr', ttr),
            ('mtld', mtld),
            ('hdd', hdd),
        ]
    })
)
df2.head(3)

Unnamed: 0,air_date,show_name,headline,guests,url,channel.name,program.name,uid,duration,year,...,text,flesch_reading_ease,flesch_kincaid_grade,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,May 28 2010,The Rachel Maddow Show,"The Rachel Maddow Show, Transcript 05/27/10","Guests: Rep. Charlie Melancon, Linda Tollner, ...",https://www.msnbc.com/transcripts/rachel-maddo...,MSNBC,"The Rachel Maddow Show, Transcript 05/27/10",,,2010,...,"KEITH OLBERMANN, HOST, COUNTDOWN: And now, to...",35.55,23.3,25.41,15.0,8089,1541,0.192049,58.395263,0.87062
1,Jun 02 2010,The Rachel Maddow Show,"The Rachel Maddow Show, Transcript 06/01/10",Guests: Chris Hayes,https://www.msnbc.com/transcripts/rachel-maddo...,MSNBC,"The Rachel Maddow Show, Transcript 06/01/10",,,2010,...,"RACHEL MADDOW, HOST: Good evening, Keith. Th...",11.26,30.6,32.15,19.0,7826,1759,0.223507,84.383256,0.876087
2,Jun 03 2010,The Rachel Maddow Show,"The Rachel Maddow Show, Transcript 06/02/10","Guests: Sen. Barbara Boxer, David Muth, Larry ...",https://www.msnbc.com/transcripts/rachel-maddo...,MSNBC,"The Rachel Maddow Show, Transcript 06/02/10",,,2010,...,"KEITH OLBERMANN, “COUNTDOWN” HOST: And now, f...",11.36,30.5,31.97,17.7,7820,1600,0.204473,70.737984,0.872863


In [13]:
# Persist the scores without the bulky transcript text: drop `text`, then
# write a gzip-compressed CSV with no index column.
df2.drop(columns=['text']).to_csv(
    'data/msnbc2010_2021_textstat.csv.gz',
    index=False,
    compression='gzip',
)