# A look at some of the readability score metrics

<A HREF="https://en.wikipedia.org/wiki/Readability">Wikipedia - Readability</A><BR>
<A HREF="https://www.geeksforgeeks.org/readability-index-pythonnlp/">GeeksforGeeks Readability - Index in Python</A><BR>
<A HREF="https://pypi.org/project/readability/">Readability python package</A><BR>
    <A HREF="https://pypi.org/project/textstat/">Textstat python package</A>

In [412]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import warnings
import nltk
import en_core_web_md
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict
from textblob import TextBlob
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
import readability
import pprint # pretty print for easy printing of ordered dictionary
# import spacy # Needed for GeeksforGeeks code which is a mess
from textstat.textstat import textstatistics
import spacy
spacy.load('en_core_web_md')
from nltk.corpus import cmudict
nltk.download("punkt")
from nltk.stem import WordNetLemmatizer
nltk.download ('wordnet')

[nltk_data] Downloading package punkt to /home/muddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/muddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
tidy_data = pd.read_csv('tidy_data.csv')
tidy_data['date'] = pd.to_datetime(tidy_data['date'], format='%Y-%m-%d')
tidy_data.head(2)

Unnamed: 0,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,sadness,surprise,trust,num_sents,num_words,num_unique_words,depth,TBsubjectivity,TBpolarity,words_per_sentence
0,2008-06-04,nyt,0.064458,0.065088,0.0624,0.064962,0.064416,0.063408,0.055598,0.06492,...,0.07563,0.046218,0.130252,47,1459,620,7.276596,0.406976,0.13588,31.042553
1,2008-06-04,oba,0.064649,0.06467,0.06444,0.064398,0.064471,0.064638,0.054967,0.064503,...,0.042017,0.048739,0.159664,217,5856,939,5.986175,0.445383,0.167408,26.986175


In [4]:
sentences = pd.read_csv('sentence_depth.csv')
sentences['date'] = pd.to_datetime(sentences['date'], format='%Y-%m-%d')
sentences.head(2)

Unnamed: 0,date,source,sentence,depth
0,2004-07-28,oba,"On behalf of the great state of Illinois, cros...",8
1,2004-07-28,oba,"Tonight is a particular honor for me because, ...",5


### First, using the readability package

In [None]:
# Text should be encoded with UTF-8, one sentence per line, tokens space-separated.
results = readability.getmeasures(sentences['sentence'][0], lang='en')
pprint.pprint(results)

In [None]:
pprint.pprint(results['readability grades'])

### I'm gonna try some stuff below

<A HREF="https://computingeverywhere.soc.northwestern.edu/wp-content/uploads/2017/07/Text-Analysis-with-NLTK-Cheatsheet.pdf">Northwestern NLTK cheat sheet</A>

In [51]:
sentences['sentence'][0:2]

0    On behalf of the great state of Illinois, cros...
1    Tonight is a particular honor for me because, ...
Name: sentence, dtype: object

In [512]:
######################## HELPER FUNCTIONS ########################
def words_per_sentence(sentence):
    '''returns: integer number of words in a sentence'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    return(len(tokens))

def chars_per_word(word):
    '''returns: integer number of characters in a word'''
    return(len(word))

def string_to_list(sentence):
    '''converts a string/sentence to a list of words'''
    tokenizer = RegexpTokenizer(r'\w+')
    return(tokenizer.tokenize(sentence))

def chars_per_word_sentence(sentence):
    '''input: string of a sentence
       returns: list of number of characters in a sentence'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    char_len_list = [chars_per_word(word) for word in tokens]
    return(char_len_list)

def text_to_sentence(text):
    '''uses spacy nlp object to break up sentences
       input: pandas series of strings
       returns: list of sentence strings'''
    doc = nlp(' '.join(text.tolist()))
    assert doc.has_annotation("SENT_START")
    return([str(sent) for sent in doc.sents])

def text_to_wordlist(text):
    '''input: string or pandas series of text
       returns: list of all words'''
    if isinstance(text, str):
        text = [text]
    return(' '.join(text).split())

def syllable_count(word):
    '''counts number of syllables in a word'''
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

def count_total_words(text):
    '''count total number of words in a text
       input: str or pandas.core.series.Series of text
       returns: integer'''
    if isinstance(text, list):
        list_of_sentence = text
    elif isinstance(text, str):
        list_of_sentence = [text]
    elif isinstance(text, pd.Series):
        list_of_sentence = text_to_sentence(text)
    else:
        print('count_total_words: Error: not a string or pandas series object.')
    list_of_word_count = [words_per_sentence(str(sentence)) for sentence in list_of_sentence]
    return(np.sum(list_of_word_count))

def count_total_sentences(text):
    '''count total number of sentences in a text
       input: str or pandas.core.series.Series of text
       returns: integer'''
    if isinstance(text, pd.Series):
        text = ' '.join(text)
    sentences = sent_tokenize(text)
    return(len(sentences))

# Count Syllables
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word
def syllables(word):
    '''backup syllable counter if word not in NLTK-CMU dictionary'''
    #referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    try:
        if word[0] in vowels:
            count +=1
    except:
        count += 0
    for index in range(1,len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count

d = cmudict.dict()
def nsyl(word):
    '''input: string - word
       returns: integer count of syllables in word'''
    try:
        # needs the [0] otherwise words like 'of' returns [1,1]
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    except KeyError:
        #if word not found in cmudict
        return syllables(word)

def count_total_syllables(text):
    '''count total number of sentences in a text
       input: str or pandas.core.series.Series of text
       returns: integer count'''
    if isinstance(text, str):
        list_of_sentence = [text]
    elif isinstance(text, pd.Series):
        list_of_sentence = text_to_sentence(text)
    else:
        print('count_total_syllables: Error: not a string or pandas series object.')
    list_of_words = text_to_wordlist(list_of_sentence)
    syllable_list = [nsyl(word) for word in list_of_words]
    return(np.sum(syllable_list))

def count_of_letters(text):
    '''count total number of letters or digits in a text
       input: str or pandas.core.series.Series of text
       returns: integer count'''
    if isinstance(text, pd.Series):
        text = ' '.join(text)
    # Replace punctuations with an empty string.
    str1 = re.sub(r"[^\w\s]|_", "", text)
    no_spaces = str1.replace(" ", "")
    return(len(no_spaces))


def difficult_words_list(list1):
    '''returns difference of list with easy_word list for Dale-Chall
       input: two lists of strings/words
       returns: list of unique words in both lists'''
    if isinstance(list1, pd.Series):
        list1 = ' '.join(text)
    if isinstance(list1, list):
        list1 = list1[0]
    try:
        easy_words_file = open('./word_lists/DaleChallEasyWordList.txt', 'r')
        easy_words = easy_words_file.read().split('\n')
    except E:
        print("Error reading easy words file", E)
    easy_words_file.close()
    easy_words = [word.lower() for word in easy_words]
    easy_words = set(easy_words)
    diff = [word.lower() for word in list1.split() if word.lower() not in easy_words]
    return(diff)

def dc_difficult_word_count(text):
    '''Count of difficult words - those not in Dale-Chall Easy Word List
       input: str or pandas.core.series.Series of text
       returns: integer count of difficult words in text'''
    list_of_dc_difficult = difficult_words_list(text)
    return(len(list_of_dc_difficult))

lemmatizer = WordNetLemmatizer()
def gf_complex_word_count(text):
    '''Count of complex - >= 3 syllables with caveats
       input: str or pandas.core.series.Series of text
       returns: integer count of complex words in text'''
    if isinstance(text, pd.Series):
        text = ' '.join(text)
    if isinstance(text, list):
        text = str(text[0])
    text = [word.lower() for word in text.split()]
    lemma = [lemmatizer.lemmatize(word) for word in text]
    stem = [re.sub("(?:ing|ed|es|ly)$","",word) for word in text]
    syllable_list = [nsyl(word) for word in stem]
    complex_count = sum(x > 2 for x in syllable_list)
    return(complex_count)

def smog_poly_count(text):
    '''counts number of words with 3 or more syllables
       input: str or pandas.core.series.Series of text
       returns: integer count of polysyllabic words in text'''
    if isinstance(text, pd.Series):
        text = ' '.join(text)
    if isinstance(text, list):
        text = str(text[0])
    text = [word.lower() for word in text.split()]
    syllable_list = [nsyl(word) for word in text]
    poly_count = sum(x > 2 for x in syllable_list)
    return(poly_count)


######################## READABILITY SCORES ########################

# https://en.wikipedia.org/wiki/Automated_readability_index
# 4.71(chars/word) + 0.5(words/sentence) - 21.43
def ari(text):
    '''input: string of sentence
       returns: float ari score'''
    character_count = count_of_letters(text)
    word_count = count_total_words(text)
    sentence_count = count_total_sentences(text)
    ari = 4.71*(character_count/word_count) + 0.5*(word_count/sentence_count) - 21.43
    return(ari)

# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
# Flesch–Kincaid grade level
# 0.39(total words/total sentences) + 11.8(total syllables/total words) - 15.59
def flesch_kincaid(text):
    '''input: string or pandas series of text, multiple sentences
       returns: float - flesh kincaid grade level score'''
    num_total_words = count_total_words(text)
    num_total_sentences = count_total_sentences(text)
    num_total_syllables = count_total_syllables(text)
    fkgl = 0.39*(num_total_words/num_total_sentences) + 11.8*(num_total_syllables/num_total_words) - 15.59
    return(fkgl)

# https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
# 0.0588(average number of letters per 100 words) - 0.296(average number of sentences per 100 words) - 15.8
def coleman_liau(text):
    '''input: string or pandas series of text, multiple sentences
       returns: float - Coleman-Liau index'''
    character_count = count_of_letters(text)
    word_count = count_total_words(text)
    sentence_count = count_total_sentences(text)
    l = character_count/word_count*100
    s = sentence_count/word_count*100
    cl = 0.0588*l - 0.296*s - 15.8
    return(cl)

# https://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula
# 0.1579(100*difficult words/words) + 0.496(words/sentences)
def dale_chall(text):
    '''input: string or pandas series of text, multiple sentences
       returns: float - Dale-Chall readability score'''
    difficult_words = dc_difficult_word_count(text)
    word_count = count_total_words(text)
    sentence_count = count_total_sentences(text)
    dc = 0.1579*(100*difficult_words/word_count) + 0.496*(word_count/sentence_count)
    return(dc)

# https://en.wikipedia.org/wiki/Gunning_fog_index
# 0.4[(words/sentence) + 100(complex words/words)]
def gunning_fog(text):
    '''input: string or pandas series of text, multiple sentences
       returns: float - Gunning Fog index readability score'''
    complex_words = gf_complex_word_count(text)
    word_count = count_total_words(text)
    sentence_count = count_total_sentences(text)
    gf = 0.4*((word_count/sentence_count) + 100*(complex_words/word_count))
    return(gf)

# https://en.wikipedia.org/wiki/SMOG
# 1.043*sqrt(30*number polysylables/number sentences)+3.1291
def smog(text):
    '''input: string or pandas series of text, multiple sentences
       returns: float - SMOG grade readability score'''
    poly_count = smog_poly_count(text)
    sentence_count = count_total_sentences(text)
    smog_score = 1.043*np.sqrt(30*poly_count/sentence_count) + 3.1291
    return(smog_score)

In [513]:
char_count = count_of_letters(text)
syl_count = count_total_syllables(text)
word_count = count_total_words(text)
char_per_word = char_count/word_count
syl_per_word = syl_count/word_count
sent_count = count_total_sentences(text)
word_per_sent = word_count/sent_count

dc_word_count = dc_difficult_word_count(text)
gf_word_count = gf_complex_word_count(text)
ploy_word_count = smog_poly_count(text)

ari(text)
flesch_kincaid(text)
coleman_liau(text)
dale_chall(text)
gunning_fog(text)
smog(text)

15.521786647896054

### Load up the speeches

In [514]:
# Load up the files
paths = ['./speeches/', './NYTimes/', './WSJ/'] 
list_of_files = []

dates = pd.read_csv('dateSpeeches.csv')
for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                list_of_files.append(os.path.join(root,file))

speeches = []
for file in list_of_files:
    with open(file, encoding='utf-8') as f:
        #print(file)
        text = f.read()
    f.close()
    speeches.append([text, file])

#clean out goofy unicode  space characters 
speeches = [(unicodedata.normalize("NFKD", speech[0]), speech[1]) for speech in speeches if len(speech)>0 ]

# remove [stuff] in between square brackets
def remove_bracket(text):
    return re.sub('(\[[^w]*\]\s)', '',text)
speeches = [(remove_bracket(speech[0]), speech[1]) for speech in speeches]

def get_source(text):
    regex = "[^./][a-zA-Z]+[^/]"
    string = re.findall(regex, str(text))[0]
    if string == 'speeches': string = 'oba'
    if string == 'NYTimes': string = 'nyt'
    return string.lower()

def get_date(text):
    regex = "([0-9]+[\-][0-9]+[\-][0-9]+)"
    return re.findall(regex, str(text))[0]

def get_filename(text):
    regex = "[-]([a-zA-Z]+)"
    return re.findall(regex, str(text))[0]

cols = ['text', 'filepath']
text_df = pd.DataFrame(speeches, columns=cols)
text_df['date'] = text_df['filepath'].apply(get_date)
text_df['date'] = pd.to_datetime(text_df['date'], format='%Y-%m-%d')
text_df['source'] = text_df['filepath'].apply(get_source)

text_df.head(3)

Unnamed: 0,text,filepath,date,source
0,"Good afternoon, everybody. One year ago this m...",./speeches/2014-07-01-Immigration.txt,2014-07-01,oba
1,"Good morning, everybody. I want to take just a...",./speeches/2009-12-25-UnderwearBomber.txt,2009-12-25,oba
2,"Hello, Chicago.\n\nIf there is anyone out ther...",./speeches/2008-11-05-ObamaElected.txt,2008-11-05,oba


In [515]:
text_df['char_count'] = text_df['text'].apply(count_of_letters)
text_df['syl_count'] = text_df['text'].apply(count_total_syllables)
text_df['word_count'] = text_df['text'].apply(count_total_words)
text_df['char_per_word'] = text_df['char_count']/text_df['word_count']#
text_df['syl_per_word'] = text_df['syl_count']/text_df['word_count']#
text_df['sent_count'] = text_df['text'].apply(count_total_sentences)
text_df['word_per_sent'] = text_df['word_count']/text_df['sent_count']#

text_df['dc_word_count'] = text_df['text'].apply(dc_difficult_word_count)
text_df['gf_word_count'] = text_df['text'].apply(gf_complex_word_count)
text_df['poly_word_count'] = text_df['text'].apply(smog_poly_count)

text_df['dc_word_perc'] = text_df['dc_word_count']/text_df['word_count']#
text_df['gf_word_perc'] = text_df['gf_word_count']/text_df['word_count']#
text_df['poly_word_perc'] = text_df['poly_word_count']/text_df['word_count']#

text_df['ari'] = text_df['text'].apply(ari)
text_df['flesch_kincaid'] = text_df['text'].apply(flesch_kincaid)
text_df['coleman_liau'] = text_df['text'].apply(coleman_liau)
text_df['dale_chall'] = text_df['text'].apply(dale_chall)
text_df['gunning_fog'] = text_df['text'].apply(gunning_fog)
text_df['smog'] = text_df['text'].apply(smog)

In [516]:
text_df

Unnamed: 0,text,filepath,date,source,char_count,syl_count,word_count,char_per_word,syl_per_word,sent_count,...,poly_word_count,dc_word_perc,gf_word_perc,poly_word_perc,ari,flesch_kincaid,coleman_liau,dale_chall,gunning_fog,smog
0,"Good afternoon, everybody. One year ago this m...",./speeches/2014-07-01-Immigration.txt,2014-07-01,oba,9131,3044,2057,4.438989,1.479825,112,...,248,0.308216,0.108410,0.120564,8.660673,9.034703,8.689587,13.976300,11.682841,11.629938
1,"Good morning, everybody. I want to take just a...",./speeches/2009-12-25-UnderwearBomber.txt,2009-12-25,oba,5269,1782,1106,4.764014,1.611212,53,...,188,0.331826,0.146474,0.169982,11.442470,11.560787,10.793960,15.590029,14.206121,13.888447
2,"Hello, Chicago.\n\nIf there is anyone out ther...",./speeches/2008-11-05-ObamaElected.txt,2008-11-05,oba,8752,2896,2093,4.181558,1.383660,96,...,188,0.270903,0.079790,0.089823,9.166178,9.239998,7.429890,15.091392,11.912424,11.123544
3,"Good afternoon, everybody. As a candidate for ...",./speeches/2011-12-16-EndIraqWar.txt,2011-12-16,oba,4416,1519,982,4.496945,1.546843,51,...,153,0.340122,0.141548,0.155804,9.378062,10.172161,9.104766,14.920961,13.363875,13.023867
4,"Thank you. Good evening, everybody. Good eveni...",./speeches/2012-04-29-WHCorrespondentsDinner.txt,2012-04-29,oba,5559,1862,1338,4.154709,1.391629,90,...,130,0.305680,0.088191,0.097160,5.572010,6.629226,6.638655,12.200556,9.474320,9.994967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,"President Barack Obama, emphasizing U.S. allie...",./WSJ/2014-03-26-NATOBelgium.txt,2014-03-26,wsj,4810,1627,957,5.026123,1.700104,39,...,178,0.459770,0.168234,0.185998,14.512272,14.041233,12.547335,19.430847,16.544747,15.333674
299,The White House cleared the way for tough new ...,./WSJ/2012-03-31-OilIran.txt,2012-03-31,wsj,2251,765,459,4.904139,1.666667,20,...,81,0.466231,0.156863,0.176471,13.143497,13.027167,11.746580,18.744986,15.454510,14.625780
300,"President Barack Obama, seeking to restore con...",./WSJ/2014-01-29-StateoftheUnion.txt,2014-01-29,wsj,5540,1898,1151,4.813206,1.649001,46,...,211,0.422242,0.166811,0.183319,13.751069,13.626689,11.318679,19.077976,16.681154,15.364192
301,President Barack Obama won re-election Tuesday...,./WSJ/2012-11-07-ObamaWinsRe-election.txt,2012-11-07,wsj,7763,2650,1631,4.759657,1.624770,82,...,290,0.404660,0.156346,0.177805,10.933105,11.339482,10.698614,16.255138,14.209930,13.872383


In [517]:
text_df.drop(['text', 'filepath'], axis=1, inplace=True)

In [518]:
text_df

Unnamed: 0,date,source,char_count,syl_count,word_count,char_per_word,syl_per_word,sent_count,word_per_sent,dc_word_count,...,poly_word_count,dc_word_perc,gf_word_perc,poly_word_perc,ari,flesch_kincaid,coleman_liau,dale_chall,gunning_fog,smog
0,2014-07-01,oba,9131,3044,2057,4.438989,1.479825,112,18.366071,634,...,248,0.308216,0.108410,0.120564,8.660673,9.034703,8.689587,13.976300,11.682841,11.629938
1,2009-12-25,oba,5269,1782,1106,4.764014,1.611212,53,20.867925,367,...,188,0.331826,0.146474,0.169982,11.442470,11.560787,10.793960,15.590029,14.206121,13.888447
2,2008-11-05,oba,8752,2896,2093,4.181558,1.383660,96,21.802083,567,...,188,0.270903,0.079790,0.089823,9.166178,9.239998,7.429890,15.091392,11.912424,11.123544
3,2011-12-16,oba,4416,1519,982,4.496945,1.546843,51,19.254902,334,...,153,0.340122,0.141548,0.155804,9.378062,10.172161,9.104766,14.920961,13.363875,13.023867
4,2012-04-29,oba,5559,1862,1338,4.154709,1.391629,90,14.866667,409,...,130,0.305680,0.088191,0.097160,5.572010,6.629226,6.638655,12.200556,9.474320,9.994967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,2014-03-26,wsj,4810,1627,957,5.026123,1.700104,39,24.538462,440,...,178,0.459770,0.168234,0.185998,14.512272,14.041233,12.547335,19.430847,16.544747,15.333674
299,2012-03-31,wsj,2251,765,459,4.904139,1.666667,20,22.950000,214,...,81,0.466231,0.156863,0.176471,13.143497,13.027167,11.746580,18.744986,15.454510,14.625780
300,2014-01-29,wsj,5540,1898,1151,4.813206,1.649001,46,25.021739,486,...,211,0.422242,0.166811,0.183319,13.751069,13.626689,11.318679,19.077976,16.681154,15.364192
301,2012-11-07,wsj,7763,2650,1631,4.759657,1.624770,82,19.890244,660,...,290,0.404660,0.156346,0.177805,10.933105,11.339482,10.698614,16.255138,14.209930,13.872383


In [519]:
tidy_data

Unnamed: 0,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,sadness,surprise,trust,num_sents,num_words,num_unique_words,depth,TBsubjectivity,TBpolarity,words_per_sentence
0,2008-06-04,nyt,0.064458,0.065088,0.062400,0.064962,0.064416,0.063408,0.055598,0.064920,...,0.075630,0.046218,0.130252,47,1459,620,7.276596,0.406976,0.135880,31.042553
1,2008-06-04,oba,0.064649,0.064670,0.064440,0.064398,0.064471,0.064638,0.054967,0.064503,...,0.042017,0.048739,0.159664,217,5856,939,5.986175,0.445383,0.167408,26.986175
2,2008-06-04,wsj,0.072536,0.071300,0.070185,0.071579,0.072456,0.072177,0.000000,0.072616,...,0.054945,0.047619,0.168498,64,1709,657,6.421875,0.424590,0.178206,26.703125
3,2008-11-05,nyt,0.065299,0.065636,0.065602,0.064593,0.065131,0.065266,0.042603,0.065972,...,0.079618,0.041401,0.114650,54,1862,735,7.259259,0.412085,0.115966,34.481481
4,2008-11-05,oba,0.067288,0.067345,0.064371,0.067174,0.067145,0.067260,0.066802,0.066745,...,0.062222,0.046667,0.160000,96,2254,755,5.760417,0.435926,0.144582,23.479167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2016-11-09,oba,0.069479,0.069178,0.069730,0.069078,0.068376,0.069429,0.042841,0.069329,...,0.034014,0.091837,0.187075,79,1303,470,4.392405,0.490976,0.237315,16.493671
296,2016-11-09,wsj,0.069166,0.068667,0.068939,0.068576,0.068078,0.069120,0.000000,0.069211,...,0.076364,0.080000,0.152727,61,1446,623,6.098361,0.397283,0.114535,23.704918
297,2017-01-11,nyt,0.068761,0.069864,0.067920,0.065872,0.069811,0.069916,0.041708,0.070074,...,0.073826,0.057047,0.161074,53,1275,558,5.679245,0.454084,0.178910,24.056604
298,2017-01-11,oba,0.066897,0.066921,0.066529,0.066823,0.066811,0.066885,0.066676,0.066493,...,0.050076,0.044765,0.157815,231,5143,1592,4.900433,0.500204,0.182039,22.264069


In [521]:
new = pd.merge(tidy_data, text_df, on=['source', 'date'], how="left")

In [522]:
new

Unnamed: 0,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,poly_word_count,dc_word_perc,gf_word_perc,poly_word_perc,ari,flesch_kincaid,coleman_liau,dale_chall,gunning_fog,smog
0,2008-06-04,nyt,0.064458,0.065088,0.062400,0.064962,0.064416,0.063408,0.055598,0.064920,...,249,0.380171,0.157632,0.177603,15.452576,15.325928,10.632325,20.798477,18.237193,16.278189
1,2008-06-04,oba,0.064649,0.064670,0.064440,0.064398,0.064471,0.064638,0.054967,0.064503,...,692,0.291358,0.108995,0.122046,12.214807,11.736337,8.759753,17.560543,14.811401,13.330696
2,2008-06-04,wsj,0.072536,0.071300,0.070185,0.071579,0.072456,0.072177,0.000000,0.072616,...,289,0.422932,0.169799,0.181078,13.762434,13.247780,11.381454,19.047102,16.766980,15.268686
3,2008-11-05,nyt,0.065299,0.065636,0.065602,0.064593,0.065131,0.065266,0.042603,0.065972,...,305,0.367757,0.156053,0.174986,16.572693,16.143715,10.577900,21.816657,19.153222,16.705917
4,2008-11-05,oba,0.067288,0.067345,0.064371,0.067174,0.067145,0.067260,0.066802,0.066745,...,188,0.270903,0.079790,0.089823,9.166178,9.239998,7.429890,15.091392,11.912424,11.123544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2016-11-09,oba,0.069479,0.069178,0.069730,0.069078,0.068376,0.069429,0.042841,0.069329,...,144,0.249196,0.106109,0.115756,6.142879,6.942263,6.913215,11.745237,10.543107,10.841910
296,2016-11-09,wsj,0.069166,0.068667,0.068939,0.068576,0.068078,0.069120,0.000000,0.069211,...,215,0.413254,0.137007,0.160089,12.226879,11.895723,11.130335,17.445410,14.286825,13.854148
297,2017-01-11,nyt,0.068761,0.069864,0.067920,0.065872,0.069811,0.069916,0.041708,0.070074,...,201,0.417029,0.159861,0.174631,11.795875,12.102038,10.760626,17.356505,15.081232,14.254228
298,2017-01-11,oba,0.066897,0.066921,0.066529,0.066823,0.066811,0.066885,0.066676,0.066493,...,631,0.323535,0.117986,0.129702,10.026644,10.249659,8.919137,15.554685,13.143667,12.570875


### Save to csv, leave commented to not accidently write over existing file by accident with trash

In [523]:
# Save data
#new.to_csv('tidy_data.csv', index=False)