In [1]:
# For dependency parse tree depth
# Using Stanford's CoreNLP parser with NLTK
# 1. Download CoreNLP from https://stanfordnlp.github.io/CoreNLP/download.html
# 2. make sure Java is installed, otherwise download and install Java - https://www.java.com/en/download/windows_manual.jsp
# 3. Unzip/extract CoreNLP zip file to a directory
# 4. Go to that directory and open a command terminal, and run the following command...
# 4b. On my laptop it's in C:\Users\peter\stanford-corenlp-4.5.2
# 5. java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# 6. Optional: to view the parse trees, download Graphviz from https://graphviz.org/download/ and install it
# 7. Now you can run the following Python code

In [2]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_md
import regex as re
import os
import sys
import unicodedata
import re
import stanza
from collections import defaultdict
import nltk
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict
stanza.download('en')
nltk.download('punkt')
word_token = TreebankWordTokenizer()
from nrclex import NRCLex
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
nltk.download ('wordnet')
from nltk.parse.corenlp import CoreNLPDependencyParser

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-04-28 17:34:04 INFO: Downloading default packages for language: en (English) ...
2023-04-28 17:34:05 INFO: File exists: C:\Users\peter\stanza_resources\en\default.zip
2023-04-28 17:34:09 INFO: Finished downloading models and saved to C:\Users\peter\stanza_resources.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
# Make sure the English Stanza models are present, then build the two NLP
# pipelines used further down (Stanza for POS tagging, spaCy for parse trees).
stanza.download('en')
nlp_stanza = stanza.Pipeline('en', processors='tokenize, lemma, pos')
nlp_spacy = spacy.load("en_core_web_md")
stopwords = pd.read_table('./Data/word_lists/kaggle_stopwords.txt')

# Corpus configuration: speech directories, output-file suffix, source label
# and the CSV that maps each speech file to its date.
paths = ['./Data/amrhet/']
suffix = 'amrhet'
source = 'amrhet'
dates_file = 'datetitle.csv'

dates = pd.read_csv(dates_file)
dates['date'] = pd.to_datetime(dates['date'], format='%Y-%m-%d')
# Normalise the lookup table to (date, file): a "url" column, when present, is
# renamed to "file" and an optional "title" column is dropped. Both calls are
# no-ops when the column is missing, so no try/except is needed.
dates = dates.rename(columns={"url": "file"}).drop(columns='title', errors='ignore')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-04-29 08:35:52 INFO: Downloading default packages for language: en (English) ...
2023-04-29 08:35:54 INFO: File exists: C:\Users\peter\stanza_resources\en\default.zip
2023-04-29 08:35:58 INFO: Finished downloading models and saved to C:\Users\peter\stanza_resources.
2023-04-29 08:35:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-04-29 08:35:58 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2023-04-29 08:35:58 INFO: Use device: gpu
2023-04-29 08:35:58 INFO: Loading: tokenize
2023-04-29 08:35:59 INFO: Loading: pos
2023-04-29 08:35:59 INFO: Loading: lemma
2023-04-29 08:35:59 INFO: Done loading processors!


In [124]:
print('---------LOADING DOCUMENTS----------')
# Load every non-empty .txt speech found under each configured path.
# list_of_files and speeches are appended together so that entry k of one
# always describes entry k of the other (they are zipped into a DataFrame
# further down). Resetting list_of_files per path, or filtering empty
# speeches afterwards, would silently break that alignment.
list_of_files = []
speeches = []
for path in paths:
    for root, _dirs, files in os.walk(path):
        for file in files:
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            # The with-block closes the file; no explicit close() needed.
            with open(file_path, encoding='utf-8') as f:
                text = f.read()
            if len(text) == 0:
                continue  # skip empty files
            # NFKD normalisation cleans out the odd unicode space characters;
            # it already maps the non-breaking space \xa0 to a plain space, so
            # no separate replace step is required.
            list_of_files.append(file_path)
            speeches.append(unicodedata.normalize("NFKD", text))
# remove [stuff] in between square brackets
def remove_bracket(text):
    '''Remove [bracketed] annotations such as "[applause]" from a text.
       Each bracket group is matched on its own (nothing inside may be a "]"),
       together with the single whitespace character that follows it, if any.
       input: string
       returns: string with the bracketed spans removed'''
    return re.sub(r'\[[^\]]*\](?:\s|$)', '', text)
speeches = [remove_bracket(speech) for speech in speeches]
# Collapse every run of whitespace (spaces, tabs, newlines) into one space.
# The pattern must be r'\s+'; written as a character class ('[\s+]') it would
# also replace literal "+" signs and would not collapse runs.
speeches = [re.sub(r'\s+', ' ', speech) for speech in speeches]
# Remove -- that's all over the amrhet files
def remove_dashes(text):
    '''Strip every "-- " (two hyphens followed by a space) from the text.
       input: string
       returns: string without those dash markers'''
    dash_marker = '-- '
    return text.replace(dash_marker, '')
speeches = [remove_dashes(speech) for speech in speeches]

# One row per speech: attach the date from the lookup table (inner join, so
# speeches without a date entry are dropped), order chronologically and index
# by file name.
text_df = (
    pd.DataFrame({'file': list_of_files, 'text': speeches})
    .merge(dates, how='inner', on='file')
    .sort_values(by='date', ignore_index=True)
    [['date', 'file', 'text']]
)
# Use the configured source label rather than a hard-coded one so this column
# agrees with the `source` value written to the generated CSV files.
text_df['source'] = source
text_df = text_df.set_index('file')

# Tokenised views of each speech and their sizes.
text_df['sentences'] = text_df['text'].apply(sent_tokenize)
text_df['words'] = text_df['text'].apply(word_token.tokenize)
text_df['word_set'] = text_df['words'].apply(set)
text_df['num_sents'] = text_df['sentences'].apply(len)
text_df['num_words'] = text_df['words'].apply(len)
text_df['num_unique_words'] = text_df['word_set'].apply(len)
print("Length of text_df:", len(text_df))

---------LOADING DOCUMENTS----------
Length of text_df: 433


In [125]:
############# POS TAGGING ###################
print('---------POS TAGGING---------')

# nlp_stanza(text) uses a lot of gpu memory and sometimes errors out; restarting
# the notebook may be needed to freshen things up.
parts_of_speech = ['NUM','ADV','SYM','NOUN','ADP','PROPN','DET','INTJ','AUX',
                   'CCONJ','ADJ','PRON','SCONJ','X','VERB','PUNCT','PART']
# Initialise as float: the columns hold proportions, and writing floats into
# integer columns triggers dtype-incompatibility warnings in recent pandas.
for col in parts_of_speech:
    text_df[col] = 0.0
print("Length of text_df:", len(text_df))
print('...pst, this is slow')
for i, text in enumerate(text_df.text):
    doc = nlp_stanza(text)  # Run stanza on each speech
    # Tally the POS tag of every word in every sentence in a single pass.
    pos_counts = defaultdict(int)
    for sentence in doc.sentences:
        for word in sentence.words:
            pos_counts[word.pos] += 1
    total_pos_count = sum(pos_counts.values())
    for pos in parts_of_speech:
        # A text without any tokens has no defined proportions: store NaN so
        # the dropna() below removes the row instead of dividing by zero.
        if total_pos_count:
            proportion = pos_counts.get(pos, 0) / total_pos_count
        else:
            proportion = np.nan
        text_df.iloc[i, text_df.columns.get_loc(pos)] = proportion
        if i%40==0: print(i, pos, proportion)

text_df = text_df.dropna()
print("Length of text_df:", len(text_df))
print("----------------------DONE!!!!!!!!!!!!-------------------")

---------POS TAGGING---------
Length of text_df: 433
...pst, this is slow
0 NUM 0.0009478672985781991
0 ADV 0.024644549763033177
0 SYM 0.0
0 NOUN 0.1895734597156398
0 ADP 0.11374407582938388
0 PROPN 0.046445497630331754
0 DET 0.0995260663507109
0 INTJ 0.0
0 AUX 0.04549763033175355
0 CCONJ 0.04928909952606635
0 ADJ 0.07203791469194312
0 PRON 0.0976303317535545
0 SCONJ 0.01990521327014218
0 X 0.0
0 VERB 0.10900473933649289
0 PUNCT 0.10710900473933649
0 PART 0.024644549763033177
40 NUM 0.003927729772191673
40 ADV 0.03534956794972506
40 SYM 0.0
40 NOUN 0.16496465043205027
40 ADP 0.09478921183555905
40 PROPN 0.05943964388583399
40 DET 0.0945273631840796
40 INTJ 0.0005236973029588898
40 AUX 0.061534433097669546
40 CCONJ 0.05027494108405342
40 ADJ 0.07462686567164178
40 PRON 0.09714584969887405
40 SCONJ 0.02251898402723226
40 X 0.0
40 VERB 0.11442786069651742
40 PUNCT 0.09583660644147683
40 PART 0.03011259492013616
80 NUM 0.012147195426938193
80 ADV 0.0420388233893057
80 SYM 0.000357270453733

In [None]:
print('---POS Long-facet data---')
# Reshape POS data to be long for facet plots: one row per (speech, POS tag).
# Rows are collected in a list and turned into a DataFrame once; appending to a
# DataFrame row by row is quadratic. Positional access (.iloc) is used because
# text_df is indexed by file name, not by integer position.
pos_long_rows = [
    [text_df['date'].iloc[i], text_df.index[i], source, pos, text_df[pos].iloc[i]]
    for i in range(len(text_df))
    for pos in parts_of_speech
]
POSFacetPlotData = pd.DataFrame(pos_long_rows,
                                columns=['date', 'file', 'source', 'pos', 'proportion'])
POSFacetPlotData['date'] = pd.to_datetime(POSFacetPlotData['date'])
POSFacetPlotData.to_csv('./Data/genData/pos_long_data'+suffix+'.csv', index=False)

print("Length of text_df:", len(text_df))
print('DONE!!!!!!!!!!!!!!!!!!!!!!!!!!!')

In [12]:
print('---------NRCLex EMOTION TAGGING---------')
text_df = text_df.dropna()
text_df['emo'] = text_df.text.apply(NRCLex)

# Names of the emotion attributes (only the keys are used below; the values are
# an example of what affect_frequencies looks like). 'anticip' is skipped as it
# seems to always be 0.
emotions = {'fear': 0.077,
            'anger': 0.019,
            'anticip': 0.0,
            'trust': 0.203,
            'surprise': 0.019,
            'positive': 0.330,
            'negative': 0.106,
            'sadness': 0.067,
            'disgust': 0.019,
            'joy': 0.087,
            'anticipation': 0.067}
Attributes = list(emotions.keys())
antIndx = Attributes.index('anticip')

# Make a column of emotion frequencies for each emotion.
# We could also try correlation between each of these columns and the econ/approval data.
# Values are looked up by emotion name rather than by position in
# affect_frequencies.values(): positional access depends on dict ordering and
# raises IndexError when a key is absent for a text. Missing keys count as 0.0.
for i, attr in enumerate(Attributes):
    if i == antIndx:
        continue
    text_df[attr] = [emotion.affect_frequencies.get(attr, 0.0) for emotion in text_df['emo']]


---------NRCLex EMOTION TAGGING---------


In [None]:
print('----------TextBlob----------')
# Analyse each text once and read both sentiment fields from the same result,
# instead of building a second TextBlob per text.
tb_sentiments = [TextBlob(text).sentiment for text in text_df['text']]
text_df['TBsubjectivity'] = [sentiment.subjectivity for sentiment in tb_sentiments]
text_df['TBpolarity'] = [sentiment.polarity for sentiment in tb_sentiments]

In [None]:
print('----------READABILITY----------')

########## HELPER FUNCTIONS #############
def words_per_sentence(sentence):
    '''Count the words (runs of word characters) in one sentence.
       input: string
       returns: integer word count'''
    word_tokens = RegexpTokenizer(r'\w+').tokenize(sentence)
    return len(word_tokens)

def chars_per_word(word):
    '''input: a single word
       returns: integer length of that word'''
    n_chars = len(word)
    return n_chars

def string_to_list(sentence):
    '''Split a sentence into its words (runs of word characters).
       input: string
       returns: list of word strings'''
    word_splitter = RegexpTokenizer(r'\w+')
    words = word_splitter.tokenize(sentence)
    return words

def chars_per_word_sentence(sentence):
    '''Character length of every word in a sentence.
       input: string of a sentence
       returns: list of integers, one length per word, in sentence order'''
    lengths = []
    for token in RegexpTokenizer(r'\w+').tokenize(sentence):
        lengths.append(chars_per_word(token))
    return lengths

def text_to_sentence(text):
    '''uses the spaCy pipeline (nlp_spacy) to break text up into sentences
       input: pandas series of strings
       returns: list of sentence strings'''
    # The notebook only defines nlp_spacy and nlp_stanza; there is no bare
    # `nlp` object, so the spaCy pipeline is referenced by its actual name.
    doc = nlp_spacy(' '.join(text.tolist()))
    assert doc.has_annotation("SENT_START")
    return [str(sent) for sent in doc.sents]

def text_to_wordlist(text):
    '''Flatten text into whitespace-separated words.
       input: a string, or an iterable of strings (e.g. pandas series)
       returns: list of all words'''
    pieces = [text] if isinstance(text, str) else text
    joined = ' '.join(pieces)
    return joined.split()

def syllable_count(word):
    '''Heuristic syllable count: one per vowel group, minus a trailing "e".
       input: string - word
       returns: integer count, never less than 1 (an empty word also yields 1,
                matching the backup counter syllables())'''
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    # Guard the first-letter check so an empty string does not raise IndexError.
    if word and word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        # A vowel that follows a non-vowel starts a new vowel group.
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1  # trailing "e" is treated as silent
    if count == 0:
        count += 1  # every word has at least one syllable
    return count

def count_total_words(text):
    '''count total number of words in a text
       input: str, list of sentences, or pandas.core.series.Series of text
       returns: integer
       raises: TypeError for any other input type'''
    if isinstance(text, list):
        list_of_sentence = text
    elif isinstance(text, str):
        list_of_sentence = [text]
    elif isinstance(text, pd.Series):
        list_of_sentence = text_to_sentence(text)
    else:
        # Fail with a clear error instead of printing and then crashing on an
        # unbound variable a line later.
        raise TypeError('count_total_words: Error: not a string, list or pandas series object.')
    list_of_word_count = [words_per_sentence(str(sentence)) for sentence in list_of_sentence]
    return np.sum(list_of_word_count)

def count_total_sentences(text):
    '''Number of sentences found by sent_tokenize.
       input: str or pandas.core.series.Series of text (a series is joined
              with single spaces first)
       returns: integer'''
    joined = ' '.join(text) if isinstance(text, pd.Series) else text
    return len(sent_tokenize(joined))

# Count Syllables
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word
def syllables(word):
    '''Fallback syllable counter for words missing from the NLTK-CMU dictionary.
       Counts vowel groups, treats a trailing "e" as silent unless the word
       ends in "le", and never returns less than 1.'''
    # referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    vowels = 'aeiouy'
    word = word.lower()
    # A leading vowel opens the first group (an empty word has none).
    count = 1 if (word and word[0] in vowels) else 0
    # Each vowel directly preceded by a non-vowel opens another group.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count = 1
    return count

d = cmudict.dict()
def nsyl(word):
    '''Syllable count taken from the CMU pronouncing dictionary.
       Uses the first listed pronunciation (words like 'of' have several) and
       counts the phonemes that end in a digit; falls back to syllables()
       when the word is not in the dictionary.
       input: string - word
       returns: integer count of syllables in word'''
    try:
        pronunciations = d[word.lower()]
    except KeyError:
        # word not found in cmudict
        return syllables(word)
    first_pronunciation = pronunciations[0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

def count_total_syllables(text):
    '''count total number of syllables in a text
       input: str, list of strings, or pandas.core.series.Series of text
       returns: integer count
       raises: TypeError for any other input type'''
    if isinstance(text, list):
        # Accepted for parity with count_total_words.
        list_of_sentence = text
    elif isinstance(text, str):
        list_of_sentence = [text]
    elif isinstance(text, pd.Series):
        list_of_sentence = text_to_sentence(text)
    else:
        # Fail with a clear error instead of printing and then crashing on an
        # unbound variable a line later.
        raise TypeError('count_total_syllables: Error: not a string, list or pandas series object.')
    # Words are split on whitespace, so attached punctuation stays on the word.
    list_of_words = text_to_wordlist(list_of_sentence)
    syllable_list = [nsyl(word) for word in list_of_words]
    return np.sum(syllable_list)

def count_of_letters(text):
    '''count total number of letters or digits in a text
       input: str or pandas.core.series.Series of text (a series is joined
              with single spaces first)
       returns: integer count'''
    if isinstance(text, pd.Series):
        text = ' '.join(text)
    # Drop everything that is not a letter or digit: punctuation, underscores
    # and ALL whitespace (tabs and newlines too, not just plain spaces).
    letters_only = re.sub(r"[\W_]+", "", text)
    return len(letters_only)


def difficult_words_list(list1, easy_words_path='./Data/word_lists/DaleChallEasyWordList.txt'):
    '''Words of a text that are not on the Dale-Chall easy word list.
       input: list1 - str, pandas series of strings (joined with spaces), or a
                      list (only its first element is used)
              easy_words_path - path of the easy word list, one word per line
       returns: list of lower-cased words (whitespace-split, duplicates kept)
                that are not in the easy word list
       raises: OSError if the easy word list cannot be read'''
    if isinstance(list1, pd.Series):
        list1 = ' '.join(list1)
    if isinstance(list1, list):
        list1 = list1[0]
    # The with-block closes the file even when reading fails; a read error
    # propagates as OSError instead of being masked by a NameError.
    with open(easy_words_path, 'r') as easy_words_file:
        easy_words = easy_words_file.read().split('\n')
    easy_words = {word.lower() for word in easy_words}
    diff = [word.lower() for word in list1.split() if word.lower() not in easy_words]
    return diff

def dc_difficult_word_count(text):
    '''Number of difficult words, i.e. words missing from the Dale-Chall
       Easy Word List, as returned by difficult_words_list.
       input: str or pandas.core.series.Series of text
       returns: integer count of difficult words in text'''
    return len(difficult_words_list(text))

lemmatizer = WordNetLemmatizer()
def gf_complex_word_count(text):
    '''Count of complex words for the Gunning Fog index: words with 3 or more
       syllables after stripping a trailing "ing", "ed", "es" or "ly".
       input: str, pandas.core.series.Series of text (joined with spaces), or
              a list (only its first element is used)
       returns: integer count of complex words in text'''
    if isinstance(text, pd.Series):
        text = ' '.join(text)
    if isinstance(text, list):
        text = str(text[0])
    words = [word.lower() for word in text.split()]
    # Only the suffix-stripped form is scored; the lemmatised list that used to
    # be built here was never read, so it is no longer computed.
    stem = [re.sub("(?:ing|ed|es|ly)$", "", word) for word in words]
    syllable_list = [nsyl(word) for word in stem]
    complex_count = sum(x > 2 for x in syllable_list)
    return complex_count

def smog_poly_count(text):
    '''Number of polysyllabic words (3 or more syllables according to nsyl).
       input: str, pandas.core.series.Series of text (joined with spaces), or
              a list (only its first element is used)
       returns: integer count of polysyllabic words in text'''
    if isinstance(text, pd.Series):
        text = ' '.join(text)
    if isinstance(text, list):
        text = str(text[0])
    return sum(1 for token in text.split() if nsyl(token.lower()) > 2)


######################## READABILITY SCORES ########################

# https://en.wikipedia.org/wiki/Automated_readability_index
# 4.71(chars/word) + 0.5(words/sentence) - 21.43
def ari(text):
    '''Automated Readability Index.
       input: string of text
       returns: float ari score'''
    n_chars = count_of_letters(text)
    n_words = count_total_words(text)
    n_sents = count_total_sentences(text)
    avg_chars_per_word = n_chars / n_words
    avg_words_per_sentence = n_words / n_sents
    return 4.71*avg_chars_per_word + 0.5*avg_words_per_sentence - 21.43

# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
# Flesch–Kincaid grade level
# 0.39(total words/total sentences) + 11.8(total syllables/total words) - 15.59
def flesch_kincaid(text):
    '''Flesch-Kincaid grade level.
       input: string or pandas series of text, multiple sentences
       returns: float - grade level score'''
    n_words = count_total_words(text)
    n_sents = count_total_sentences(text)
    n_syllables = count_total_syllables(text)
    avg_words_per_sentence = n_words / n_sents
    avg_syllables_per_word = n_syllables / n_words
    return 0.39*avg_words_per_sentence + 11.8*avg_syllables_per_word - 15.59

# https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
# 0.0588(average number of letters per 100 words) - 0.296(average number of sentences per 100 words) - 15.8
def coleman_liau(text):
    '''Coleman-Liau index.
       input: string or pandas series of text, multiple sentences
       returns: float - Coleman-Liau index'''
    n_chars = count_of_letters(text)
    n_words = count_total_words(text)
    n_sents = count_total_sentences(text)
    letters_per_100_words = n_chars/n_words*100
    sentences_per_100_words = n_sents/n_words*100
    return 0.0588*letters_per_100_words - 0.296*sentences_per_100_words - 15.8

# https://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula
# 0.1579(100*difficult words/words) + 0.496(words/sentences)
def dale_chall(text):
    '''Dale-Chall readability score as given by the formula in the comment
       above (percentage of difficult words plus average sentence length).
       input: string or pandas series of text, multiple sentences
       returns: float - Dale-Chall readability score'''
    n_difficult = dc_difficult_word_count(text)
    n_words = count_total_words(text)
    n_sents = count_total_sentences(text)
    pct_difficult = 100*n_difficult/n_words
    avg_words_per_sentence = n_words/n_sents
    return 0.1579*pct_difficult + 0.496*avg_words_per_sentence

# https://en.wikipedia.org/wiki/Gunning_fog_index
# 0.4[(words/sentence) + 100(complex words/words)]
def gunning_fog(text):
    '''Gunning Fog index.
       input: string or pandas series of text, multiple sentences
       returns: float - Gunning Fog index readability score'''
    n_complex = gf_complex_word_count(text)
    n_words = count_total_words(text)
    n_sents = count_total_sentences(text)
    avg_words_per_sentence = n_words/n_sents
    complex_share = n_complex/n_words
    return 0.4*(avg_words_per_sentence + 100*complex_share)

# https://en.wikipedia.org/wiki/SMOG
# 1.043*sqrt(30*number polysylables/number sentences)+3.1291
def smog(text):
    '''SMOG grade.
       input: string or pandas series of text, multiple sentences
       returns: float - SMOG grade readability score'''
    n_polysyllabic = smog_poly_count(text)
    n_sents = count_total_sentences(text)
    scaled_poly = 30*n_polysyllabic/n_sents
    return 1.043*np.sqrt(scaled_poly) + 3.1291

# Raw per-speech counts and the ratios derived from them. Columns are created
# in the same order as before so the resulting frame layout is unchanged.
speech_text = text_df['text']
for column, counter in (('char_count', count_of_letters),
                        ('syl_count', count_total_syllables),
                        ('word_count', count_total_words)):
    text_df[column] = speech_text.apply(counter)
text_df['char_per_word'] = text_df['char_count'] / text_df['word_count']
text_df['syl_per_word'] = text_df['syl_count'] / text_df['word_count']
text_df['sent_count'] = speech_text.apply(count_total_sentences)
text_df['word_per_sent'] = text_df['word_count'] / text_df['sent_count']

# Counts of "hard" words under each definition, then their share of all words.
hard_word_counters = (('dc', dc_difficult_word_count),
                      ('gf', gf_complex_word_count),
                      ('poly', smog_poly_count))
for prefix, counter in hard_word_counters:
    text_df[prefix + '_word_count'] = speech_text.apply(counter)
for prefix, _ in hard_word_counters:
    text_df[prefix + '_word_perc'] = text_df[prefix + '_word_count'] / text_df['word_count']

# Readability scores; each column is named after the function that fills it.
for score in (ari, flesch_kincaid, coleman_liau, dale_chall, gunning_fog, smog):
    text_df[score.__name__] = speech_text.apply(score)

In [None]:
print('----------PARALLEL PHRASE COUNT---------')
parser = CoreNLPParser()

def _has_parallel_pos_run(poss):
    '''True when a run of 7, 6, 5 or 4 POS tags is repeated elsewhere in the
       tag list, with the two start positions more than the run length apart.'''
    for length in range(7, 3, -1):
        length = min(length, len(poss))
        n_starts = len(poss) - length + 1
        for i in range(n_starts):
            window = poss[i:i+length]
            # Runs of 4 tags or fewer only count when they contain neither a
            # comma tag nor an opening-quote tag.
            if length <= 4 and (',' in window or '``' in window):
                continue
            for j in range(n_starts):
                if abs(i-j) > length and window == poss[j:j+length]:
                    return True
    return False

def count_parallels(sents):
    '''Count the sentences that contain a repeated POS-tag run.
       Each sentence is parsed with the CoreNLP `parser`; sentences that fail
       to parse are skipped (a hint is printed the first time this happens).
       input: iterable of sentence strings
       returns: integer number of sentences with at least one parallel run'''
    count = 0
    print_error = True
    for phrase in sents:
        try:
            sent, = parser.parse_text(phrase)
        except Exception:
            # Best effort: skip this sentence, but let KeyboardInterrupt and
            # SystemExit through so a long run can still be stopped.
            if print_error:
                print("Be sure Stanford CoreNLP server started and parser instantiated!")
                print("Errors also seem to occur with quotes")
                print_error = False
            continue
        poss = [tag for _, tag in sent.pos()]
        if _has_parallel_pos_run(poss):
            count += 1

    return count

text_df['parallel_count'] = text_df.sentences.apply(count_parallels)
text_df['parallel_per_sent'] = text_df.parallel_count/text_df.num_sents

In [None]:
print('----------PARSE TREE DEPTH----------')

# Make data frame of sentences and parse tree depth of each
def walk_tree_depth(node, depth):
    '''Depth of the deepest leaf below `node`, where `node` itself sits at `depth`.
       A node counts as a leaf when n_lefts + n_rights is 0.'''
    if node.n_lefts + node.n_rights == 0:
        return depth
    child_depths = [walk_tree_depth(child, depth + 1) for child in node.children]
    return max(child_depths)
    
# One record per sentence, collected in a list and converted once (appending to
# a DataFrame row by row is quadratic). The speech's file name (the index of
# text_df) and its actual date are stored in separate columns.
depth_records = []
for i, speech in enumerate(text_df['sentences']):
    speech_file = text_df.index[i]
    speech_date = text_df['date'].iloc[i]
    speech_source = text_df['source'].iloc[i]
    for sentence in speech:
        doc = nlp_spacy(sentence)
        sentence_depths = [walk_tree_depth(sent.root, 0) for sent in doc.sents]
        # Use the first parsed sentence; fall back to 0 when nothing was parsed
        # (e.g. an empty string) instead of raising IndexError.
        depth = sentence_depths[0] if sentence_depths else 0
        depth_records.append([speech_file, speech_date, speech_source, sentence, depth])
tree_depth = pd.DataFrame(depth_records,
                          columns=['file', 'date', 'source', 'sentence', 'depth'])

# Mean depth per speech, written back by file name so text_df keeps its index.
mean_depth = tree_depth.groupby('file')['depth'].mean()
text_df['depth'] = text_df.index.map(mean_depth)
tree_depth.to_csv('./Data/genData/sentence_depth_'+suffix+'.csv',index=False)

In [158]:
# Work on a separate frame in which the current index of text_df becomes a
# regular column, and load the previously generated tidy data.
pos = text_df.reset_index(drop=False)
rest = pd.read_csv('./Data/genData/tidy_data_amrhet.csv')

In [160]:
# Columns present in both frames, listed in pos's column order so the result is
# the same on every run. 'file' is excluded because it is the merge key.
rest_columns = set(rest.columns)
intersection = [col for col in pos.columns if col in rest_columns and col != 'file']
intersection

['DET',
 'INTJ',
 'num_sents',
 'PRON',
 'VERB',
 'NOUN',
 'source',
 'AUX',
 'num_unique_words',
 'SCONJ',
 'PROPN',
 'PART',
 'ADV',
 'SYM',
 'ADP',
 'CCONJ',
 'ADJ',
 'X',
 'num_words',
 'NUM',
 'PUNCT']

In [161]:
# Drop the shared columns from the old data so the fresh values in pos win.
# Rebinding (instead of inplace) with errors='ignore' keeps the cell re-runnable.
rest = rest.drop(columns=intersection, errors='ignore')

In [162]:
# Combine the new columns with the remaining old ones, matching rows on 'file'.
df = pos.merge(rest, on='file', how='inner')

In [165]:
#df.to_csv('./Data/genData/tidy_data_amrhet.csv', index=False)