# Moral Foundations Dictionary Counter

imports -> these should go into the requirements.txt

In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
import re 
import spacy

from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
punctuation += '’'
for i in range(0,10):
    punctuation += str(i)
    
stopwords = set(list(nltk_stopwords) + list(ENGLISH_STOP_WORDS) + list(STOP_WORDS))
from collections import Counter
import re, fnmatch

***

## Load MFDs and CSV with Text

In [808]:
# Load E-MFD
emfd = pd.read_pickle('dictionaries/emfd_scoring.pkl')
probabilites = [c for c in emfd.columns if c.endswith('_p')]
foundations = ['care','fairness','loyalty','authority','sanctity']
senti = [c for c in emfd.columns if c.endswith('_sent')]
emfd = emfd.T.to_dict()

In [149]:
mfd2_foundations

array(['care.virtue', 'care.vice', 'authority.virtue', 'fairness.vice',
       'fairness.virtue', 'loyalty.vice', 'loyalty.virtue',
       'sanctity.virtue', 'authority.vice', 'sanctity.vice'], dtype=object)

In [203]:
# Load MFD
MFD = 'dictionaries/mft_original.dic'
nummap = dict()
mfd = dict()
mfd_regex = dict()
wordmode = True
with open(MFD, 'r') as f:
    for line in f.readlines():
        ent = line.strip().split()
        if line[0] == '%':
            wordmode = not wordmode
        elif len(ent) > 0:
            if wordmode:
                mfd[ent[0]] = [nummap[e] for e in ent[1:]]
            else:
                nummap[ent[0]] = ent[1]
                
mfd_foundations = ['care.virtue', 'care.vice', 'authority.virtue', 'fairness.vice',
       'fairness.virtue', 'loyalty.vice', 'loyalty.virtue',
       'sanctity.virtue', 'authority.vice', 'sanctity.vice', 'moral']
# convert vocab to compiled regex for comparison
for v in mfd.keys():
    mfd_regex[v] = re.compile(fnmatch.translate(v))

In [66]:
# Load MFD2.0 
MFD2 = 'dictionaries/mfd2.0.dic'
nummap = dict()
mfd2 = dict()
wordmode = True
with open(MFD2, 'r') as f:
    for line in f.readlines():
        ent = line.strip().split()
        if line[0] == '%':
            wordmode = not wordmode
        elif len(ent) > 0:
            if wordmode:
                wordkey = ''.join([e for e in ent if e not in nummap.keys()])
                mfd2[wordkey] = [nummap[e] for e in ent if e in nummap.keys()]
            else:
                nummap[ent[0]] = ent[1]

mfd2 = pd.DataFrame.from_dict(mfd2).T
mfd2_foundations = mfd2[0].unique()
mfd2['foundation'] = mfd2[0]
del mfd2[0]
mfd2 = mfd2.T.to_dict()

In [423]:
csv = pd.read_csv('data/mfdc_input.csv', header=None)

***

## Build spaCy Scoring Pipeline

In [224]:
def tokenizer(doc):
    
    '''Performs minimal preprocessing on textual document.
    Steps include tokenization, lower-casing, and 
    stopword/punctuation/whitespace removal. 
    Returns list of processed tokens'''
    
    return  [x.lower_ for x in doc if x.lower_ not in stopwords and not x.is_punct and not x.is_digit and not x.is_quote and not x.like_num and not x.is_space] 

In [110]:
def score_emfd(doc):
    
    '''Scores documents with the e-MFD.'''
    
    emfd_score = {k:0 for k in probabilites+senti}
    moral_words = [ emfd[token] for token in doc if token in emfd.keys() ]
    
    for dic in moral_words:
        emfd_score['care_p'] += dic['care_p']
        emfd_score['fairness_p'] += dic['fairness_p']
        emfd_score['loyalty_p'] += dic['loyalty_p']
        emfd_score['authority_p'] += dic['authority_p']
        emfd_score['sanctity_p'] += dic['sanctity_p']
        
        emfd_score['care_sent'] += dic['care_sent']
        emfd_score['fairness_sent'] += dic['fairness_sent']
        emfd_score['loyalty_sent'] += dic['loyalty_sent']
        emfd_score['authority_sent'] += dic['authority_sent']
        emfd_score['sanctity_sent'] += dic['sanctity_sent']
    
    emfd_score = {k:v/len(doc) for k,v in emfd_score.items()}
    nonmoral_words = len(doc)-len(moral_words)
    emfd_score['moral_nonmoral_ratio'] =  len(moral_words)/nonmoral_words 
    return emfd_score

In [226]:
def score_mfd(doc):
    
    '''Scores documents with the original MFD.'''
    
    mfd_score = {k:0 for k in mfd_foundations}
    moral_words = []
    for token in doc:
        for v in mfd_regex.keys():
            if mfd_regex[v].match(token):
                for f in mfd[v]:
                    mfd_score[f] += 1
    
    mfd_score = {k:v/len(doc) for k,v in mfd_score.items()}
    return mfd_score

In [136]:
def score_mfd2(doc):
    
    '''Scores documents with the MFD2.'''
    
    mfd2_score = {k:0 for k in mfd2_foundations}
    moral_words = [ mfd2[token]['foundation'] for token in doc if token in mfd2.keys() ]
    f_counts = Counter(moral_words)
    mfd2_score.update(f_counts)    
    mfd2_score = {k:v/len(doc) for k,v in mfd2_score.items()}
    return mfd2_score

In [245]:
def score_docs(csv, dic_type):
    
    '''Wrapper function that executes preprocessing and dictionary scoring. 
    dict_type specifies the dicitonary with which the documents should be scored.
    Accepted values are: [emfd, mfd, mfd2]'''
    
    nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
    nlp.add_pipe(tokenizer, name="mfd_tokenizer")
    if dic_type == 'emfd':
        nlp.add_pipe(score_emfd, name="score_emfd", last=True)
    elif dic_type == 'mfd':
        nlp.add_pipe(score_mfd, name="score_mfd", last=True)
    elif dic_type == 'mfd2':
        nlp.add_pipe(score_mfd2, name="score_mfd2", last=True)
    else:
        print('Dictionary type not recognized. Available values are: emfd, mfd, mfd2')
        return 
    scored_docs = csv[0].apply(lambda row: nlp(row))
    df = scored_docs.apply(pd.Series)
    if dic_type == 'emfd':
        df['f_var'] = df[probabilites].var(axis=1)
        df['sent_var'] = df[senti].var(axis=1)
    return df

***

## Dependency Parsing

In [1137]:
nlp = spacy.load('en_core_web_sm')

In [1138]:
def find_ent(token, entities):
    '''High level function to match tokens to NER.
    Do not include in nlp.pipe!'''
    for k,v in entities.items():
        if token in v:
            return k

In [1139]:
def spaCy_NER(doc):
    include_ents = ['PERSON','NORP', 'GPE']
    entities = {ent.text:ent.text.split(' ') for ent in doc.ents if ent.label_ in include_ents}
    cc_processed = {e:{'patient_words':[], 'agent_words':[], 'attribute_words':[],
                  'patient_scores':[], 'agent_scores':[], 'attribute_scores':[]} for e in entities.keys()}
    ner_out = {'cc_processed':cc_processed, 'doc':doc, 'entities':entities}
    
    return ner_out

In [1140]:
def extract_dependencies(ner_out):
    doc = ner_out['doc']
    cc_processed= ner_out['cc_processed']
    entities = ner_out['entities']
    
    for token in doc:
        if token not in stopwords:
            if token.dep_ == 'nsubj' or  token.dep_ == 'ROOT':
                word = token.head.text.lower()
                if word in emfd.keys():
                    try:
                        cc_processed[find_ent(token.text, entities)]['agent_words'].append(word)
                        cc_processed[find_ent(token.text, entities)]['agent_scores'].append(emfd[word])
                    except KeyError as e:
                        pass

            if token.dep_ == 'dobj':
                word = token.head.text.lower()
                if word in emfd.keys():
                    try:
                        cc_processed[find_ent(token.text, entities)]['patient_words'].append(word)
                        cc_processed[find_ent(token.text, entities)]['patient_scores'].append(emfd[word])
                    except KeyError as e:
                        pass

            if token.dep_ == 'prep':
                word = token.head.text.lower()
                if word in emfd.keys():
                    for child in token.children:
                        try:
                            cc_processed[find_ent(str(child), entities)]['patient_words'].append(word)
                            cc_processed[find_ent(str(child), entities)]['patient_scores'].append(emfd[word])
                        except:
                            pass

            if token.text == 'is':
                try:
                    children = list(token.children)
                    word = children[1].lower()
                    if word in emfd.keys():
                        cc_processed[find_ent(str(children[0]),entities)]['attribute_words'].append(word)
                        cc_processed[find_ent(str(children[0]),entities)]['attribute_scores'].append(emfd[word])
                except:
                    pass

            if token.dep_ == 'attr':
                word = token.head.text.lower()
                if word in emfd.keys():
                    for child in token.children:
                        try:
                            cc_processed[find_ent(str(child), entities)]['attribute_words'].append(word)
                            cc_processed[find_ent(str(child), entities)]['attribute_scores'].append(emfd[word])
                        except:
                            pass   

            if token.dep_ == 'conj':
                if str(doc[token.right_edge.i]) == '.' or str(doc[token.right_edge.i]) == '!' or str(doc[token.right_edge.i]) == '?':
                    word = token.head.text.lower()
                    if word in emfd.keys():
                        try:
                            cc_processed[find_ent(str(doc[token.right_edge.i-1]), entities)]['agent_words'].append(word)
                            cc_processed[find_ent(str(doc[token.right_edge.i-1]), entities)]['agent_scores'].append(emfd[word])
                        except:
                            pass 
                else:
                    word = token.head.text.lower()
                    if word in emfd.keys():
                        try:
                            cc_processed[find_ent(str(token.right_edge), entities)]['agent_words'].append(word)
                            cc_processed[find_ent(str(token.right_edge), entities)]['agent_scores'].append(emfd[word])
                        except:
                            pass 
        
    return cc_processed

In [1141]:
def drop_ents(cc_processed):
    
    ''' Deletes entities w/out any related words.'''
    
    empty_ents = []
    for k,v in cc_processed.items():
        counter = 0
        for k1, v1 in v.items():
            counter += len(v1)
        if counter == 0:
            empty_ents.append(k)
            
    for e in empty_ents:
        cc_processed.pop(e)
        
    return cc_processed

In [1142]:
def mean_pat(cc_processed):
    
    '''Calculates the average emfd scores for 
    words in each PAT category. 
    Returns the final dataframe for each document. 
    This frame has three columns for detected  words in each PAT category and
    10 columns for each PAT category capturing the mean emfd scores.
    '''
    
    frames = []
    for k,v in cc_processed.items():
        agent = pd.DataFrame(v['agent_scores']).mean().to_frame().T
        agent.columns = ['agent_' + str(col) for col in agent.columns]
        
        patient = pd.DataFrame(v['patient_scores']).mean().to_frame().T
        patient.columns = ['patient_' + str(col) for col in patient.columns]
        
        attribute = pd.DataFrame(v['attribute_scores']).mean().to_frame().T
        attribute.columns = ['attribute_' + str(col) for col in attribute.columns]
        
        df = pd.concat([agent, patient, attribute], axis=1)
        df['NER'] = k
        df['agent_words'] = ', '.join(v['agent_words'])
        df['patient_words'] = ', '.join(v['patient_words'])
        df['attribute_words'] = ', '.join((v['attribute_words']))
        frames.append(df)
    
    df = pd.concat(frames)
    words = ['agent_words','patient_words','attribute_words']
    a_mf = [c for c in df.columns if c.startswith('agent') and c.endswith('p')]
    a_sent = [c for c in df.columns if c.startswith('agent') and c.endswith('sent')]
    
    p_scores = [c for c in df.columns if c.startswith('patient') and c.endswith('p')]
    p_sent = [c for c in df.columns if c.startswith('patient') and c.endswith('sent')]
    
    att_scores = [c for c in df.columns if c.startswith('attribute') and c.endswith('p')]
    att_sent = [c for c in df.columns if c.startswith('attribute') and c.endswith('sent')]
    
    
    return df[['NER']+words+a_mf+a_sent+p_scores+p_sent+att_scores+att_sent]

In [1143]:
nlp.add_pipe(spaCy_NER, name='NER')

In [1144]:
nlp.add_pipe(extract_dependencies, name='PAT extraction')

In [1145]:
nlp.add_pipe(drop_ents, name='drop empty entities')

In [1146]:
nlp.add_pipe(mean_pat, name='average PAT scores and return final df')

In [1147]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f2f13118080>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f2f2d62bb28>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f2f2d62bb88>),
 ('NER', <function __main__.spaCy_NER(doc)>),
 ('PAT extraction', <function __main__.extract_dependencies(ner_out)>),
 ('drop empty entities', <function __main__.drop_ents(cc_processed)>),
 ('average PAT scores and return final df',
  <function __main__.mean_pat(cc_processed)>)]

In [1149]:
scored_docs = []
for i, row in csv[0].head(2).iteritems():
    scored_docs.append(nlp(row))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [1151]:
df = pd.concat(scored_docs)

In [1153]:
df

Unnamed: 0,NER,agent_words,patient_words,attribute_words,agent_authority_p,agent_care_p,agent_fairness_p,agent_loyalty_p,agent_sanctity_p,agent_authority_sent,...,patient_authority_p,patient_care_p,patient_fairness_p,patient_loyalty_p,patient_sanctity_p,patient_authority_sent,patient_care_sent,patient_fairness_sent,patient_loyalty_sent,patient_sanctity_sent
0,Mosul,,"city, fight, fled, fleeing",,,,,,,,...,0.178824,0.15047,0.113987,0.15208,0.090011,-0.193082,-0.389701,-0.259594,-0.25732,-0.368221
0,Dominik Stillhart,"said, said, said, said, said, said, said",,,0.045465,0.044263,0.047012,0.049989,0.042168,-0.012368,...,,,,,,,,,,
0,Iraq,,"stronghold, staff",,,,,,,,...,0.125794,0.05819,0.071796,0.093219,0.04035,-0.093667,-0.200642,-0.243302,-0.0805,-0.205412
0,Geneva,,headquarters,,,,,,,,...,0.018182,0.076923,0.046154,0.084746,0.034483,-0.4767,-0.464767,0.0,-0.19384,0.0
0,Syria,,operation,,,,,,,,...,0.116071,0.090164,0.037879,0.071429,0.090909,-0.147777,-0.350018,-0.07892,-0.02628,-0.202725
0,North Korea,"continues, seen","leader, strike, forces",,0.092593,0.111145,0.074242,0.09082,0.072211,-0.30054,...,0.146855,0.130324,0.074012,0.109246,0.100334,-0.145653,-0.201394,-0.066792,-0.054395,-0.135781
0,U.S.,believes,,,0.070175,0.06383,0.074766,0.117188,0.079208,0.142013,...,,,,,,,,,,
0,William Gortney,said,,,0.045465,0.044263,0.047012,0.049989,0.042168,-0.012368,...,,,,,,,,,,
0,Kim Jong Un,announced,,,0.096939,0.070352,0.15122,0.092166,0.032967,-0.099532,...,,,,,,,,,,
0,United States,,hit,,,,,,,,...,0.089744,0.176136,0.073034,0.077778,0.147887,-0.122379,-0.100761,-0.108108,-0.337079,-0.242962


In [None]:
#TODO: implement in AMorE! 

In [21]:
def parallelize_dataframe(df, func):
    
    '''Simple function to multiprocess functions on dataframe.
    Adjust the Pool value to match your machine CPU capability'''
    
    df_split = np.array_split(df, cpu_count()-1)
    pool = Pool(cpu_count()-1)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    
    return df


def preproc_sent(sentence):
    
    '''Function to preprocess sentences'''
    
    sentence = sentence.split(' ')
    sentence = [x.lower() for x in sentence]
    sentence = [x.replace("'s",'') for x in sentence]
    for punc in punctuation:
        sentence = [x.replace(punc,'') for x in sentence]
    sentence = [x for x in sentence if x not in stopwords]
    sentence = [x for x in sentence if x not in punctuation]
    sentence = [x for x in sentence if len(x) > 2]
    
    return sentence

def score_emfd(csv):
    
    '''The main functuation for extracting moral information from a CSV 
    of input texts'''

    scored_docs = []

    for i, row in csv.iterrows(): 
        doc_id = i

        # Turn document into spaCy DOC object
        text = row.astype('unicode').values[0]
        doc = nlp(text)

        # Create list to store individual sentence scores
        sentence_scores = []

        # Initialize a variable to store the number of detected moral words 
        emfd_wordcount = 0 
        non_moral_count = 0

        # Start to loop over each sentence in a document
        for s, sentence in enumerate(doc.sents):

            # Run VADER to get sentence sentiment
            sentiment = analyzer.polarity_scores(str(sentence))

            # Preprocess sentence and turn into list of tokens 
            tokens = preproc_sent(str(sentence).strip())

            # If an empty sentence is returned, skip this sentence
            if len(tokens) == 0:
                continue

            # Initialize a matrix that has the 5 foundations + 5 sentiment categories as keys and that will store the scores for each detected word 
            emfd_score = pd.DataFrame(columns=[foundations+senti], index=range(0, len(tokens)))
            emfd_score['vader_pos'] = sentiment['pos']
            emfd_score['vader_neg'] = sentiment['neg']
            emfd_score['vader_neu'] = sentiment['neu']
            emfd_score['vader_pol'] = sentiment['compound']
            emfd_score['word'] = ''

            # Initiate scoring by looping over each token in the sentence
            for x, token in enumerate(tokens):

                # Is token in e-MFD?
                if token in emfd.keys():
                    # Yes: increase moral wordcount by 1
                    emfd_wordcount += 1
                    # In scoring matrix, insert words, foundation probabilities, and sentiment scores 
                    emfd_score.at[x, 'word'] = token
                    emfd_score.at[x,'care'] = emfd[token]['care_p']
                    emfd_score.at[x,'fairness'] = emfd[token]['fairness_p']
                    emfd_score.at[x,'loyalty'] = emfd[token]['loyalty_p']
                    emfd_score.at[x,'authority'] = emfd[token]['authority_p']
                    emfd_score.at[x,'sanctity'] = emfd[token]['sanctity_p']
                    
                    emfd_score.at[x,'care_sent'] = emfd[token]['care_sent']
                    emfd_score.at[x,'fairness_sent'] = emfd[token]['fairness_sent']
                    emfd_score.at[x,'loyalty_sent'] = emfd[token]['loyalty_sent']
                    emfd_score.at[x,'authority_sent'] = emfd[token]['authority_sent']
                    emfd_score.at[x,'sanctity_sent'] = emfd[token]['sanctity_sent']
                    
                    
                else:
                    # Increase non-moral word count by 1
                    non_moral_count += 1
            
            # Add indices 
            emfd_score['word_ix'] = emfd_score.index
            emfd_score['sentence_ix'] = int(s)
            emfd_score['document_ix'] = int(i)
            emfd_score['moral_var'] = emfd_score[foundations].var(axis=1)
            emfd_score['senti_var'] = emfd_score[senti].var(axis=1)
            
            
#             emfd_score['shares'] = row['share_count']
            sentence_scores.append(emfd_score)

        # Concat all sentences and add document-level ratio of moral to non-moral words
        sentences = pd.concat(sentence_scores)
        sentences['moral_nonmoral_ratio'] = emfd_wordcount / non_moral_count
        scored_docs.append(sentences)
    
    df = pd.concat(scored_docs)
    df = df.dropna(how='any')
        
    return df

In [73]:
def parallelize_dataframe(df, func):
    
    '''Simple function to multiprocess functions on dataframe.
    Adjust the Pool value to match your machine CPU capability'''
    
    df_split = np.array_split(df, cpu_count()-2)
    pool = Pool(cpu_count()-2)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    
    return df


def preproc_sent(sentence):
    
    '''Function to preprocess sentences'''
    
    sentence = sentence.split(' ')
    sentence = [x.lower() for x in sentence]
    sentence = [x.replace("'s",'') for x in sentence]
    for punc in punctuation:
        sentence = [x.replace(punc,'') for x in sentence]
    sentence = [x for x in sentence if x not in stopwords]
    sentence = [x for x in sentence if x not in punctuation]
    sentence = [x for x in sentence if len(x) > 2]
    
    return sentence

def score_emfd(csv):
    
    '''The main functuation for extracting moral information from a CSV 
    of input texts'''

    scored_docs = []

    for i, row in csv.iterrows(): 
        doc_id = i

        # Turn document into spaCy DOC object
        text = row.astype('unicode').values[0]
        doc = nlp(text)
        tokens = [token.text for token in doc]

        # Initialize a variable to store the number of detected moral words 
        emfd_wordcount = 0 
        non_moral_count = 0
        
        # Initialize a matrix that has the 5 foundations + 5 sentiment categories as keys and that will store the scores for each detected word 
        emfd_score = pd.DataFrame(columns=[foundations+senti], index=range(0, len(tokens)))

        # Start to loop over each sentence in a document
        for x, token in enumerate(tokens):
            
            # Is token in e-MFD?
            if token in emfd.keys():
                # Yes: increase moral wordcount by 1
                emfd_wordcount += 1
                sentiment = analyzer.polarity_scores(str(token))
                emfd_score['vader_pos'] = sentiment['pos']
                emfd_score['vader_neg'] = sentiment['neg']
                emfd_score['vader_neu'] = sentiment['neu']
                emfd_score['vader_pol'] = sentiment['compound']
                
                # In scoring matrix, insert words, foundation probabilities, and sentiment scores 
                emfd_score.at[x,'care'] = emfd[token]['care_p']
                emfd_score.at[x,'fairness'] = emfd[token]['fairness_p']
                emfd_score.at[x,'loyalty'] = emfd[token]['loyalty_p']
                emfd_score.at[x,'authority'] = emfd[token]['authority_p']
                emfd_score.at[x,'sanctity'] = emfd[token]['sanctity_p']

                emfd_score.at[x,'care_sent'] = emfd[token]['care_sent']
                emfd_score.at[x,'fairness_sent'] = emfd[token]['fairness_sent']
                emfd_score.at[x,'loyalty_sent'] = emfd[token]['loyalty_sent']
                emfd_score.at[x,'authority_sent'] = emfd[token]['authority_sent']
                emfd_score.at[x,'sanctity_sent'] = emfd[token]['sanctity_sent']
                    
            else:
                # Increase non-moral word count by 1
                non_moral_count += 1
            
        # Add indices 
        emfd_score['moral_var'] = emfd_score[foundations].var(axis=1)
        emfd_score['senti_var'] = emfd_score[senti].var(axis=1)
        emfd_score['moral_nonmoral_ratio'] = emfd_wordcount / non_moral_count
        emfd_score = emfd_score.dropna(subset=[foundations], how='any')
        emfd_score = pd.DataFrame(emfd_score.mean()).T
        emfd_score.index = [i]
            
        scored_docs.append(emfd_score)
    
    df = pd.concat(scored_docs, axis=0)
        
    return df

In [57]:
test

Unnamed: 0,care_p,fairness_p,loyalty_p,authority_p,sanctity_p,care_sent,fairness_sent,loyalty_sent,authority_sent,sanctity_sent,moral_nonmoral_ratio,f_var,sent_var
0,0.094813,0.073032,0.069060,0.072408,0.058619,-0.109505,-0.088298,-0.070900,-0.074635,-0.076277,2.412500,0.000174,0.000247
1,0.067849,0.052367,0.054658,0.059176,0.041654,-0.088256,-0.044245,-0.027897,-0.040930,-0.054355,1.553571,0.000092,0.000520
2,0.049164,0.045042,0.040456,0.041374,0.038014,-0.070804,-0.056856,-0.031816,-0.041140,-0.043111,0.767442,0.000019,0.000232
3,0.045444,0.043486,0.042962,0.046669,0.035920,-0.036410,-0.004051,0.009884,-0.014176,-0.027695,1.057692,0.000017,0.000340
4,0.052105,0.049796,0.052589,0.051535,0.039107,-0.062859,-0.032623,-0.006178,-0.017404,-0.045186,1.331683,0.000032,0.000501
5,0.057937,0.059822,0.058146,0.057434,0.046878,-0.091002,-0.051419,-0.024035,-0.048596,-0.055692,1.201299,0.000027,0.000576
6,0.080705,0.075128,0.063355,0.064227,0.058052,-0.013425,-0.002821,0.017426,0.004184,-0.014231,2.054217,0.000087,0.000174
7,0.063438,0.053920,0.047825,0.050844,0.044996,-0.072777,-0.043953,-0.030646,-0.042591,-0.074196,1.140777,0.000051,0.000383
8,0.077813,0.087277,0.070132,0.061160,0.059522,-0.032490,0.030413,0.025335,-0.014331,-0.016338,2.188119,0.000135,0.000771
9,0.069137,0.053560,0.052671,0.051013,0.049883,-0.060967,-0.061534,-0.034998,-0.027852,-0.067304,1.531034,0.000062,0.000317


In [69]:
test = 'The government\'s assault to retake the city of Mosul could take months, prompting more and more civilians to try to flee to avoid being trapped between frontlines, a senior official of the International Committee of the Red Cross told Reuters.'
doc = nlp(test)
for s, sentence in enumerate(doc.sents):
    sentiment = analyzer.polarity_scores(str(sentence))
    # Preprocess sentence and turn into list of tokens 
    tokens = preproc_sent(str(sentence).strip())
    
    # Perform dependency parsing TODO: add variable to toggle dep. parsing
    dep_dic = {}
    for token in doc:
        
        if token.dep_ == 'nsubj' or token.dep_ == 'dobj' or token.dep_ =='PERSON' or token.dep_ =='GPE' or token.dep_ =='NORP':
            dep_dic[token.text] = {token.dep_:token.head.text}  
        if token.dep_ =='ROOT' and [child in dep_dic.keys() for child in token.children]:
            dep_dic[token.text] = [child for child in token.children]
    print(dep_dic)

{'assault': {'nsubj': 'take'}, 'city': {'dobj': 'retake'}, 'take': [assault, could, months, ,, prompting, .], 'months': {'dobj': 'take'}, 'civilians': {'dobj': 'prompting'}, 'official': {'nsubj': 'told'}, 'Reuters': {'dobj': 'told'}}


In [60]:
csv.head(1)[0].values

array(['The Iraqi government\'s assault to retake the city of Mosul could take months, prompting more and more civilians to try to flee to avoid being trapped between frontlines, a senior official of the International Committee of the Red Cross told Reuters.\n\nA growing number of wounded, more than 100 on some days, are  emerging from rural areas surrounding the city of one million that is held by Islamic State forces, said Dominik Stillhart, director of ICRC operations worldwide.\n\n"What we see now on the ground is indeed that the fight in Mosul is not just going to stop anytime soon because the resistance is very strong," Stillhart, back from visiting Iraq, said in an interview on Thursday at ICRC headquarters in Geneva.\n\n"It is likely that we will see long, drawn-out fighting with very serious suffering of a population that will once again be caught between two frontlines," he said. "It is reasonable to expect that this is going to take weeks if not months."\n\nMore than six wee