# 4) Document Scoring, Application, and External Validation of E-MFD

In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import re, fnmatch 
import spacy
nlp = spacy.load('en')

from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
punctuation += '’'
for i in range(0,10):
    punctuation += str(i)

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

  from pandas.core import datetools


In [2]:
foundations = ['care','fairness','loyalty','authority','sanctity']
# neutrals = [f+'.neutral' for f in foundations]
# foundations = virtues+vices+neutrals

In [3]:
# Load News Articles
news_text = pd.read_json('../datasets/uncoded_news.json').set_index('url')
news = pd.read_pickle('data/study1_news.pkl').set_index('url')
news = news.join(news_text['text'])
themes = [c for c in news.columns if c.isupper()] 
news = news[['text','share_count']+themes]
stopwords = set(list(nltk_stopwords) + list(ENGLISH_STOP_WORDS) + list(STOP_WORDS))
news = news.reset_index()

In [4]:
news.head()

Unnamed: 0,url,text,share_count,ACT_HARMTHREATEN,AFFECT,ARMEDCONFLICT,CYBER_ATTACK,EXHUMATION,EXTREMISM,FREESPEECH,JIHAD,KILL,LEGISLATION,MOVEMENT_SOCIAL,PROTEST,REBELLION,RELIGION,TERROR,WOUND
0,http://mobile.reuters.com/article/worldNews/id...,The Iraqi government's assault to retake the c...,0.0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1
1,http://www.cbsnews.com/news/north-korea-can-la...,WASHINGTON -- North Korea now has the capabili...,262.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,http://www.breitbart.com/national-security/201...,TEL AVIV – An Egyptian journalist wrote an op-...,41.0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0
3,http://www.cnn.com/2016/12/07/europe/russia-re...,What was life like for Russians such as Tsar N...,114.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,http://www.foxnews.com/politics/2016/12/05/for...,President Obama's former national security adv...,244.0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0


In [5]:
# Function to preprocess sentences
def preproc_sent(sentence):
    sentence = sentence.split(' ')
    sentence = [x.lower() for x in sentence]
    sentence = [x.replace("'s",'') for x in sentence]
    for punc in punctuation:
        sentence = [x.replace(punc,'') for x in sentence]
    sentence = [x for x in sentence if x not in stopwords]
    sentence = [x for x in sentence if x not in punctuation]
    sentence = [x for x in sentence if len(x) > 2]
    
    return sentence

### 1) Score Documents with E-MFD

In [67]:
# Score E-MFD
emfd = pd.read_pickle('dictionaries/emfd_scoring.pkl')[['care_norm','fairness_norm','authority_norm','loyalty_norm','sanctity_norm', 'var']]
emfd = emfd.rename(columns={'care_norm':'care','fairness_norm':'fairness', 'authority_norm':'authority', 'loyalty_norm':'loyalty','sanctity_norm':'sanctity'})
emfd = emfd.iloc[pd.np.where(emfd[['care', 'fairness', 'loyalty','authority','sanctity']].ge(0.1).any(1, skipna=True))]
emfd = emfd.T.to_dict()

In [68]:
len(emfd)

3309

In [None]:
docs_rw = []

for i, row in news.iterrows():  
    doc_id = i
    #print('This is document:', doc_id)
    
    # Turn document into spaCy DOC object
    doc = nlp(row.text)
    
    # Create list to store individual sentence scores
    sentence_scores = []
    
    # Initialize a variable to store the number of detected moral words 
    emfd_wordcount = 0 
    non_moral_count = 0
    
    # Start to loop over each sentence in a document
    for s, sentence in enumerate(doc.sents):
        
        # Run VADER to get sentence sentiment
        sentiment = analyzer.polarity_scores(str(sentence))
        
        # Preprocess sentence and turn into list of tokens 
        tokens = preproc_sent(str(sentence).strip())
        
        # If an empty sentence is returned, skip this sentence
        if len(tokens) == 0:
            continue
            
        
        # Initialize a matrix that has the 5 foundations + 3 sentiment categories as keys and that will store the scores for each detected word 
#         print(tokens)
        emfd_score = pd.DataFrame(columns=foundations, index=range(0, len(tokens)))
        emfd_score['pos'] = sentiment['pos']
        emfd_score['neg'] = sentiment['neg']
        emfd_score['neu'] = sentiment['neu']
        emfd_score['pol'] = sentiment['compound']
        emfd_score['word'] = ''
                
        # Initiate scoring by looping over each token in the sentence
        for x, token in enumerate(tokens):
            
            # Is token in E-MFD?
            if token in emfd.keys():
                # Yes: increase wordcount by 1
                emfd_wordcount += 1
                # In scoring matrix, insert words and insert weights 
                emfd_score.at[x, 'word'] = token
                emfd_score.at[x,'care'] = emfd[token]['care']
                emfd_score.at[x,'fairness'] = emfd[token]['fairness']
                emfd_score.at[x,'loyalty'] = emfd[token]['loyalty']
                emfd_score.at[x,'authority'] = emfd[token]['authority']
                emfd_score.at[x,'sanctity'] = emfd[token]['sanctity']
            else:
                non_moral_count += 1
                # If word not in EMFD, add word and add 0 weights 
                emfd_score.at[x, 'word'] = token
                emfd_score.at[x,'care'] = 0.0
                emfd_score.at[x,'fairness'] = 0.0
                emfd_score.at[x,'loyalty'] = 0.0
                emfd_score.at[x,'authority'] = 0.0
                emfd_score.at[x,'sanctity'] = 0.0
                
        emfd_score['word_ix'] = emfd_score.index
        emfd_score['sentence_ix'] = int(s)
        emfd_score['document_ix'] = int(i)
        emfd_score['shares'] = row['share_count']
        #print(emfd_score)
        sentence_scores.append(emfd_score)
        
    # Concat all sentences and add document-level ratio of moral to non-moral words
    sentences = pd.concat(sentence_scores)
    sentences['moral_nonmoral_ratio'] = emfd_wordcount / non_moral_count
    #print("this is a document")
    #print(sentences)
    docs_rw.append(sentences)

In [None]:
# Save scored documents
df = pd.concat(docs_rw)
df = df[['document_ix','sentence_ix','word_ix','care','fairness','loyalty','authority','sanctity','moral_nonmoral_ratio','pos','neg','neu','pol','shares','word']]
df.to_csv('emfd_docs4rw.csv', index=False)

In [17]:
len(df)

866024

In [16]:
df.head()

Unnamed: 0,document_ix,sentence_ix,word_ix,care,fairness,loyalty,authority,sanctity,pos,neg,neu,pol,shares,word
0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.203,0.797,-0.8555,0.0,iraqi
1,0,0,1,0.14994,0.181717,0.170055,0.223723,0.111842,0.0,0.203,0.797,-0.8555,0.0,government
2,0,0,2,0.385417,0.189474,0.214286,0.182927,0.233333,0.0,0.203,0.797,-0.8555,0.0,assault
3,0,0,3,0.172414,0.0,0.2,0.368421,0.1875,0.0,0.203,0.797,-0.8555,0.0,retake
4,0,0,4,0.118093,0.0702811,0.102415,0.130682,0.0719794,0.0,0.203,0.797,-0.8555,0.0,city


## 2) Score Documents with MFD

In [None]:
# Initiate MFD
MFD = 'dictionaries/mft_original.dic'
nummap = dict()
mfd = dict()
mfd_regex = dict()
wordmode = True
with open(MFD, 'r') as f:
    for line in f.readlines():
        ent = line.strip().split()
        if line[0] == '%':
            wordmode = not wordmode
        elif len(ent) > 0:
            if wordmode:
                mfd[ent[0]] = [nummap[e] for e in ent[1:]]
            else:
                nummap[ent[0]] = ent[1]

# convert vocab to compiled regex for comparison
for v in mfd.keys():
    mfd_regex[v] = re.compile(fnmatch.translate(v))

In [None]:
# Initiate MFD2.0 
MFD2 = 'dictionaries/mfd2.0.dic'
nummap = dict()
mfd2 = dict()
wordmode = True
with open(MFD2, 'r') as f:
    for line in f.readlines():
        ent = line.strip().split()
        if line[0] == '%':
            wordmode = not wordmode
        elif len(ent) > 0:
            if wordmode:
                wordkey = ''.join([e for e in ent if e not in nummap.keys()])
                mfd2[wordkey] = [nummap[e] for e in ent if e in nummap.keys()]
            else:
                nummap[ent[0]] = ent[1]

mfd2 = pd.DataFrame.from_dict(mfd2).T
mfd2['foundation'] = mfd2[0]
del mfd2[0]
mfd2 = mfd2.T.to_dict()

In [None]:
virtues = [f+'.virtue' for f in foundations]
vices = [f+'.vice' for f in foundations]
mfd_foundations = virtues+vices

In [None]:
# Score MFD 
docs_rw = []

for i, row in news.iterrows():  
    doc_id = i
    
    # Turn document into spaCy DOC object
    doc = nlp(row.text)
    
    # Create list to store individual sentence scores
    sentence_scores = []
    
    # Initialize a variable to store the number of detected moral words 
    mfd_wordcount = 0 
    non_mfd_count = 0
    
    # Start to loop over each sentence in a document
    for s, sentence in enumerate(doc.sents):
        
        # Run VADER to get sentence sentiment
        sentiment = analyzer.polarity_scores(str(sentence))
        
        # Preprocess sentence and turn into list of tokens 
        tokens = preproc_sent(str(sentence).strip())
        
        # If an empty sentence is returned, skip this sentence
        if len(tokens) == 0:
            continue
            
        # Initialize a matrix that has the 5 foundations + 3 sentiment categories as keys and that will store the scores for each detected word 
        #print(tokens)
        mfd_score = pd.DataFrame(columns=mfd_foundations, index=range(0, len(tokens)))
        mfd_score['word'] = ''
        mfd_score = mfd_score.fillna(0)
                
        # Initiate scoring by looping over each token in the sentence
        for x, token in enumerate(tokens):
            
            # Is token in MFD?
            for v in mfd_regex.keys():
                if mfd_regex[v].match(token):
                    mfd_wordcount += 1
                    for f in mfd[v]:
                        if f == 'moral':
                            continue
                        else:
                            #print(token,f)
                            mfd_score.at[x,'word'] = token
                            mfd_score.at[x,f] = 1
                else:
                    non_mfd_count += 1
                    mfd_score.at[x,'word'] = token
                    continue
        
        mfd_score['word_ix'] = mfd_score.index
        mfd_score['sentence_ix'] = int(s)
        mfd_score['document_ix'] = int(i)
        mfd_score['shares'] = row['share_count']
        sentence_scores.append(mfd_score)
        
    # Concat all sentences and add document-level ratio of moral to non-moral words
    sentences = pd.concat(sentence_scores)
    sentences['moral_nonmoral_ratio'] = mfd_wordcount / non_mfd_count
    #print("this is a document")
    #print(sentences)
    docs_rw.append(sentences)

In [None]:
# Combine and save scored MFD counts
# Save scored documents
df = pd.concat(docs_rw)
df = df[['document_ix','sentence_ix','word_ix','care.virtue', 'fairness.virtue' ,'loyalty.virtue', 'authority.virtue' ,'sanctity.virtue' ,'care.vice' ,'fairness.vice' ,'loyalty.vice' ,'authority.vice', 'sanctity.vice', 'moral_nonmoral_ratio','shares','word']]
df.to_csv('mfd_docs4rw.csv', index=False)

In [None]:
# Score MFD2.0 
docs_rw = []

for i, row in news.iterrows():  
    doc_id = i
    
    # Turn document into spaCy DOC object
    doc = nlp(row.text)
    
    # Create list to store individual sentence scores
    sentence_scores = []
    
    # Initialize a variable to store the number of detected moral words 
    mfd_wordcount = 0 
    non_mfd_count = 0
    
    # Start to loop over each sentence in a document
    for s, sentence in enumerate(doc.sents):
        
        # Run VADER to get sentence sentiment
        sentiment = analyzer.polarity_scores(str(sentence))
        
        # Preprocess sentence and turn into list of tokens 
        tokens = preproc_sent(str(sentence).strip())
        
        # If an empty sentence is returned, skip this sentence
        if len(tokens) == 0:
            continue
            
        # Initialize a matrix that has the 5 foundations + 3 sentiment categories as keys and that will store the scores for each detected word 
        #print(tokens)
        mfd_score = pd.DataFrame(columns=mfd_foundations, index=range(0, len(tokens)))
        mfd_score['word'] = ''
        mfd_score = mfd_score.fillna(0)
                
        # Initiate scoring by looping over each token in the sentence
        for x, token in enumerate(tokens):
            
            # Is token in MFD2?
            if token in mfd2.keys(): 
                mfd_wordcount += 1
                mfd_score.at[x,'word'] = token
                mfd_score.at[x, mfd2[token]['foundation']] = 1
                               
            else:
                non_mfd_count += 1
                mfd_score.at[x,'word'] = token
                continue
        
        mfd_score['word_ix'] = mfd_score.index
        mfd_score['sentence_ix'] = int(s)
        mfd_score['document_ix'] = int(i)
        mfd_score['shares'] = row['share_count']
        sentence_scores.append(mfd_score)
        
    # Concat all sentences and add document-level ratio of moral to non-moral words
    sentences = pd.concat(sentence_scores)
    sentences['moral_nonmoral_ratio'] = mfd_wordcount / non_mfd_count
    #print("this is a document")
    #print(sentences)
    docs_rw.append(sentences)

In [None]:
# Combine and save scored MFD counts
# Save scored documents
df = pd.concat(docs_rw)
df = df[['document_ix','sentence_ix','word_ix','care.virtue', 'fairness.virtue' ,'loyalty.virtue', 'authority.virtue' ,'sanctity.virtue' ,'care.vice' ,'fairness.vice' ,'loyalty.vice' ,'authority.vice', 'sanctity.vice','moral_nonmoral_ratio', 'shares','word']]
df.to_csv('mfd2_docs4rw.csv', index=False)