## Hedging classification

This notebooks will analyze hedging patterns in the collected articles corpus. 

1. Look at distributions, frequencies of hedge words in the documents
2. Train semisupervised classification algorithm on individual sentences


In [4]:
# Just text reading/cleaning
import pandas as pd
import numpy as np
from read_json import JsonHelpers
import analysis_helper_functions as helpers
import re
import json
import sys
import os
import clean_txt as clean
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC




if sys.platform == "linux":
    studies_path = "/home/wiktor/git/MasterThesis/Literature/Competition_Studies_Database/studies.json"
    authors_path = "/home/wiktor/git/MasterThesis/Literature/Competition_Studies_Database/authors.json"
    studies_location = "/home/wiktor/Dropbox/Git/MasterThesis/Literature/Competition_Studies_Database/"
else:    
    studies_path = "/Users/Wiktor/Dropbox/Git/MasterThesis/Literature/Competition_Studies_Database/studies.json"
    authors_path = "/Users/Wiktor/Dropbox/Git/MasterThesis/Literature/Competition_Studies_Database/authors.json"
    studies_location = "/Users/Wiktor/Dropbox/Git/MasterThesis/Literature/Competition_Studies_Database/"

# Documents collection with labels
df_documents = helpers.read_txt_studies(studies_location)

# List of all documents as strings
documents = df_documents['document'].tolist()

# List of labels
article_labels = df_documents['label'].tolist()

# CB label
cb = JsonHelpers().central_bank_paper_label()

context_stopwords = ['panzar', 'rosse','panzarrosse', 'rossepanzar', 'prh','journal', 'sciencedirect',
                     'banking', 'rosse–panzar', 'vol', 'department', 'university', 'school', 'economics',
                     'business', 'email', 'cid', 'bank', 'banking', 'asset', 'assets', 'revenue', 'total', 
                     'hstatistic', 'h-statistic', 'competition', 'finance', 'banks']
# List of tokenized documents
texts_token_clean = [clean.normalize(text, context_stopwords, exclude_stops=False) for text in documents]

# List of clean strings
texts_str_clean = helpers.convert_list(texts_token_clean)


In [3]:
# Checking reported database
paper_database = df_documents['document'].apply(lambda x: re.findall(r"([^.]*?database[^.]*\.)", x.lower()))

KeyboardInterrupt: 

## Uncertanity index per paper

Using hedge words I calculate uncertanity index: number of hedge words in the document, divided by number of words in the document. 

In [5]:
# Open file with hedge words
hedge_words = []
with open('hedge_words.txt', 'rt') as file:
    for line in file:
        if '#' not in line:
            hedge_words.append(line.replace('\n', ''))
    
def uncertainty_index(documents, labels, hedge_words):
    """
    Calculate uncertanity index for all all collected papers
    
    number of hedge words seen / total number of words in text
    """

    hedge_score = []
    for document, label in zip(documents, labels):
        keys_count = [document.count(key) for key in hedge_words]
        hedge_score.append((label, sum(keys_count)/len(re.findall(r'\w+', document))))

    return hedge_score


# Undertanity index per document table
undertanity_df = pd.DataFrame(uncertainty_index(texts_str_clean, article_labels, hedge_words))
undertanity_df['cb'] = undertanity_df[0].isin(cb)
sorted_undertanity_df = undertanity_df.sort_values(by=1, ascending=False).reset_index(drop=True)

# Uncertanity Rank of CB papers 
uncertanity_df_cb = sorted_undertanity_df[sorted_undertanity_df['cb'] == True].reset_index()
uncertanity_df_cb.drop(['cb'], axis=1, inplace=True)
uncertanity_df_cb.columns = ['rank', 'article label', 'uncertanity score']
uncertanity_df_cb['rank'] = uncertanity_df_cb['rank'] + 1

# Saving table
uncertanity_df_cb.to_excel('uncertanity_score_cb.xlsx', index=False)

# Uncertanity Rank of NCB papers 
uncertanity_df_ncb = sorted_undertanity_df[sorted_undertanity_df['cb'] == False].reset_index()
uncertanity_df_ncb.drop(['cb'], axis=1, inplace=True)
uncertanity_df_ncb.columns = ['rank', 'article label', 'uncertanity score']
uncertanity_df_ncb['rank'] = uncertanity_df_ncb['rank'] + 1

# Saving table
uncertanity_df_ncb.to_excel('uncertanity_score_ncb.xlsx', index=False)

sorted_undertanity_df

Unnamed: 0,0,1,cb
0,p032,0.089467,True
1,p048,0.082132,False
2,p074,0.077574,False
3,p053,0.073553,True
4,p070,0.068966,False
5,p003,0.066594,False
6,p057,0.065937,False
7,p034,0.064996,False
8,p014,0.063074,False
9,p051,0.061458,False


## Frequent used hedge words accross all papers and groups split

http://web.informatik.uni-mannheim.de/ponzetto/pubs/stajner17a

The following are considered hedge instances **(Medlock, Briscoe)**:
- data_Set to train: https://sraf.nd.edu/textual-analysis/resources/#LM%20Sentiment%20Word%20Lists (MacDonald word list) 


In [6]:
# Join all texts
single_corpa =  ' '.join(texts_str_clean)

# Hedge words distribution accross all texts
full_popular_hedge = []
for key in hedge_words:
    full_popular_hedge.append((key, single_corpa.count(key)))
    
# Data Frame with most popular hedge words
all_hedge = pd.DataFrame(full_popular_hedge).sort_values(by=1, ascending=False)


ncb_texts_clean = []
cb_texts_clean = []
for text, label in zip(texts_str_clean, article_labels):
    if label in cb:
        cb_texts_clean.append(text)
    else:
        ncb_texts_clean.append(text)
        
# Distributions of hedge words accross two groups of papers        
ncb_single_text_clean = ' '.join(ncb_texts_clean)
cb_single_text_clean = ' '.join(cb_texts_clean)

cb_popular_hedge = []
ncb_popular_hedge = []

for key in hedge_words:
    cb_popular_hedge.append((key, cb_single_text_clean.count(key), cb_single_text_clean.count(key)/len(re.findall(r'\w+', cb_single_text_clean))))
    ncb_popular_hedge.append((key, ncb_single_text_clean.count(key), ncb_single_text_clean.count(key)/len(re.findall(r'\w+', ncb_single_text_clean))))
    
cb_hedge = pd.DataFrame(cb_popular_hedge).sort_values(by=1, ascending=False)
ncb_hedge = pd.DataFrame(ncb_popular_hedge).sort_values(by=1, ascending=False)



In [11]:
# All papers popular hedge
all_hedge

# CB papers popular hedge
cb_hedge

# NCB papers popular hedge
ncb_hedge

Unnamed: 0,0,1,2
100,differ,780,0.004046
86,depend,660,0.003424
160,indicate,531,0.002755
277,risk,482,0.002500
298,some,428,0.002220
321,suggest,409,0.002122
118,find,391,0.002028
180,most,343,0.001779
72,consider,330,0.001712
175,many,246,0.001276


In [7]:
# Hypothesis testing words
testing_words = []

full_popular_testing = []
cb_popular_testing = []
ncb_popular_testing = []

with open('hypothesis_testing_words.txt', 'rt') as file:
    for line in file:
        if '#' not in line:
            testing_words.append(line.replace('\n', ''))

for key in testing_words:
    cb_popular_testing.append((key, cb_single_text_clean.count(key), cb_single_text_clean.count(key)/len(re.findall(r'\w+', cb_single_text_clean))))
    ncb_popular_testing.append((key, ncb_single_text_clean.count(key), ncb_single_text_clean.count(key)/len(re.findall(r'\w+',ncb_single_text_clean))))
    full_popular_testing.append((key, single_corpa.count(key), single_corpa.count(key)/len(re.findall(r'\w+',single_corpa))))


cb_testing = pd.DataFrame(cb_popular_testing).sort_values(by=2, ascending=False)
ncb_testing = pd.DataFrame(ncb_popular_testing).sort_values(by=2, ascending=False)
all_testing = pd.DataFrame(full_popular_testing).sort_values(by=2, ascending=False)



In [13]:
# All papers popular hedge
all_testing

# CB papers popular hedge
cb_testing

# NCB papers popular hedge
ncb_testing

Unnamed: 0,0,1,2
2,reject,447,0.002319
3,hypothesis,376,0.001951
7,significant,345,0.00179
19,error,213,0.001105
6,alternative,122,0.000633
4,null hypothesis,105,0.000545
15,pvalue,89,0.000462
8,significantly,84,0.000436
9,significance level,38,0.000197
1,reject null,37,0.000192


### Sentence uncertanity/vagueness analysis

To train supervised classifiers, we first transformed the sentences from the articles into a vector of numbers. We explored vector representations such as TF-IDF weighted vectors.

After having this vector representations of the text we can train supervised classifiers to predict the “uncertanity” of the sentence.

http://web.informatik.uni-mannheim.de/ponzetto/pubs/stajner17a

The following are considered hedge instances **(Medlock, Briscoe)**:
- data_Set to train: https://sraf.nd.edu/textual-analysis/resources/#LM%20Sentiment%20Word%20Lists (MacDonald word list) 


The Wikipedia training set (CoNLL-2010 shared task, v2.0,
task1) was used as the training set for all our experiments,
as it is the largest existing general-domain dataset annotated
for speculation. It contains a total of 11111 sentences, out of
which 2346 were marked as speculative (uncertain). A few
examples from this dataset are presented in Table V.

use :  Task1 wikipedia other variant (sentence tags in separate lines)

T. Loughran and B. McDonald, “When is a Liability not a Liability?
Textual Analysis, Dictionaries, and 10-Ks,” Journal of Finance, 2010.


The feature
set consisted of frequencies of each speculation trigger plus an additional feature (the total number of triggers found).

In [8]:
# Create df with text and corresponding label
sentence_df = pd.concat([pd.Series(texts_str_clean), pd.Series(article_labels)], ignore_index=True, axis=1)

# Label columns
sentence_df.columns = ['sentence', 'label']

# Expand columns by splitting text into sentences on '.'
sentence_df = sentence_df.set_index(sentence_df.columns.drop('sentence',1).tolist())\
                                                                          .sentence.str.split('.', expand=True)\
                                                                          .stack()\
                                                                          .reset_index()\
                                                                          .rename(columns={0:'sentence'})\
                                                                          .loc[:, sentence_df.columns]

# Drop sentences with les than 20 characters
sentence_df = sentence_df[sentence_df['sentence'].apply(lambda x: len(x)>20)]

# Count number of words per sentence
sentence_df['no_words'] = sentence_df['sentence'].apply(lambda x: len(re.findall(r'\w+', x)))

# Assign "uncertanity category"
sentence_df['uncertain'] = sentence_df['sentence'].str.contains('|'.join(hedge_words))

# Convert boolean to integer
sentence_df['uncertain'] = sentence_df['uncertain'].apply(lambda x: x*1)

sentence_df['cb'] = sentence_df['label'].isin(cb)

In [10]:
# TFIDF setup
tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')

X_tfidf = tfidf.fit_transform(sentence_df['sentence'])


X_train, X_test, y_train, y_test = train_test_split(X_tfidf,
                                                    sentence_df['uncertain'], random_state = 0, test_size=0.3)

# Random forest with 100 trees
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1).fit(X_train, y_train)

In [16]:
confusion_matrix(clf.predict(X_test), y_test)

print(classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

          0       0.80      0.98      0.88      2673
          1       0.97      0.72      0.83      2396

avg / total       0.88      0.86      0.85      5069



In [17]:
pd.DataFrame(confusion_matrix(y_test, clf.predict(X_test)))

Unnamed: 0,0,1
0,2623,50
1,674,1722


In [18]:
# tfidf_list = []

# for i in X_tfidf:
#     tfidf_list.append(clf.predict(i)[0])
    
# # Make full sample (article) prediction, classifier tesed on the sample
# sentence_df['prediction_on_sample'] = tfidf_list

In [19]:
sentence_cb = sentence_df[sentence_df['label'].isin(cb)]
sentence_ncb = sentence_df[~sentence_df['label'].isin(cb)]

## Prediction using wikipedia sentence data

In [13]:
# Read data, 22222 sentences
wiki_data = pd.read_csv('wiki_data_clean.csv')

# Transform string certanity variable into dummy 
wiki_data['uncertanity_dummy'] = wiki_data['certanity'].str.get_dummies()['uncertain']

# Dropping sentences that were NaN (not string)
wiki_data = wiki_data[wiki_data['sentence'].apply(lambda x: isinstance(x, str))]

# Selecting only strings with more than 20 characters (removing single words), 18744 remain
wiki_data = wiki_data[wiki_data['sentence'].apply(lambda x: len(x) > 20)]

# Transform data into TFIDF, adding data from the articles to have appropreate number of features 
# first 18744 rows are wiki, remainder articles
wiki_tfidf = tfidf.fit_transform(pd.concat([wiki_data['sentence'], sentence_df['sentence']]))

In [21]:
# Training random forest on wikipedia data
rf_wiki = RandomForestClassifier(n_estimators=100, n_jobs=-1).fit(wiki_tfidf[:18744],
                                                                   wiki_data['uncertanity_dummy'])

nb_wiki = MultinomialNB().fit(wiki_tfidf[:18744],
                              wiki_data['uncertanity_dummy'])

svc_wiki = SVC(kernel='linear').fit(wiki_tfidf[:18744],
                     wiki_data['uncertanity_dummy'])

# Predict uncertanity based on article data using RF
rf_wiki_prediction = rf_wiki.predict(wiki_tfidf[18744:])

# Predict uncertanity based on article data using NB
nb_wiki_prediction = nb_wiki.predict(wiki_tfidf[18744:])

# Predict uncertanity based on article data using SVM
svc_wiki_prediction = svc_wiki.predict(wiki_tfidf[18744:])


In [31]:
# Problems: unsupported words, seen in article but not in wiki
# no proper way to measure the prediction

sentence_df['rf_wiki_predicion'] = rf_wiki_prediction 
sentence_df['nb_wiki_predicion'] = nb_wiki_prediction 
sentence_df['svc_wiki_prediction'] = svc_wiki_prediction 



# Calculation fraction of sentences considered to be uncertain within each group RF
sentence_df[sentence_df['label'].isin(cb)]['rf_wiki_predicion'].sum()/len(sentence_df[sentence_df['label'].isin(cb)]['rf_wiki_predicion'])
sentence_df[~sentence_df['label'].isin(cb)]['rf_wiki_predicion'].sum()/len(sentence_df[~sentence_df['label'].isin(cb)]['rf_wiki_predicion'])

# Calculation fraction of sentences considered to be uncertain within each group SVM
sentence_df[sentence_df['label'].isin(cb)]['svc_wiki_prediction'].sum()/len(sentence_df[sentence_df['label'].isin(cb)]['svc_wiki_prediction'])
sentence_df[~sentence_df['label'].isin(cb)]['svc_wiki_prediction'].sum()/len(sentence_df[~sentence_df['label'].isin(cb)]['svc_wiki_prediction'])


0.030495654972642422

1. RF classified 508 as uncertain out of 16903
    - 0.0281 of all CB sentences are considered to be uncertain
    - 0.0305 of all NCB sentence are considered to be uncertain

2. SVM classified 526 as uncertain out of 16903
    - 0.0372 of all CB sentences are considered to be uncertain
    - 0.0297 of all NCB sentence are considered to be uncertain

e.g "The estimation results for Specification II lead to different and
possibly misleading inferences about the nature of competition, if
they are interpreted using the same criteria regardless of the inclusion
of ln(yi,t) as an additional control variable in Specification II."

CB:
"Our results suggest that
European banks were operating under conditions of monopolistic competition and
that bank interest revenues in the 10 new EU member states was earned under
conditions of higher competition than those that existed in the old EU banking
countries"


In [23]:
# Creating ranking of papers where share of uncertain sentences is the highest

uncertain_sentences_count = sentence_df.groupby('label').agg({'svc_wiki_prediction': 'sum',
                                                              'sentence': 'count'}).reset_index()

uncertain_sentences_count['uncertan_sentence_index'] = uncertain_sentences_count['svc_wiki_prediction']/uncertain_sentences_count['sentence']
uncertain_sentences_count['cb'] = uncertain_sentences_count['label'].isin(cb)

# Ranking of papers with highest fraction of uncertain sentences
uncertain_sentences_count.sort_values('uncertan_sentence_index', ascending=False).reset_index(drop=True)

Unnamed: 0,label,svc_wiki_prediction,sentence,uncertan_sentence_index,cb
0,p026,15,174,0.086207,False
1,p032,25,302,0.082781,True
2,p059,5,80,0.062500,True
3,p048,16,260,0.061538,False
4,p074,20,344,0.058140,False
5,p062,15,259,0.057915,False
6,p034,10,181,0.055249,False
7,p057,10,183,0.054645,False
8,p065,19,355,0.053521,True
9,p051,6,114,0.052632,False


In [24]:
test = uncertain_sentences_count.sort_values('uncertan_sentence_index', ascending=False).reset_index(drop=True)

test[test.cb == False].reset_index().to_excel('sentence_uncertainity_score_ncb.xlsx', index=False)

In [37]:
# Arevage number of words in certain vs uncertain sentence
sentence_df.groupby(['svc_wiki_prediction']).agg({'sentence': 'count',
                                                  'no_words': 'mean'}).reset_index()

# With CB split
sentence_df.groupby(['svc_wiki_prediction', 'cb']).agg({'sentence': 'count',
                                                   'no_words': 'mean'}).reset_index()

Unnamed: 0,svc_wiki_prediction,cb,sentence,no_words
0,0,False,12049,15.287825
1,0,True,4319,16.317898
2,1,False,379,16.94723
3,1,True,147,19.884354


In [26]:
# Average number of words certain vs uncertain sentence per paper

test = sentence_df.groupby(['label', 'svc_wiki_prediction']).agg({'sentence': 'count',
                                                                  'no_words': 'mean'}).reset_index()

# Top 10 articles with highest uncertain share
test10 = test[test.label.isin(['p026', 'p032', 'p059', 'p048', 'p074', 'p062', 'p034', 'p057', 'p065', 'p051', 'p007'])]


# Top articles with highest uncertain share have average number of words in unc = 18.586
# and uncer = 15.74275851992153
test10.loc[test10['svc_wiki_prediction'] == 0, 'no_words'].mean()


Unnamed: 0,label,svc_wiki_prediction,sentence,no_words
10,p007,0,166,14.722892
11,p007,1,9,21.0
46,p026,0,159,16.446541
47,p026,1,15,19.4
56,p032,0,277,18.429603
57,p032,1,25,26.88
60,p034,0,171,12.807018
61,p034,1,10,14.2
88,p048,0,244,14.643443
89,p048,1,16,15.625


In [24]:
# Validating Random Forest vs SVM

y = wiki_data['uncertanity_dummy']
wiki_tfidf = tfidf.fit_transform(wiki_data['sentence'])
x = wiki_tfidf

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size=0.3)

# Training random forest on wikipedia data
rf_wiki = RandomForestClassifier(n_estimators=50, n_jobs=-1).fit(X_train, y_train)

svc_wiki = SVC(kernel='linear').fit(X_train, y_train)

# Predict uncertanity based on article data using RF
rf_wiki_prediction = rf_wiki.predict(X_test)

# Predict uncertanity based on article data using SVM
svc_wiki_prediction = svc_wiki.predict(X_test)

print(classification_report(y_test, rf_wiki_prediction))
print(classification_report(y_test, svc_wiki_prediction))


             precision    recall  f1-score   support

          0       0.89      1.00      0.94      4094
          1       0.98      0.68      0.81      1530

avg / total       0.92      0.91      0.90      5624

             precision    recall  f1-score   support

          0       0.89      1.00      0.94      4094
          1       0.99      0.67      0.80      1530

avg / total       0.92      0.91      0.90      5624



In [36]:
# PRecision
(0.91* 4094 + 0.74*1530)/5624

# Recall
#(0.98* 4094 + 0.62*1530)/5624


0.8637517780938834

In [22]:
pd.DataFrame(confusion_matrix(y_test, rf_wiki_prediction))

Unnamed: 0,0,1
0,4080,14
1,499,1031


In [23]:
pd.DataFrame(confusion_matrix(y_test, svc_wiki_prediction))

Unnamed: 0,0,1
0,4080,14
1,503,1027
