In [1]:
import os
import gc
import pandas as pd
from tqdm.notebook import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
import matplotlib.pyplot as plt
import re

In [None]:
# def pos_tag_text(text):
#     stop_words = set(stopwords.words('english'))
#     pos_dict = {
#         'J' : wordnet.ADJ, 
#         'V' : wordnet.VERB, 
#         'N' : wordnet.NOUN, 
#         'R' : wordnet.ADV
#     }

#     text = re.sub('[^A-Za-z]+', ' ', text)
#     token = word_tokenize(text.lower())
#     words_list = [w for w in token if not w in stop_words]
#     pos_tagged = pos_tag(words_list)
#     pos_data = [(w, pos_dict.get(p[0])) for w, p in pos_tagged]
    
#     return pos_data

# def lemmatize_text(pos_data):
#     wordnet_lemmatizer = WordNetLemmatizer()
#     lemma_rew = " "
#     for word, pos in pos_data:
#         if pos is None:
#             lemma = word
#             lemma_rew = lemma_rew + " " + lemma
#         else:
#             lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
#             lemma_rew = lemma_rew + " " + lemma
            
#     return lemma_rew.strip()

# def sentiwordnet_analysis(pos_data):
#     wordnet_lemmatizer = WordNetLemmatizer()
#     tokens_count = 0
#     score_pos = 0
#     score_neg = 0
#     score_obj = 0
#     for word, pos in pos_data:
#         if not pos:
#             continue
            
#         lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
#         if not lemma:
#             continue
        
#         synsets = wordnet.synsets(lemma, pos=pos)
#         if not synsets:
#             continue
            
#         # Take the first sense, the most common
#         synset = synsets[0]
#         swn_synset = swn.senti_synset(synset.name())
        
#         score_pos += swn_synset.pos_score()
#         score_neg += swn_synset.neg_score()
#         score_obj += swn_synset.obj_score()
        
#         tokens_count += 1
    
#     score_total = score_pos - score_neg
    
#     return (score_pos, score_neg, score_obj, score_total)

In [2]:
def _load_word_scores(csv_path, score):
    """
    Read the first column of a header-less CSV and map each lower-cased entry to `score`.

    Only membership in the resulting dict is used by `lm_analysis`; the score value
    records the polarity of the list the word came from.
    """
    words = pd.read_csv(csv_path, header=None)[0].tolist()
    return {entry.lower(): score for entry in words}


# Sentiment word lists (presumably the Loughran-McDonald lexicon - confirm the file origin).
lm_negative = _load_word_scores('lm_negative.csv', -1)
lm_positive = _load_word_scores('lm_positive.csv', 1)

# Negation cues, listed both with and without apostrophes. Kept as a module-level
# frozenset so it is built once and membership tests are O(1) instead of a list scan
# on every call.
_NEGATION_WORDS = frozenset([
    "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt", "ain't", "aren't", "can't",
    "couldn't", "daren't", "didn't", "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt",
    "neither", "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't",
    "never", "none", "nope", "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "wasnt",
    "werent", "oughtn't", "shan't", "shouldn't", "wasn't", "weren't", "without", "wont", "wouldnt", "won't",
    "wouldn't", "rarely", "seldom", "despite", "no", "nobody",
])


def negated(word):
    """
    Determine whether `word` is a negation word.

    The comparison is case-insensitive. Callers pass the word(s) preceding a
    sentiment term to decide whether that term's polarity should be flipped.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    bool
        True if the lower-cased token is one of the known negation cues.
    """
    return word.lower() in _NEGATION_WORDS

def lm_analysis(text):
    """
    Score `text` against the `lm_positive` / `lm_negative` word dictionaries.

    The text is lower-cased and split into alphabetic tokens (keeping "n't"
    contractions and "'s" possessives attached). Every token found in
    `lm_negative` counts as negative. Every token found in `lm_positive` counts
    as positive, unless one of the up-to-three preceding tokens is a negation
    word (see `negated`), in which case it counts as negative instead.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    dict
        'Positive'     : positive hits / token count
        'Negative'     : negative hits / token count
        'Polarity'     : (positive - negative) / (positive + negative), 0 if no hits
        'Subjectivity' : (positive + negative) / token count
        All four values are 0.0 when the text contains no alphabetic tokens.
    """
    pos_count = 0
    neg_count = 0
    input_words = re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', text.lower())
    word_count = len(input_words)

    for i, word in enumerate(input_words):
        if word in lm_negative:
            neg_count += 1

        if word in lm_positive:
            # Look back at most three tokens; the slice is simply shorter (or empty)
            # near the start of the text, so the first token is never negated.
            preceding = input_words[max(0, i - 3):i]
            if any(negated(previous) for previous in preceding):
                neg_count += 1
            else:
                pos_count += 1

    sentiment_count = pos_count + neg_count
    if sentiment_count > 0:
        polarity = (pos_count - neg_count) / sentiment_count
    else:
        polarity = 0.0

    if word_count > 0:
        subjectivity = sentiment_count / word_count
        pos_rate = pos_count / word_count
        neg_rate = neg_count / word_count
    else:
        # No tokens at all (empty / numeric-only / punctuation-only text):
        # report neutral rates instead of dividing by zero.
        subjectivity = pos_rate = neg_rate = 0.0

    return {'Positive' : pos_rate,
            'Negative' : neg_rate,
            'Polarity' : polarity,
            'Subjectivity' : subjectivity}

In [3]:
def make_sentiment_features(df):
    """
    Add VADER and dictionary-based sentiment columns computed from ``df['article_concat']``.

    Columns added (all prefixed ``economy_indicator_news_``):
      * ``vader_neg``, ``vader_pos``, ``vader_neu``, ``vader_compound`` - the
        corresponding entries of ``SentimentIntensityAnalyzer.polarity_scores``.
      * ``lm_pos``, ``lm_neg``, ``lm_polar``, ``lm_subject`` - the 'Positive',
        'Negative', 'Polarity' and 'Subjectivity' entries of ``lm_analysis``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``article_concat``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new columns appended.
    """
    analyzer = SentimentIntensityAnalyzer()
    texts = df['article_concat']

    # Score each article once per method and fan the resulting dicts out into
    # columns, rather than re-running the scorer for every output column.
    vader_scores = texts.apply(analyzer.polarity_scores)
    for key in ('neg', 'pos', 'neu', 'compound'):
        # k=key binds the current key so each lambda reads its own entry.
        df['economy_indicator_news_vader_' + key] = vader_scores.apply(lambda scores, k=key: scores[k])

    lm_scores = texts.apply(lm_analysis)
    lm_columns = (('pos', 'Positive'),
                  ('neg', 'Negative'),
                  ('polar', 'Polarity'),
                  ('subject', 'Subjectivity'))
    for suffix, key in lm_columns:
        df['economy_indicator_news_lm_' + suffix] = lm_scores.apply(lambda scores, k=key: scores[k])

    return df

In [4]:
path_news = './stock/usa/economic_indicator_news/economic_indicator_news.csv'
filter_start_time = '2015-01-01'

# The CSV has no header row, so the three columns are named positionally.
df = pd.read_csv(path_news, header=None)
df.columns = ['article_title', 'time', 'article_abstract']
df['Date'] = pd.to_datetime(df['time'])
# Keep the first occurrence of each (Date, title) pair.
df = df.drop_duplicates(subset=['Date', 'article_title'], keep='first')
df = df[['Date', 'article_title', 'article_abstract']]

# .copy() detaches the filtered rows from the parent frame so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[df['Date'] >= filter_start_time].copy()

# A missing title or abstract would turn the whole concatenation into NaN and
# crash the text scorers, so treat missing parts as empty strings.
df['article_concat'] = df['article_title'].fillna('') + ' ' + df['article_abstract'].fillna('')
df = make_sentiment_features(df)

df.head()

Unnamed: 0,Date,article_title,article_abstract,article_concat,economy_indicator_news_vader_neg,economy_indicator_news_vader_pos,economy_indicator_news_vader_neu,economy_indicator_news_vader_compound,economy_indicator_news_lm_pos,economy_indicator_news_lm_neg,economy_indicator_news_lm_polar,economy_indicator_news_lm_subject
0,2021-08-13,U.S. consumer sentiment plummets in early Augu...,By Evan Sully and Lindsay (NYSE:LNN) Dunsmuir ...,U.S. consumer sentiment plummets in early Augu...,0.165,0.0,0.835,-0.7269,0.0,0.071429,-1.0,0.071429
1,2021-08-13,EU exports to Britain rise in June as they fal...,By Francesco Guarascio BRUSSELS (Reuters) -Eur...,EU exports to Britain rise in June as they fal...,0.0,0.0,1.0,0.0,0.0,0.023256,-1.0,0.023256
2,2021-08-13,Taiwan expects fastest growth in a decade this...,By Emily Chan TAIPEI (Reuters) -Taiwan's econo...,Taiwan expects fastest growth in a decade this...,0.0,0.058,0.942,0.3818,0.0,0.0,0.0,0.0
3,2021-08-13,Japan inflation seen weak despite export boom ...,By Daniel Leussink and Leika Kihara TOKYO (Reu...,Japan inflation seen weak despite export boom ...,0.046,0.0,0.954,-0.2382,0.0,0.025,-1.0,0.025
4,2021-08-12,U.S. producer prices at more than decade high;...,WASHINGTON (Reuters) -Producer prices posted t...,U.S. producer prices at more than decade high;...,0.059,0.093,0.848,0.0772,0.0,0.054054,-1.0,0.054054


In [5]:
# (rows, columns) of the scored article frame after de-duplication and date filtering.
df.shape

(20037, 12)

In [6]:
# Summary statistics of the sentiment feature columns, as a sanity check on their ranges.
df.describe()

Unnamed: 0,economy_indicator_news_vader_neg,economy_indicator_news_vader_pos,economy_indicator_news_vader_neu,economy_indicator_news_vader_compound,economy_indicator_news_lm_pos,economy_indicator_news_lm_neg,economy_indicator_news_lm_polar,economy_indicator_news_lm_subject
count,20037.0,20037.0,20037.0,20037.0,20037.0,20037.0,20037.0,20037.0
mean,0.047569,0.05796,0.89447,0.042559,0.00013,0.031595,-0.572085,0.031724
std,0.067898,0.073425,0.097398,0.450434,0.00194,0.036059,0.497532,0.036126
min,0.0,0.0,0.534,-0.9741,0.0,0.0,-1.0,0.0
25%,0.0,0.0,0.825,-0.2023,0.0,0.0,-1.0,0.0
50%,0.0,0.031,0.905,0.0,0.0,0.026316,-1.0,0.026316
75%,0.084,0.101,1.0,0.3716,0.0,0.052632,0.0,0.052632
max,0.452,0.449,1.0,0.9661,0.057143,0.258065,1.0,0.258065


In [7]:
# Per-article sentiment columns that are summed within each Date.
sentiment_columns = [
    'economy_indicator_news_vader_neg',
    'economy_indicator_news_vader_pos',
    'economy_indicator_news_vader_neu',
    'economy_indicator_news_vader_compound',
    'economy_indicator_news_lm_pos',
    'economy_indicator_news_lm_neg',
    'economy_indicator_news_lm_polar',
    'economy_indicator_news_lm_subject',
]
export_path = './exported_data/news_sentiment_analysis/economic_indicator_news_sentiment_analysis.csv'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# export folder exists before writing (needed for a clean Restart & Run All).
os.makedirs(os.path.dirname(export_path), exist_ok=True)

# One output row per Date: the number of articles plus the sum of every sentiment
# column (sums, not means - divide by economy_indicator_news_in_day downstream if
# a per-article average is wanted).
df.groupby(['Date']) \
    .agg(economy_indicator_news_in_day=('article_title', 'count'),
         **{column: (column, 'sum') for column in sentiment_columns}) \
    .reset_index() \
    .to_csv(export_path, index=False)