In [1]:
import gc
import os
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import pandas as pd
from nltk import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
@lru_cache(maxsize=None)
def _stop_words_for(language):
    """Return the NLTK stop-word list for ``language`` as a frozenset.

    The result is cached so the corpus is read once per language instead of
    once per document passed to ``pos_tag_text``.
    """
    return frozenset(stopwords.words(language))


def pos_tag_text(text):
    """Clean, tokenize and POS-tag ``text``.

    Every run of characters other than ASCII letters is replaced by a single
    space, the text is lower-cased and tokenized, English stop words are
    removed, and the remaining tokens are tagged with ``pos_tag``.

    Args:
        text: Raw document string.

    Returns:
        A list of ``(word, pos)`` tuples. ``pos`` is the WordNet POS constant
        selected by the first letter of the tag (``J`` adjective, ``V`` verb,
        ``N`` noun, ``R`` adverb) or ``None`` for any other tag.
    """
    pos_dict = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    stop_words = _stop_words_for('english')

    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    tokens = word_tokenize(letters_only.lower())
    content_words = [w for w in tokens if w not in stop_words]

    # Only the first letter of the tag is used to pick the WordNet POS.
    return [(word, pos_dict.get(tag[0])) for word, tag in pos_tag(content_words)]

def lemmatize_text(pos_data):
    """Lemmatize POS-tagged words and join them into a single string.

    Args:
        pos_data: Iterable of ``(word, pos)`` tuples, where ``pos`` is a
            WordNet POS constant or ``None``.

    Returns:
        The lemmas separated by single spaces, with surrounding whitespace
        stripped. Words whose ``pos`` is ``None`` are kept unchanged.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = [
        word if pos is None else wordnet_lemmatizer.lemmatize(word, pos=pos)
        for word, pos in pos_data
    ]
    # Build the string in one pass instead of concatenating inside the loop.
    return " ".join(lemmas).strip()

def sentiwordnet_analysis(pos_data):
    """Sum SentiWordNet scores over the POS-tagged words of one document.

    Words are skipped when they have no WordNet POS, when lemmatization
    yields an empty string, or when WordNet has no synset for the lemma with
    that POS. For every other word the first synset returned by WordNet is
    looked up in SentiWordNet and its scores are added to the running totals.

    Args:
        pos_data: Iterable of ``(word, pos)`` tuples, where ``pos`` is a
            WordNet POS constant or a falsy value.

    Returns:
        A tuple ``(score_pos, score_neg, score_obj, score_total)`` where
        ``score_total`` is ``score_pos - score_neg``. The values are sums over
        the scored words, not averages.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    score_pos = 0
    score_neg = 0
    score_obj = 0

    for word, pos in pos_data:
        if not pos:
            continue

        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        if not lemma:
            continue

        synsets = wordnet.synsets(lemma, pos=pos)
        if not synsets:
            continue

        # Take the first sense, the most common
        swn_synset = swn.senti_synset(synsets[0].name())

        score_pos += swn_synset.pos_score()
        score_neg += swn_synset.neg_score()
        score_obj += swn_synset.obj_score()

    return (score_pos, score_neg, score_obj, score_pos - score_neg)

def make_sentiment_features(df):
    """Add VADER and SentiWordNet sentiment columns to ``df``.

    Reads ``df['article_concat']`` and adds, in this order:
    ``article_concat_pos_tag``, ``article_concat_lemma``,
    ``stock_market_news_vader_{neg,pos,neu,compound}`` and
    ``stock_market_news_wordnet_{pos,neg,obj,total}``.

    Each row is scored exactly once per analyzer; the individual columns are
    then extracted from that single result rather than re-running the
    analysis for every output column.

    Args:
        df: DataFrame with an ``article_concat`` column of strings. It is
            modified in place.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    analyzer = SentimentIntensityAnalyzer()

    df['article_concat_pos_tag'] = df['article_concat'].apply(pos_tag_text)
    df['article_concat_lemma'] = df['article_concat_pos_tag'].apply(lemmatize_text)

    # One VADER pass per row; each result is a dict holding all four scores.
    vader_scores = [analyzer.polarity_scores(text) for text in df['article_concat_lemma']]
    for key in ('neg', 'pos', 'neu', 'compound'):
        df['stock_market_news_vader_' + key] = [scores[key] for scores in vader_scores]

    # One SentiWordNet pass per row; each result is a (pos, neg, obj, total) tuple.
    wordnet_scores = [sentiwordnet_analysis(tagged) for tagged in df['article_concat_pos_tag']]
    for position, key in enumerate(('pos', 'neg', 'obj', 'total')):
        df['stock_market_news_wordnet_' + key] = [scores[position] for scores in wordnet_scores]

    return df

In [3]:
%%time
# Load the scraped news CSV, clean it, and compute per-article sentiment features.
path_news = './stock/usa/stock_market_news/stock_market_news.csv'
filter_start_time = '2015-01-01'

df = pd.read_csv(path_news, header=None)
df.columns = ['article_title', 'time', 'article_abstract']

# Drop rows whose time text contains 'ago' (presumably relative timestamps
# such as "3 hours ago" - confirm against the scraper). na=False keeps the
# mask boolean when `time` is missing; those rows become NaT below and are
# removed by the date filter.
has_absolute_time = ~df['time'].str.contains('ago', na=False)
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the raw one.
df = df[has_absolute_time].copy()

df['article_abstract'] = df['article_abstract'].str.replace('Investing.com', '', regex=False)
df['Date'] = pd.to_datetime(df['time'])

df = df.drop_duplicates(subset=['Date', 'article_title'], keep='first')
df = df[['Date', 'article_title', 'article_abstract']]
df = df[df['Date'] >= filter_start_time].copy()

# A missing title or abstract would make the concatenation NaN and break the
# string processing in make_sentiment_features; treat missing text as empty.
df['article_concat'] = df['article_title'].fillna('') + ' ' + df['article_abstract'].fillna('')
df = make_sentiment_features(df)
df

CPU times: user 6min 44s, sys: 3.18 s, total: 6min 47s
Wall time: 6min 50s


Unnamed: 0,Date,article_title,article_abstract,article_concat,article_concat_pos_tag,article_concat_lemma,stock_market_news_vader_neg,stock_market_news_vader_pos,stock_market_news_vader_neu,stock_market_news_vader_compound,stock_market_news_wordnet_pos,stock_market_news_wordnet_neg,stock_market_news_wordnet_obj,stock_market_news_wordnet_total
17,2021-08-14,Fired Alibaba employee suspected of 'forcible ...,SHANGHAI (Reuters) - A former male employee of...,Fired Alibaba employee suspected of 'forcible ...,"[(fired, v), (alibaba, a), (employee, n), (sus...",fire alibaba employee suspect forcible indecen...,0.313,0.060,0.627,-0.8519,0.625,3.375,19.000,-2.750
18,2021-08-14,Top 12 Stocks for the Rest of 2021,Why do most investors underperform the stock m...,Top 12 Stocks for the Rest of 2021 Why do most...,"[(top, a), (stocks, n), (rest, v), (investors,...",top stock rest investor underperform stock mar...,0.000,0.420,0.580,0.8126,1.125,0.375,13.500,0.750
22,2021-08-14,InferVision Gets $140M In Series D2 Funding Ro...,InferVision is a medical artificial intelligen...,InferVision Gets $140M In Series D2 Funding Ro...,"[(infervision, n), (gets, v), (series, n), (fu...",infervision get series funding round lead gold...,0.000,0.119,0.881,0.4767,0.875,0.250,17.875,0.625
23,2021-08-14,Elon Musk Wants To Produce First Tesla At Berl...,Tesla has pushed back opening after local resi...,Elon Musk Wants To Produce First Tesla At Berl...,"[(elon, n), (musk, n), (wants, v), (produce, v...",elon musk want produce first tesla berlin giga...,0.000,0.144,0.856,0.4939,0.750,0.875,20.375,-0.125
24,2021-08-14,India's commerce minister faces heat for criti...,By Abhirup Roy and Aditya Kalra NEW DELHI (Reu...,India's commerce minister faces heat for criti...,"[(india, a), (commerce, n), (minister, n), (fa...",india commerce minister face heat criticism bu...,0.094,0.000,0.906,-0.4404,0.625,1.625,16.750,-1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158379,2015-01-02,Germany stocks mixed at close of trade,– Germany stocks were mixed after the close o...,Germany stocks mixed at close of trade – Germ...,"[(germany, n), (stocks, n), (mixed, v), (close...",germany stock mix close trade germany stock mi...,0.089,0.216,0.695,0.5106,0.375,0.500,19.125,-0.125
158380,2015-01-02,European stocks re-open higher after Draghi re...,"- European stocks were higher on Friday, as m...",European stocks re-open higher after Draghi re...,"[(european, a), (stocks, n), (open, v), (highe...",european stock open high draghi remark dax eur...,0.000,0.105,0.895,0.4019,0.500,0.250,17.250,0.250
158381,2015-01-02,Australia stocks higher at close of trade,– Australia stocks were higher after the clos...,Australia stocks higher at close of trade – A...,"[(australia, r), (stocks, n), (higher, r), (cl...",australia stock higher close trade australia s...,0.000,0.319,0.681,0.8360,0.000,0.000,18.000,0.000
158382,2015-01-02,"Euro starts new year at new low, oil struggles...",By Wayne Cole SYDNEY (Reuters) - The euro star...,"Euro starts new year at new low, oil struggles...","[(euro, n), (starts, v), (new, a), (year, n), ...",euro start new year new low oil struggle rally...,0.192,0.068,0.740,-0.4939,1.125,0.375,19.500,0.750


In [5]:
# Spot-check the first row of the feature frame, including the derived sentiment columns.
df.head(1)

Unnamed: 0,Date,article_title,article_abstract,article_concat,article_concat_pos_tag,article_concat_lemma,stock_market_news_vader_neg,stock_market_news_vader_pos,stock_market_news_vader_neu,stock_market_news_vader_compound,stock_market_news_wordnet_pos,stock_market_news_wordnet_neg,stock_market_news_wordnet_obj,stock_market_news_wordnet_total
17,2021-08-14,Fired Alibaba employee suspected of 'forcible ...,SHANGHAI (Reuters) - A former male employee of...,Fired Alibaba employee suspected of 'forcible ...,"[(fired, v), (alibaba, a), (employee, n), (sus...",fire alibaba employee suspect forcible indecen...,0.313,0.06,0.627,-0.8519,0.625,3.375,19.0,-2.75


In [7]:
# Aggregate article-level sentiment to one row per Date and export it as CSV.
export_path = './exported_data/stock_market_news_sentiment_analysis.csv'
sentiment_columns = [
    'stock_market_news_vader_neg',
    'stock_market_news_vader_pos',
    'stock_market_news_vader_neu',
    'stock_market_news_vader_compound',
    'stock_market_news_wordnet_pos',
    'stock_market_news_wordnet_neg',
    'stock_market_news_wordnet_obj',
    'stock_market_news_wordnet_total',
]

daily_sentiment = (
    df.groupby('Date')
    .agg(
        # Number of articles published on the day.
        stock_market_news_in_day=('article_title', 'count'),
        # Every sentiment score is summed over the day's articles and keeps its name.
        **{column: (column, 'sum') for column in sentiment_columns},
    )
    .reset_index()
)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(export_path), exist_ok=True)
daily_sentiment.to_csv(export_path, index=False)