In [1]:
import os
import gc
import pandas as pd
from tqdm.notebook import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
import matplotlib.pyplot as plt
import re

In [2]:
# Directory holding the raw political-news files.
path_news = './stock/au/politicals_au/'

# Full paths of the regular files directly inside `path_news`
# (sub-directories are skipped). Each path is joined once and then
# filtered with os.path.isfile.
list_file_news = [
    candidate
    for candidate in (os.path.join(path_news, entry) for entry in os.listdir(path_news))
    if os.path.isfile(candidate)
]

In [3]:
# Shared analyzer instance, created lazily by _get_vader_analyzer().
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def vader_sentiment_article(list_word):
    """Classify one article as positive, negative or neutral with VADER.

    Every token is scored on its own with ``polarity_scores``; the article
    label is the sign of the summed ``pos - neg`` differences.

    Parameters
    ----------
    list_word : list of str
        Tokens of the article (already split by the caller).

    Returns
    -------
    int
        ``1`` if the summed difference is positive, ``-1`` if it is negative,
        ``0`` otherwise (including an empty token list).
    """
    # Reuse a single analyzer: building one per article repeats the same
    # set-up work for every row of the input frame.
    analyzer = _get_vader_analyzer()

    # NOTE(review): tokens are scored in isolation, so any sentence-level
    # context VADER could use is not available here - confirm this is intended.
    word_scores = [analyzer.polarity_scores(word) for word in list_word]

    # Sum through a float64 Series so the arithmetic matches the previous
    # DataFrame-based implementation exactly.
    sum_diff = pd.Series(
        [score['pos'] - score['neg'] for score in word_scores],
        dtype='float64',
    ).sum()

    if sum_diff < 0:
        return -1
    if sum_diff > 0:
        return 1
    return 0

def make_sentiment_features(df):
    """Add VADER sentiment columns to a frame of article titles.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``article_title``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these extra columns:

        * ``article_concat_split`` - title with tabs/newlines turned into
          spaces and then split on single spaces,
        * ``vader_sentiment`` - result of ``vader_sentiment_article``
          (-1, 0 or 1),
        * ``vader_sentiment_pos`` / ``vader_sentiment_neg`` - 0/1 indicator
          columns for a positive / negative label.

    The input frame itself is left untouched.
    """
    print(df.shape)

    # Work on a copy: callers pass filtered slices, and assigning new columns
    # to a slice both mutates the caller's data and triggers pandas'
    # SettingWithCopyWarning.
    df = df.copy()

    df['article_concat_split'] = df['article_title'].apply(lambda x: x.replace('\t', ' ').replace('\n', ' ').split(' '))
    df['vader_sentiment'] = df['article_concat_split'].apply(vader_sentiment_article)
    df['vader_sentiment_pos'] = (df['vader_sentiment'] > 0).astype('int')
    df['vader_sentiment_neg'] = (df['vader_sentiment'] < 0).astype('int')

    # Disabled: lexicon-based (lm) sentiment features.
#     df['lm_sentiment'] = df['article_concat_split'].apply(lm_sentiment_article)
#     df['lm_sentiment_pos'] = (df['lm_sentiment'] > 0).astype('int')
#     df['lm_sentiment_neg'] = (df['lm_sentiment'] < 0).astype('int')

    return df

def daily_sentiment_score(pos_num, neg_num):
    """Turn daily counts of positive and negative articles into one score.

    Parameters
    ----------
    pos_num : number
        Count of positive articles for the day.
    neg_num : number
        Count of negative articles for the day.

    Returns
    -------
    float or int
        ``2 * pos / (pos + neg) - 1`` when positives dominate,
        ``1 - 2 * neg / (pos + neg)`` when negatives dominate,
        and the integer ``0`` when neither count is larger.
    """
    if pos_num > neg_num:
        score = 2 * pos_num / (pos_num + neg_num) - 1
    elif pos_num < neg_num:
        score = 1 - 2 * neg_num / (pos_num + neg_num)
    else:
        # Neither side dominates (this also covers the 0 / 0 case, so the
        # divisions above are never reached with an empty day).
        score = 0
    return score

In [4]:
# Load every CSV in `list_file_news`, keep articles dated on/after
# `filter_start_time`, de-duplicate them, attach sentiment features and
# combine everything into `df`.
filter_start_time = '2005-01-01'

# Per-file frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated frame every time.
list_df_news = []

for file in tqdm(list_file_news):
    file_extension = os.path.splitext(file)[1]
    if file_extension != '.csv':
        continue

    # Files have no header row: first column is the title, second the timestamp.
    df_news = pd.read_csv(file, header=None)
    df_news.columns = ['article_title', 'time']
    df_news['Date'] = pd.to_datetime(df_news['time'])

    # Filter, select and de-duplicate into a fresh frame instead of using
    # `inplace=True` on a filtered slice (which raises SettingWithCopyWarning).
    df_news = (
        df_news
        .loc[df_news['Date'] >= filter_start_time, ['Date', 'article_title']]
        .drop_duplicates(subset=['Date', 'article_title'], keep='first')
        .copy()
    )

    if len(df_news) == 0:
        continue

    list_df_news.append(make_sentiment_features(df_news))

# `df` stays None when no file contributed any rows. The per-file index
# labels are kept as-is, so they repeat across files.
df = pd.concat(list_df_news, axis=0) if list_df_news else None

  0%|          | 0/16 [00:00<?, ?it/s]

(2, 2)
(8, 2)
(13, 2)
(7, 2)
(59, 2)
(35, 2)
(61, 2)
(1920, 2)
(3887, 2)
(3837, 2)
(3895, 2)
(3599, 2)
(3351, 2)
(3183, 2)
(3071, 2)
(1963, 2)


In [5]:
# Preview the first rows of the combined per-article frame. Index labels
# repeat (0, 1, 0, 1, 2, ...) because each file's own index is kept when the
# per-file frames are concatenated.
df.head()

Unnamed: 0,Date,article_title,article_concat_split,vader_sentiment,vader_sentiment_pos,vader_sentiment_neg
0,2005-02-22,"After a word in his ear, minister finds son","[After, a, word, in, his, ear,, minister, find...",0,0,0
1,2005-01-19,Labor leader quits politics,"[Labor, leader, quits, politics]",0,0,0
0,2007-08-19,Strip club visit hurts Australian party leader,"[Strip, club, visit, hurts, Australian, party,...",0,0,0
1,2007-12-01,Rudd sets date for Iraq pull-out,"[Rudd, sets, date, for, Iraq, pull-out]",0,0,0
2,2007-11-18,Australia's ballot goes to the wire,"[Australia's, ballot, goes, to, the, wire]",0,0,0


In [6]:
# Daily aggregation: count positive and negative articles per date, then
# convert the two counts into one score per day with daily_sentiment_score.
df_sentiment = (
    df.groupby(['Date'])
    .agg(
        vader_sentiment_pos=('vader_sentiment_pos', 'sum'),
        vader_sentiment_neg=('vader_sentiment_neg', 'sum'),
    )
    .reset_index()
    .assign(
        political_daily_sentiment_score_vader=lambda daily: daily.apply(
            lambda row: daily_sentiment_score(
                row['vader_sentiment_pos'],
                row['vader_sentiment_neg'],
            ),
            axis=1,
        )
    )
)

df_sentiment[['Date', 'political_daily_sentiment_score_vader']].head()

Unnamed: 0,Date,political_daily_sentiment_score_vader
0,2005-01-19,0.0
1,2005-02-22,0.0
2,2007-08-19,0.0
3,2007-11-18,0.0
4,2007-11-23,0.0


In [7]:
# Export the daily score (date + score only, no index column).
output_csv_path = './exported_data/news_sentiment_analysis/au_political_news_sentiment_analysis.csv'

# Create the export directory if needed; to_csv does not create missing
# parent directories and would otherwise fail after all the work above.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

df_sentiment[['Date', 'political_daily_sentiment_score_vader']] \
    .to_csv(output_csv_path, index=False)