In [1]:
import os
import gc
import pandas as pd
from tqdm.notebook import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
import matplotlib.pyplot as plt
import re

In [5]:
_VADER_ANALYZER = None  # shared analyzer instance, built lazily on first use


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, constructing it on the first call."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def vader_sentiment_article(list_word, analyzer=None):
    """Classify an article as positive, negative or neutral from its tokens.

    Every token is scored on its own with ``polarity_scores``; the per-token
    ``pos - neg`` differences are summed and only the sign of that total is
    reported.

    Parameters
    ----------
    list_word : list of str
        Tokens of the article (one VADER call is made per token).
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with ``'pos'`` and ``'neg'`` entries. When omitted, a single shared
        ``SentimentIntensityAnalyzer`` is reused across calls instead of
        building a new one for every article.

    Returns
    -------
    int
        ``1`` if the summed difference is positive, ``-1`` if it is negative,
        ``0`` otherwise (including an empty token list).
    """
    if analyzer is None:
        analyzer = _get_vader_analyzer()

    token_scores = [analyzer.polarity_scores(word) for word in list_word]

    # Summed through a float64 Series so the total is computed the same way
    # as a pandas column sum, without building a whole DataFrame per article.
    sum_diff = pd.Series(
        [score['pos'] - score['neg'] for score in token_scores],
        dtype='float64',
    ).sum()

    if sum_diff < 0:
        return -1
    if sum_diff > 0:
        return 1
    return 0

def make_sentiment_features(df):
    """Add VADER-based sentiment columns to a frame of articles.

    The frame is modified in place and the same object is returned. Columns
    written:

    * ``article_title_split`` - ``article_title`` with tabs/newlines turned
      into spaces and then split on single spaces.
    * ``vader_sentiment`` - result of ``vader_sentiment_article`` on that list.
    * ``vader_sentiment_pos`` / ``vader_sentiment_neg`` - 0/1 flags for a
      strictly positive / strictly negative ``vader_sentiment``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``article_title`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the columns above added.
    """
    print(df.shape)

    def _split_words(text):
        # A missing title/paragraph makes the concatenated text NaN; treat it
        # as empty text instead of failing on ``float.replace``.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)
        return text.replace('\t', ' ').replace('\n', ' ').split(' ')

    df['article_title_split'] = df['article_title'].apply(_split_words)
    df['vader_sentiment'] = df['article_title_split'].apply(vader_sentiment_article)
    df['vader_sentiment_pos'] = (df['vader_sentiment'] > 0).astype('int')
    df['vader_sentiment_neg'] = (df['vader_sentiment'] < 0).astype('int')

    # Disabled: relies on `lm_sentiment_article` and an `article_concat_split`
    # column, neither of which is defined in the visible cells.
#     df['lm_sentiment'] = df['article_concat_split'].apply(lm_sentiment_article)
#     df['lm_sentiment_pos'] = (df['lm_sentiment'] > 0).astype('int')
#     df['lm_sentiment_neg'] = (df['lm_sentiment'] < 0).astype('int')

    return df

def daily_sentiment_score(pos_num, neg_num):
    """Turn counts of positive and negative articles into a score in [-1, 1].

    When positives outnumber negatives the score is
    ``2 * pos_num / (pos_num + neg_num) - 1``; when negatives outnumber
    positives it is ``1 - 2 * neg_num / (pos_num + neg_num)``. If neither
    count is strictly larger than the other, the score is ``0``.
    """
    positive_leads = pos_num > neg_num
    negative_leads = pos_num < neg_num

    # Neither side strictly ahead (ties, including 0 vs 0): no division needed.
    if not (positive_leads or negative_leads):
        return 0

    total = pos_num + neg_num
    if positive_leads:
        return 2 * pos_num / total - 1
    return 1 - 2 * neg_num / total

In [6]:
# Drop any frame left over from a previous run of this cell before loading again.
df = None
# NOTE(review): filter_start_time is not used in the visible cells - confirm
# whether a `Date >= filter_start_time` filter was intended.
filter_start_time = '2005-01-01'
path_news = './stock/vn/political_vn.csv'

# Built as one assignment-based chain: the previous
# `df['para'].fillna('', inplace=True)` acted on a column selected from a
# filtered frame, which pandas warns about and which does not update `df`
# under Copy-on-Write, leaving NaN paragraphs behind.
df = (
    pd.read_csv(path_news)
    # Articles without a timestamp cannot be assigned to a day.
    .dropna(subset=['time'])
    .assign(
        para=lambda frame: frame['para'].fillna(''),
        Date=lambda frame: pd.to_datetime(frame['time']),
    )
    # The same title published at the same timestamp is kept only once.
    .drop_duplicates(subset=['Date', 'title'], keep='first')
    # Sentiment is computed on the title followed by the paragraph text.
    .assign(article_title=lambda frame: frame['title'] + ' ' + frame['para'])
    .drop(columns='time')
)

df = make_sentiment_features(df)

df.head()

(16381, 5)


Unnamed: 0,title,para,link,Date,article_title,article_title_split,vader_sentiment,vader_sentiment_pos,vader_sentiment_neg
0,Việt Nam highlights need to avoid negative imp...,The economy was basically stabilised in the fi...,/politics-laws/1009908/viet-nam-highlights-nee...,2021-08-12 13:57:00,Việt Nam highlights need to avoid negative imp...,"[Việt, Nam, highlights, need, to, avoid, negat...",-1,0,1
1,"Economy stable, pandemic basically controlled: PM",Prime Minister Phạm Minh Chính expressed deter...,/politics-laws/1009835/economy-stable-pandemic...,2021-08-12 11:48:00,"Economy stable, pandemic basically controlled:...","[Economy, stable, pandemic, basically, control...",1,1,0
2,"PM calls for a government of innovation, integ...",Prime Minister Pham Minh Chính made proposals ...,/politics-laws/1009122/pm-calls-for-a-governme...,2021-08-12 08:00:00,"PM calls for a government of innovation, integ...","[PM, calls, for, a, government, of, innovation...",1,1,0
3,President Phúc meets with Chairman of the Lao ...,Việt Nam and the Czech Republic have vowed to ...,/politics-laws/1008250/president-phuc-meets-wi...,2021-08-10 14:43:00,President Phúc meets with Chairman of the Lao ...,"[President, Phúc, meets, with, Chairman, of, t...",-1,0,1
4,"President meets with PM, visits former leaders...",President Nguyễn Xuân Phúc and General Secreta...,/politics-laws/1008039/president-meets-with-pm...,2021-08-10 08:14:00,"President meets with PM, visits former leaders...","[President, meets, with, PM,, visits, former, ...",1,1,0


In [7]:
# Count positive and negative articles per calendar day.
# `Date` holds full timestamps (time of day included), so grouping on it
# directly would produce one group per distinct publication time rather than
# one per day; normalize() sets the time to midnight while keeping the
# datetime dtype. The group key is a Series named 'Date', so reset_index()
# still yields a 'Date' column.
df_sentiment = (
    df.groupby(df['Date'].dt.normalize())
    .agg(
        vader_sentiment_pos=('vader_sentiment_pos', 'sum'),
        vader_sentiment_neg=('vader_sentiment_neg', 'sum'),
    )
    .reset_index()
)

# One score per day from that day's positive/negative article counts.
df_sentiment['politic_daily_sentiment_score_vader'] = df_sentiment.apply(
    lambda day: daily_sentiment_score(day['vader_sentiment_pos'], day['vader_sentiment_neg']),
    axis=1,
)

df_sentiment[['Date', 'politic_daily_sentiment_score_vader']].head()

Unnamed: 0,Date,politic_daily_sentiment_score_vader
0,2005-01-06,0.333333
1,2005-01-08,1.0
2,2005-01-10,1.0
3,2005-01-11,0.666667
4,2005-01-12,1.0


In [8]:
# Write the per-day VADER score (date + score only) for downstream use.
export_path = './exported_data/news_sentiment_analysis/vn_political_news_sentiment_analysis.csv'

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(export_path), exist_ok=True)

df_sentiment[['Date', 'politic_daily_sentiment_score_vader']] \
    .to_csv(export_path, index=False)