In [1]:
import os
import gc
import pandas as pd
from tqdm.notebook import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
import matplotlib.pyplot as plt
import re

In [2]:
# Folder that holds the political-news input files.
path_news = './stock/usa/politicals_news/'

# Full paths of the regular files directly inside `path_news` (sub-directories
# are skipped). os.listdir() returns entries in arbitrary order, so the paths
# are sorted to make the processing order identical on every run and machine.
list_file_news = sorted(
    file_path
    for file_path in (os.path.join(path_news, filename) for filename in os.listdir(path_news))
    if os.path.isfile(file_path)
)

In [3]:
# def pos_tag_text(text):
#     stop_words = set(stopwords.words('english'))
#     pos_dict = {
#         'J' : wordnet.ADJ, 
#         'V' : wordnet.VERB, 
#         'N' : wordnet.NOUN, 
#         'R' : wordnet.ADV
#     }

#     text = re.sub('[^A-Za-z]+', ' ', text)
#     token = word_tokenize(text.lower())
#     words_list = [w for w in token if not w in stop_words]
#     pos_tagged = pos_tag(words_list)
#     pos_data = [(w, pos_dict.get(p[0])) for w, p in pos_tagged]
    
#     return pos_data

# def lemmatize_text(pos_data):
#     wordnet_lemmatizer = WordNetLemmatizer()
#     lemma_rew = " "
#     for word, pos in pos_data:
#         if pos is None:
#             lemma = word
#             lemma_rew = lemma_rew + " " + lemma
#         else:
#             lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
#             lemma_rew = lemma_rew + " " + lemma
            
#     return lemma_rew.strip()

# def sentiwordnet_analysis(pos_data):
#     wordnet_lemmatizer = WordNetLemmatizer()
#     tokens_count = 0
#     score_pos = 0
#     score_neg = 0
#     score_obj = 0
#     for word, pos in pos_data:
#         if not pos:
#             continue
            
#         lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
#         if not lemma:
#             continue
        
#         synsets = wordnet.synsets(lemma, pos=pos)
#         if not synsets:
#             continue
            
#         # Take the first sense, the most common
#         synset = synsets[0]
#         swn_synset = swn.senti_synset(synset.name())
        
#         score_pos += swn_synset.pos_score()
#         score_neg += swn_synset.neg_score()
#         score_obj += swn_synset.obj_score()
        
#         tokens_count += 1
    
#     score_total = score_pos - score_neg
    
#     return (score_pos, score_neg, score_obj, score_total)

In [9]:
# Shared analyzer instance, created lazily by _get_vader_analyzer().
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, building it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        # Constructing the analyzer loads its lexicon files, so do it only once
        # instead of once per headline.
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def vader_sentiment_article(list_word, analyzer=None):
    """Classify a tokenised text as positive, negative or neutral with VADER.

    Every token is scored on its own and the per-token differences
    ``pos - neg`` are added up; only the sign of that total is returned.

    NOTE(review): scoring token by token means VADER never sees neighbouring
    words (negations, intensifiers). Presumably intentional - confirm.

    Parameters
    ----------
    list_word : iterable of str
        Tokens of one text (e.g. a headline split on spaces).
    analyzer : object, optional
        Anything exposing ``polarity_scores(text)`` that returns a mapping
        with ``'pos'`` and ``'neg'`` entries. Defaults to a shared
        ``SentimentIntensityAnalyzer``.

    Returns
    -------
    int
        ``1`` if the total is positive, ``-1`` if negative, ``0`` otherwise
        (including an empty token list).
    """
    if analyzer is None:
        analyzer = _get_vader_analyzer()

    sum_diff = 0
    for word in list_word:
        scores = analyzer.polarity_scores(word)
        sum_diff += scores['pos'] - scores['neg']

    if sum_diff < 0:
        return -1
    if sum_diff > 0:
        return 1
    return 0

def _split_article_title(title):
    """Tokenise a title: tabs/newlines become spaces, then split on single spaces.

    Missing or non-string titles (e.g. NaN produced by ``read_csv`` for an
    empty field) have no ``.replace`` method, so they yield an empty token
    list instead of raising ``AttributeError``.
    """
    if not isinstance(title, str):
        return []
    return title.replace('\t', ' ').replace('\n', ' ').split(' ')


def make_sentiment_features(df):
    """Add VADER sentiment columns derived from ``df['article_title']``.

    Columns written:
      * ``article_concat_split`` - list of tokens of the title
      * ``vader_sentiment``      - result of ``vader_sentiment_article`` on those tokens
      * ``vader_sentiment_pos``  - 1 where ``vader_sentiment > 0``, else 0
      * ``vader_sentiment_neg``  - 1 where ``vader_sentiment < 0``, else 0

    The columns are added to ``df`` itself (the frame is modified in place)
    and the same object is returned. The frame's shape is printed on entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``article_title`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the four columns above.
    """
    print(df.shape)
    df['article_concat_split'] = df['article_title'].apply(_split_article_title)
    df['vader_sentiment'] = df['article_concat_split'].apply(vader_sentiment_article)
    df['vader_sentiment_pos'] = (df['vader_sentiment'] > 0).astype('int')
    df['vader_sentiment_neg'] = (df['vader_sentiment'] < 0).astype('int')

    # Disabled second scorer; `lm_sentiment_article` is not defined in the visible cells.
#     df['lm_sentiment'] = df['article_concat_split'].apply(lm_sentiment_article)
#     df['lm_sentiment_pos'] = (df['lm_sentiment'] > 0).astype('int')
#     df['lm_sentiment_neg'] = (df['lm_sentiment'] < 0).astype('int')

    return df

def daily_sentiment_score(pos_num, neg_num):
    """Turn counts of positive and negative items into one signed score.

    The larger count's share of the total is rescaled so that the score is
    positive when positives dominate and negative when negatives dominate;
    for non-negative counts it lies in [-1, 1]. Equal counts (including
    0 and 0) give 0, so no division by zero occurs in that case.

    Parameters
    ----------
    pos_num : number
        Count of positive items.
    neg_num : number
        Count of negative items.

    Returns
    -------
    number
        ``2 * pos / (pos + neg) - 1`` if ``pos > neg``,
        ``1 - 2 * neg / (pos + neg)`` if ``pos < neg``, otherwise ``0``.
    """
    total = pos_num + neg_num

    if pos_num > neg_num:
        score = 2 * pos_num / total - 1
    elif pos_num < neg_num:
        score = 1 - 2 * neg_num / total
    else:
        score = 0

    return score

In [10]:
# Build `df`: one row per (Date, article_title) with VADER sentiment columns,
# gathered from every '.csv' file in `list_file_news`.
df = None
filter_start_time = '2005-01-01'

# Date substituted for relative timestamps such as "3 hours ago".
# NOTE(review): presumably the day the news was collected - confirm.
relative_time_fallback = 'Aug 10, 2021'

# Per-file results are collected here and concatenated once after the loop,
# instead of re-copying the growing frame on every iteration.
news_frames = []

for file in tqdm(list_file_news):
    file_name, file_extension = os.path.splitext(os.path.basename(file))
    if file_extension != '.csv':
        continue

    # Files have no header row: first column is the title, second the timestamp.
    df_news = pd.read_csv(file, header=None)
    df_news.columns = ['article_title', 'time']

    # na=False keeps the mask strictly boolean when a timestamp is missing;
    # a mask containing NaN is rejected by .loc.
    is_relative_time = df_news['time'].str.contains('ago', na=False)
    df_news.loc[is_relative_time, 'time'] = relative_time_fallback
    df_news['Date'] = pd.to_datetime(df_news['time'])

    # Keep recent rows and the two needed columns, then drop repeated headlines.
    # Duplicates are removed within each file only. The explicit copy gives
    # make_sentiment_features() an independent frame to add columns to, rather
    # than a slice of the raw data.
    df_news = (
        df_news.loc[df_news['Date'] >= filter_start_time, ['Date', 'article_title']]
        .drop_duplicates(subset=['Date', 'article_title'], keep='first')
        .copy()
    )

    if len(df_news) == 0:
        continue

    news_frames.append(make_sentiment_features(df_news))

# `df` stays None when no file contributed any rows, as before.
if news_frames:
    df = pd.concat(news_frames, axis=0)

  0%|          | 0/18 [00:00<?, ?it/s]

(317, 2)
(848, 2)
(1158, 2)
(4743, 2)
(3136, 2)
(1644, 2)
(1549, 2)
(3131, 2)
(2503, 2)
(1835, 2)
(2578, 2)
(5514, 2)
(3314, 2)
(2686, 2)
(3112, 2)
(4534, 2)
(2077, 2)


In [11]:
# Peek at the first combined row to check the generated sentiment columns.
df.head(n=1)

Unnamed: 0,Date,article_title,article_concat_split,vader_sentiment,vader_sentiment_pos,vader_sentiment_neg
0,2005-08-01,'One of them made cuts in my penis. I was in a...,"['One, of, them, made, cuts, in, my, penis., I...",-1,0,1


In [15]:
# Total the positive / negative headline flags for each distinct `Date` value.
df_sentiment = df.groupby('Date', as_index=False)[
    ['vader_sentiment_pos', 'vader_sentiment_neg']
].sum()

# Collapse the two daily counts into a single signed score per date.
df_sentiment['daily_sentiment_score_vader'] = [
    daily_sentiment_score(pos_count, neg_count)
    for pos_count, neg_count in zip(
        df_sentiment['vader_sentiment_pos'],
        df_sentiment['vader_sentiment_neg'],
    )
]

df_sentiment[['Date', 'daily_sentiment_score_vader']].head()

Unnamed: 0,Date,daily_sentiment_score_vader
0,2005-01-02,-1.0
1,2005-01-03,0.0
2,2005-01-05,-1.0
3,2005-01-06,-1.0
4,2005-01-07,-1.0


In [16]:
# Save the per-date VADER score (two columns, no index) as CSV.
export_path = './exported_data/news_sentiment_analysis/us_political_news_sentiment_analysis.csv'

# to_csv() does not create missing folders, so make sure the target
# directory exists; exist_ok=True makes this a no-op on re-runs.
os.makedirs(os.path.dirname(export_path), exist_ok=True)

df_sentiment[['Date', 'daily_sentiment_score_vader']].to_csv(export_path, index=False)