# Company Sentiment Analysis

In [None]:
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# SentimentIntensityAnalyzer loads the 'vader_lexicon' NLTK resource when it is
# constructed and raises LookupError if that resource has not been downloaded yet.
# Fetch it on demand so the notebook also runs on a fresh environment.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()


### Update VADER Lexicon to accommodate financial vocabulary

In [None]:
import csv
import pandas as pd

# Stock Market Lexicon (https://github.com/nunomroliveira/stock_market_lexicon/blob/master/stock_lex.csv)
# Each item's sentiment is the mean of its Aff_Score and Neg_Score columns.
stock_lex_df = pd.read_csv('lex_data/stock_lex.csv')
stock_lex_df['sentiment'] = (stock_lex_df['Aff_Score'] + stock_lex_df['Neg_Score']) / 2

# Keep single-word items only; entries containing a space are dropped.
stock_lex = {
    item: score
    for item, score in zip(stock_lex_df.Item, stock_lex_df.sentiment)
    if len(item.split(' ')) == 1
}

# Rescale so the largest positive score maps to 4 and the most negative to -4.
# The extremes are computed once up front instead of on every loop iteration.
max_score = max(stock_lex.values(), default=0)
min_score = min(stock_lex.values(), default=0)

stock_lex_scaled = {}
for item, score in stock_lex.items():
    if score > 0:
        stock_lex_scaled[item] = score / max_score * 4
    elif min_score < 0:
        stock_lex_scaled[item] = score / min_score * -4
    else:
        # No negative score exists, so dividing by min_score would divide by zero;
        # the only non-positive score possible here is 0.
        stock_lex_scaled[item] = 0.0

# Loughran McDonald Lexicon: only the first column of each row is used.
positive = []
with open('lex_data/positive.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        if row:  # csv.reader yields [] for blank lines
            positive.append(row[0].strip())

negative = []
with open('lex_data/negative.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        if row:
            # Entries made of several space-separated words are added word by word.
            negative.extend(row[0].strip().split(" "))

# Merge order matters: each update overwrites earlier keys, so VADER's own entries
# take precedence over the scaled stock lexicon, which in turn takes precedence
# over the flat +/-2.0 word lists.
final_lex = {}
final_lex.update({word: 2.0 for word in positive})
final_lex.update({word: -2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex

### Run Vader to retrieve sentiment score

In [43]:
# Variables that determine data collection criteria
# Search term; passed as the `keywords` argument of get_news() further down.
keywords = 'oil'
# Presumably the bounds of the collection window, written as M/D/YYYY strings.
# NOTE(review): neither bound is referenced in the visible cells — confirm they are still needed.
from_date = '3/2/2020' 
to_date = '3/30/2020'

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import pprint

# Compound sentiment score of every scraped article, grouped by the date string
# shown next to the article in the search results.
date_sentiments = {}

# NOTE(review): the search term 'facebook' is hard-coded in the URL below and does not
# follow the `keywords` variable — confirm which query is intended.
for i in range(1, 11):
    page = urlopen('https://www.businesstimes.com.sg/search/facebook?page='+str(i)).read()
    soup = BeautifulSoup(page, features="html.parser")
    posts = soup.find_all("div", {"class": "media-body"})
    for post in posts:
        time.sleep(1)  # pause before each article request
        url = post.a['href']
        date = post.time.text
        print(date, url)
        try:
            link_page = urlopen(url).read()
        except Exception:
            # Retry once with the last two characters of the URL removed
            # (presumably some result links carry a broken suffix — TODO confirm).
            # Unlike a bare `except:`, this lets KeyboardInterrupt stop the scrape.
            url = url[:-2]
            link_page = urlopen(url).read()
        # Name the parser explicitly, as for the search page above, so the result
        # does not depend on which optional parsers happen to be installed.
        link_soup = BeautifulSoup(link_page, features="html.parser")
        sentences = link_soup.find_all("p")
        passage = "".join(sentence.text for sentence in sentences)
        sentiment = sia.polarity_scores(passage)['compound']
        date_sentiments.setdefault(date, []).append(sentiment)

# Mean score per date (rounded to 3 decimals), keyed by the day AFTER the parsed
# article date.
date_sentiment = {}

for k, v in date_sentiments.items():
    date_sentiment[datetime.strptime(k, '%d %b %Y').date() + timedelta(days=1)] = round(sum(v)/float(len(v)), 3)

earliest_date = min(date_sentiment.keys())

print(date_sentiment)

In [44]:
import requests
from lxml import html
from urllib.request import urlopen
import urllib.request
from bs4 import BeautifulSoup
# Browser-like HTTP request headers.
# NOTE(review): `hdr` is not referenced in the visible cells (get_news() builds its own
# `hdrs` dict), so this may be a leftover — confirm before removing or relying on it.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

def get_news(keywords, date):
    """Return the article links from a Google News search restricted to a single day.

    Parameters
    ----------
    keywords : str
        Search query. It is inserted into the URL as given (no URL-encoding is
        applied here).
    date : str
        Day to search; used as both ``cd_min`` and ``cd_max`` of the date range.
        The caller in this notebook passes M/D/YYYY, e.g. ``'3/31/2020'``.

    Returns
    -------
    list of str
        The ``href`` of the first link inside each ``div.dbsr`` result block.
        Empty when the response contains no such blocks.

    Side effects
    ------------
    Prints the search URL and overwrites ``MyFile.html`` in the working directory
    with the raw response text.
    """
    url = "https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&num=20&q=" + keywords + "&tbs=cdr:1,cd_min:" + date + ",cd_max:" + date + ",sbd:1"
    print(url)
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    hdrs = {'User-Agent': user_agent}
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=hdrs, timeout=30)

    # Keep a copy of the raw page for inspection. The context manager closes the file
    # even if the write fails, and UTF-8 avoids encode errors on platforms whose
    # default encoding cannot represent every character in the page.
    with open('MyFile.html', 'w+', encoding='utf-8') as dump_file:
        dump_file.write(response.text)

    soup = BeautifulSoup(response.text, features="html.parser")

    links = []
    for result in soup.find_all('div', attrs={'class': 'dbsr'}):
        # find() returns None when the block has no <a href=...>; skip those blocks.
        link = result.find('a', href=True)
        if link is not None:
            links.append(link['href'])
    return links

# Day whose news is scored; used for the search and as the header written to urls.txt.
news_date = '3/31/2020'
news_urls = get_news(keywords=keywords, date=news_date)

# Append this run's URLs to urls.txt under a date header.
with open('urls.txt', 'a') as url_file:
    url_file.write(news_date + '\n\n')
    url_file.writelines("%s\n" % u for u in news_urls)
    url_file.write('\n\n')

sentiment_sum = 0

for url in news_urls:
    print(url)
    link_page = requests.get(url)
    # Name the parser explicitly so the extracted text does not depend on which
    # optional parsers happen to be installed.
    link_soup = BeautifulSoup(link_page.text, features="html.parser")
    sentences = link_soup.find_all("p")
    passage = "".join(sentence.text for sentence in sentences)
    senti = sia.polarity_scores(passage)['compound']
    print(senti)
    sentiment_sum += senti

print(sentiment_sum)
if news_urls:
    # From here on `sentiment_sum` holds the MEAN compound score per article.
    sentiment_sum /= len(news_urls)
    print(sentiment_sum)
else:
    # get_news() returned nothing; dividing would raise ZeroDivisionError.
    print('No news URLs found for ' + news_date + '; average sentiment is undefined')

https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&num=20&q=oil&tbs=cdr:1,cd_min:3/31/2020,cd_max:3/31/2020,sbd:1
https://finance.yahoo.com/news/oil-sell-off-pauses-market-221740570.html
-0.9952
https://www.nytimes.com/2020/03/31/business/energy-environment/crude-oil-companies-coronavirus.html
-0.9564
https://www.washingtonpost.com/business/2020/03/31/cheap-oil-doesnt-mean-much-when-no-ones-going-anywhere-coronavirus-will-reshape-oil-industry/
0.958
http://www.marketwatch.com/story/crude-prices-rebound-after-tapping-18-year-low-2020-03-31
-0.9524
https://finance.yahoo.com/news/putin-trump-agree-current-oil-123751435.html
-0.9789
https://www.reuters.com/article/us-oil-opec-trump-putin/putin-and-trump-agree-oil-market-situation-suits-neither-kremlin-idUSKBN21I1PS
-0.3204
https://apnews.com/7c40ef427c3ad91708b602b8c8be615d
-0.128
https://www.aljazeera.com/ajimpact/oil-prices-crashed-keystone-xl-moving-200331153048908.html
-0.8791
https://www.investing.com/news/commodities-news/w