# Company Sentiment Analysis

In [None]:
import warnings
# Silence every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Build the shared VADER analyzer used by every scoring cell below.
# The constructor loads the 'vader_lexicon' resource and raises LookupError when
# it has not been downloaded yet, so fetch it on demand instead of relying on a
# manually un-commented download line. This keeps "Restart & Run All" working on
# a fresh environment.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()


### Update Vader Lexicon to accommodate financial vocabulary

In [None]:
import csv
import pandas as pd

# Stock Market Lexicon (https://github.com/nunomroliveira/stock_market_lexicon/blob/master/stock_lex.csv)
stock_lex_df = pd.read_csv('lex_data/stock_lex.csv')
# Per-term sentiment is the mean of the two score columns.
stock_lex_df['sentiment'] = (stock_lex_df['Aff_Score'] + stock_lex_df['Neg_Score'])/2
stock_lex = dict(zip(stock_lex_df.Item, stock_lex_df.sentiment))

# Keep single-token entries only (anything containing a space is dropped).
stock_lex = {k:v for k,v in stock_lex.items() if len(k.split(' '))==1}

# Rescale so the most positive term maps to +4 and the most negative to -4
# (presumably to match VADER's valence scale -- confirm).
# The extremes are computed once up front; evaluating max()/min() inside the
# loop made this step quadratic in the lexicon size.
stock_lex_scaled = {}
max_score = max(stock_lex.values(), default=0)
min_score = min(stock_lex.values(), default=0)
for k, v in stock_lex.items():
    if v > 0:
        stock_lex_scaled[k] = v / max_score * 4
    else:
        # min_score is 0 only when no term is negative, in which case v is 0
        # too; return 0.0 directly rather than dividing 0 by 0.
        stock_lex_scaled[k] = v / min_score * -4 if min_score else 0.0

# Loughran-McDonald lexicon: the first column of each row holds the term.
positive = []
with open('lex_data/positive.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        if row:  # blank lines come back as empty rows; row[0] would raise
            positive.append(row[0].strip())

negative = []
with open('lex_data/negative.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        if row:
            # Multi-word entries are broken into their individual words.
            # split(" ") always yields at least one element, so extend()
            # covers the single-word case as well.
            negative.extend(row[0].strip().split(" "))

# Merge order sets precedence: each update() overrides the ones before it, so
# VADER's own entries win over the stock lexicon, which wins over the fixed
# +/-2.0 Loughran-McDonald scores.
final_lex = {}
final_lex.update({word:2.0 for word in positive})
final_lex.update({word:-2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex

### Run Vader to retrieve sentiment score

In [106]:
# Data-collection criteria: the search phrase, then the first and last day of
# the window to scrape (both days are included), as 'YYYY-MM-DD' strings.
keywords = 'Goldman Sachs'
from_date, to_date = '2010-01-01', '2010-02-01'

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import pprint

# Compound sentiment scores grouped by the date string shown on each search hit.
date_sentiments = {}

# NOTE(review): the search term 'facebook' is hard-coded in the URL and the
# `keywords` variable is not used here -- confirm this is intended.
for i in range(1,3):
    page = urlopen('https://www.businesstimes.com.sg/search/facebook?page='+str(i)).read()
    soup = BeautifulSoup(page, features="html.parser")
    posts = soup.find_all("div", {"class": "media-body"})
    for post in posts:
        anchor, stamp = post.a, post.time
        # A card without a link or a timestamp cannot be scored or dated;
        # skip it instead of crashing the whole scrape.
        if anchor is None or stamp is None or not anchor.get('href'):
            continue
        time.sleep(1)  # pause between article requests
        url = anchor['href']
        date = stamp.text
        print(date, url)
        try:
            link_page = urlopen(url).read()
        except Exception:
            # Retry once with the last two characters of the URL removed.
            # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
            url = url[:-2]
            link_page = urlopen(url).read()
        # Same explicit parser as the search page, so results do not depend on
        # which optional parsers happen to be installed.
        link_soup = BeautifulSoup(link_page, features="html.parser")
        passage = "".join(paragraph.text for paragraph in link_soup.find_all("p"))
        sentiment = sia.polarity_scores(passage)['compound']
        date_sentiments.setdefault(date, []).append(sentiment)
    print(date_sentiments)

# Average the scores of each day, rounded to 3 decimals.
date_sentiment = {}

for k,v in date_sentiments.items():
    # Keys are shifted one day forward (presumably so a day's news lines up
    # with the next day's market data -- confirm).
    date_sentiment[datetime.strptime(k, '%d %b %Y').date() + timedelta(days=1)] = round(sum(v)/float(len(v)),3)

earliest_date = min(date_sentiment.keys())

print(date_sentiment)

In [111]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import csv

def get_news(keywords, date, timeout=10):
    """Scrape article URLs from Google News results for one search and one day.

    Args:
        keywords: Search phrase. It is URL-encoded before being placed in the
            query, so characters such as '&' or '#' no longer corrupt it.
        date: Day to search, used for both the ``cd_min`` and ``cd_max``
            filters (the caller in this notebook passes ``MM/DD/YYYY``).
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        A list with the ``href`` of the first link inside every result card
        (``div`` elements with class ``dbsr``). The list is empty when the
        request fails -- the failure is appended to ``log.txt`` -- or when no
        result cards are found.
    """
    # Local import so the function works regardless of which cells were run.
    from urllib.parse import quote_plus

    url = ("https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=" + quote_plus(keywords)
           + "&tbs=cdr:1,cd_min:" + date + ",cd_max:" + date + ",sbd:1")
    hdrs = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

    print(url)

    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(url, headers=hdrs, timeout=timeout)
    except requests.RequestException:
        # RequestException covers connection errors as well as timeouts.
        # One tab-separated line per failure keeps the log readable.
        with open('log.txt', 'a') as log_file:
            log_file.write(datetime.today().isoformat() + '\t' + url + '\n')
        return []

    soup = BeautifulSoup(response.text, features="html.parser")

    links = []
    for result in soup.find_all('div', attrs = {'class': 'dbsr'}):
        # find(..., href=True) returns None when the card has no link; the
        # former `link != ''` check was always true and relied on a bare
        # except to hide the resulting error.
        link = result.find('a', href = True)
        if link is not None:
            links.append(link['href'])

    return links

# Get number of days for the date range (both end points are included)
end_date_obj = datetime.strptime(to_date, '%Y-%m-%d')
start_date_obj = datetime.strptime(from_date, '%Y-%m-%d')
num_days = (end_date_obj - start_date_obj).days + 1

# Seconds to wait for a single article download before skipping it.
ARTICLE_TIMEOUT = 10

date_sentiment = {}

# Get news articles for every date and calculate sentiment score
for date in (start_date_obj + timedelta(days=n) for n in range(num_days)):
    sleep(0.05)
    news_urls = get_news(keywords=keywords, date=date.strftime("%m/%d/%Y"))

    # Compound scores of the articles that were actually downloaded.
    scores = []
    for url in news_urls:

        # Scrape article content
        try:
            link_page = requests.get(url, timeout=ARTICLE_TIMEOUT)
        except requests.RequestException:
            # Covers connection errors, timeouts and malformed URLs; log one
            # tab-separated line per failure and move on to the next article.
            with open('log.txt', 'a') as log_file:
                log_file.write(datetime.today().isoformat() + '\t' + url + '\n')
            continue

        # Explicit parser so results do not depend on installed optional parsers.
        link_soup = BeautifulSoup(link_page.text, features="html.parser")
        passage = "".join(paragraph.text for paragraph in link_soup.find_all("p"))

        scores.append(sia.polarity_scores(passage)['compound'])

    # Average over the articles that were scored. Dividing by len(news_urls)
    # counted failed downloads as well and pulled the average toward 0.
    # A day with no scored article is recorded as 0.
    sentiment_avg = round(sum(scores) / len(scores), 3) if scores else 0


    print(date.date(), sentiment_avg)
    with open('urls.txt', 'a') as url_file:
        url_file.write(str(date.date()) + '\t(' + str(sentiment_avg) + ')\n\n')
        url_file.writelines("%s\n" % u for u in news_urls)
        url_file.write('\n\n')

    # NOTE(review): the score is stored under the PREVIOUS day, while the
    # Business Times cell shifts its dates one day FORWARD -- confirm which
    # alignment is intended.
    date_sentiment[str(date.date() - timedelta(days=1))] = sentiment_avg

print()
print(date_sentiment)

# newline='' is what the csv module expects for files it writes; without it,
# extra blank rows appear on platforms that translate line endings.
# The file is opened in append mode, so re-running the cell adds duplicate rows.
with open('sentiment.csv', 'a', newline='') as sent_file:
    writer = csv.writer(sent_file)
    for key,value in date_sentiment.items():
        writer.writerow([key,value])



https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:01/01/2010,cd_max:01/01/2010,sbd:1
2010-01-01 0.743
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:01/02/2010,cd_max:01/02/2010,sbd:1
2010-01-02 0
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:01/03/2010,cd_max:01/03/2010,sbd:1
2010-01-03 0.999
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:01/04/2010,cd_max:01/04/2010,sbd:1
2010-01-04 0.992
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:01/05/2010,cd_max:01/05/2010,sbd:1
2010-01-05 0.733
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:01/06/2010,cd_max:01/06/2010,sbd:1
2010-01-06 0.983
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:01/07/2010,cd_max:01/07/2010,sbd:1