# Company Sentiment Analysis

In [None]:
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()


### Update Vader Lexicon to accomodate financial vocabulary

In [None]:
import csv
import pandas as pd

# Stock Market Lexicon (https://github.com/nunomroliveira/stock_market_lexicon/blob/master/stock_lex.csv)
stock_lex = pd.read_csv('lex_data/stock_lex.csv')
stock_lex['sentiment'] = (stock_lex['Aff_Score'] + stock_lex['Neg_Score'])/2
stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))

stock_lex = {k:v for k,v in stock_lex.items() if len(k.split(' '))==1}

stock_lex_scaled = {}
for k, v in stock_lex.items():
    if v > 0:
        stock_lex_scaled[k] = v / max(stock_lex.values()) * 4
    else:
        stock_lex_scaled[k] = v / min(stock_lex.values()) * -4

# # Loughran McDonald Lexicon ()
positive = []
with open('lex_data/positive.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        positive.append(row[0].strip())
    
negative = []
with open('lex_data/negative.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entry = row[0].strip().split(" ")
        if len(entry) > 1:
            negative.extend(entry)
        else:
            negative.append(entry[0])

final_lex = {}
final_lex.update({word:2.0 for word in positive})
final_lex.update({word:-2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex

### Run Vader to retrieve sentiment score

In [37]:
# Variables that determine data collection criteria
keywords = 'Goldman Sachs'
from_date = '2011-01-01' 
to_date = '2012-12-31'

import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from textblob import TextBlob
from time import sleep
import csv, json

In [19]:
all_date_links = {}
start_date = datetime.strptime(from_date, '%Y-%m-%d')
end_date = start_date + timedelta(days=30)
start_page = 0

## keywords - List of Words
## date - date in question (format: mm/dd/yyyy)
def get_news(keywords, date):
    global all_date_links
    global start_date
    global end_date
    global start_page

    # Check if date is in range of so that we do not need to query google again
    if start_date <= date <= end_date and len(all_date_links) != 0:
        if str(datetime.strftime(date, "%m/%d/%Y")) in all_date_links:
            return all_date_links[str(datetime.strftime(date, "%m/%d/%Y"))]
        else:
            return []
    if date < start_date or date > end_date and len(all_date_links) != 0:
        start_date = end_date
        end_date = start_date + timedelta(days=30)
        start_page = 0

        
    
    # Scrape URL's from Google News results for the provided keywords and date range
    for i in range(10):

        url = "https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=" + keywords + "&tbs=cdr:1,cd_min:" + str(start_date.strftime("%m/%d/%Y")) + ",cd_max:" + str(end_date.strftime("%m/%d/%Y")) + ",sbd:1&num=100&start=" + str(start_page)
        hdrs = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
        
        print(url)
        # Request the above URL
        response = None
        try:
            response = requests.get(url, headers=hdrs)
        except requests.ConnectionError as e:
            with open('log.txt', 'a') as log_file:
                log_file.write(datetime.today().isoformat())
                log_file.write(' Scrape GET error ')
                log_file.write(url)
                log_file.write('\n')
            return []
            
        with open('temp.html', 'w') as th:
            th.write(response.text)

        # Check if response is Captcha, sleep if it is and notify through printing
        if isCaptchaPage(response.text):
            print('Captcha was triggered, Sleeping for 15mins...')
            return []
            # return get_news(keywords, date)
        
        # Extract URL info
        soup = BeautifulSoup(response.text, features="html.parser")
        result_div = soup.find_all('div', attrs = {'class': 'dbsr'})

        # Get all links and add it to the 
        links = []
        for r in result_div:
            # Checks if each element is present, else, raise exception
            # try:
                link = r.find('a', href = True)
                page_date = r.find('span', attrs = {'class': 'eNg7of'})
                # print(page_date.text)
                if link != '':
                    if str(datetime.strptime(str(page_date.text), '%b %d, %Y').strftime('%m/%d/%Y')) not in all_date_links:
                        all_date_links[str(datetime.strptime(str(page_date.text), '%b %d, %Y').strftime('%m/%d/%Y'))] = [link['href']]
                    else:
                        all_date_links[str(datetime.strptime(str(page_date.text), '%b %d, %Y').strftime('%m/%d/%Y'))].append(link['href'])
                    links.append(link['href'])

            # # Next loop if one element is not present
            # except:
            #     print("No")
            #     continue
        
        if len(links) == 0:
            with open('log.txt', 'a') as log_file:
                    log_file.write(str(date))
                    log_file.write(' No news URLS error ')
                    log_file.write(url)
                    log_file.write('\n')
            start_page = 0
            break

        start_page += 100
        sleep(0.5)
        
    # print("List is here")
    # print(all_date_links)

    with open('response.json', 'w') as resp:
        json.dump(all_date_links, resp)

    if str(datetime.strftime(date, "%m/%d/%Y")) in all_date_links:
        return all_date_links[str(datetime.strftime(date, "%m/%d/%Y"))]
    else:
        return []

def isCaptchaPage(text):
    if "CAPTCHA" in text:
        with open('temp.html', 'w') as c:
            c.write(text)
        return True
    else:
        return False


In [23]:
temp = get_news(keywords, datetime.strptime('2010/05/19', "%Y/%m/%d"))
print(temp)

['https://www.nytimes.com/2010/05/19/business/19goldmanquestions.html', 'https://www.theatlantic.com/business/archive/2010/05/finance-jobs-weathering-the-storm-better-than-most/56970/', 'https://www.environmentalleader.com/2010/05/ikea-reduces-co2-emissions-by-5/', 'https://www.streetinsider.com/Corporate+News/China+MediaExpress+Holdings+(CCME)+Moving+to+NASDAQ/5653862.html', 'https://www.bankrate.com/banking/savings/should-you-store-savings-bonds-online/', 'https://money.cnn.com/2010/05/19/news/economy/naked_short_selling_wtf/index.htm', 'https://cityroom.blogs.nytimes.com/2010/05/19/saint-anns-chooses-new-headmaster/']


In [38]:
# Get number of days for the date range
end_date_obj = datetime.strptime(to_date, '%Y-%m-%d')
start_date_obj = datetime.strptime(from_date, '%Y-%m-%d')
num_days = (end_date_obj - start_date_obj).days + 1

date_sentiment = {}

print(num_days)

# Get news articles for every date and calculate sentiment score
for date in (start_date_obj + timedelta(days=n) for n in range(num_days)):
    sleep(0.05)
    
    print (date.date())
    news_urls = get_news(keywords=keywords, date=date)
    
    sentiment_avg = 0
    sentiment_avg2 = 0
    total_len = len(news_urls)
    for url in news_urls:

        # Scrape article content
        link_page = None
        try:
            link_page = requests.get(url)
        except requests.ConnectionError as e:
            with open('log.txt', 'a') as log_file:
                log_file.write(str(date))
                log_file.write(' Page GET Error ')
                log_file.write(url)
            continue

        link_soup = BeautifulSoup(link_page.text)
        sentences = link_soup.findAll("p")
        passage = ""
        for sentence in sentences:
            passage += sentence.text
        
        if keywords not in passage:
            total_len -= 1
            continue

        print(url)
        sentiment_avg += sia.polarity_scores(passage)['compound']
        sentiment_avg2 += TextBlob(passage).sentiment.polarity
    
    sentiment_avg = 0 if total_len == 0 else round(sentiment_avg / total_len, 3)
    sentiment_avg2 = 0 if total_len == 0 else round(sentiment_avg2 / total_len, 3)


    print(sentiment_avg, sentiment_avg2)
    with open('urls.txt', 'a') as url_file:
        url_file.write(str(date.date()) + '\t(' + str(sentiment_avg) + ')\n\n')
        url_file.writelines("%s\n" % u for u in news_urls)
        url_file.write('\n\n')

    with open('sentiment.csv', 'a') as sent_file:
        writer = csv.writer(sent_file)
        writer.writerow([str(date.date()), sentiment_avg, sentiment_avg2])
    
    date_sentiment[str(date.date() - timedelta(days=1))] = (sentiment_avg, sentiment_avg2)

print()
print(date_sentiment)

# with open('sentiment.csv', 'a') as sent_file:
#     writer = csv.writer(sent_file)
#     for key,value in date_sentiment.items():
#         writer.writerow([key,value[0], value[1]])



litics-news/quantitative-easing-the-hidden-government-subsidy-for-banks-246032/
https://www.truthdig.com/articles/the-origin-of-americas-intellectual-vacuum/
https://e360.yale.edu/features/as_tigers_near_extinction_the_world_bank_and_environmental_groups_craft_last-ditch_strategy
0.999 0.09
2010-11-16
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:11/15/2010,cd_max:12/15/2010,sbd:1&num=100&start=0
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:11/15/2010,cd_max:12/15/2010,sbd:1&num=100&start=100
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:11/15/2010,cd_max:12/15/2010,sbd:1&num=100&start=200
https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&q=Goldman Sachs&tbs=cdr:1,cd_min:11/15/2010,cd_max:12/15/2010,sbd:1&num=100&start=300
https://www.reuters.com/article/lipstickbuilding-bankruptcy/owner-of-nys-lipstick-building-files-bankruptcy-idU

## Gather sentiment data from Twitter API

In [None]:
import tweepy
import pandas as pd
import numpy as np
import twitter

import os
import sys

from dotenv import load_dotenv
load_dotenv()

## Twitter Data Collection

In [None]:
## Initialize Twitter API access

""" Initialize Twitter Authentication & set up API client """
try:
    consumer_key = os.environ['TWITTER_CONSUMER_KEY']
    consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
    access_token = os.environ['TWITTER_ACCESS_TOKEN']
    access_secret = os.environ['TWITTER_ACCESS_SECRET']
except KeyError:
    sys.stderr.write("TWITTER_* environment variables not set\n")
    sys.exit(1)
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# auth.set_access_token(access_token, access_secret)

## Setup twitter API Client ##
twitter_api = twitter.Api(consumer_key=consumer_key, consumer_secret=consumer_secret, access_token_key=access_token, access_token_secret=access_secret)


In [None]:
# with open('temp.jsonl', 'w') as f:

# print(tweepy.Cursor(twitter_api.search,q = "google", since = "2014-02-14", until = "2014-02-15", lang = "en").items())
# for tweet in tweepy.Cursor(twitter_api.search,q = "google", since = "2014-02-14", until = "2014-02-15", lang = "en").items():
    # print(tweet)
# for tweet in tweepy.Cursor(twitter_api.search, q='apple' + " -rt", include_retweets=False, since='2018-06-26', until = '2018-06-27').items(10000):
    # print(tweet)
        # f.write(json.dumps(tweet._json)+"\n")