In [234]:
import os
import re
from datetime import datetime as dt
from getpass import getpass
from pprint import pprint
from urllib import request

import nltk # natural language toolkit
import pandas as pd
import pyodbc
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from pandas import DataFrame, Series
from yahoofinancials import YahooFinancials

### Importing and arranging Yahoo Financials data

In [14]:
# Ticker symbols of the three companies analysed in this notebook.
tableau_ticker, microsoft_ticker, facebook_ticker = 'DATA', 'MSFT', 'FB'

In [15]:
# One YahooFinancials object per company, created in ticker order.
tableau_financials, microsoft_financials, facebook_financials = (
    YahooFinancials(symbol)
    for symbol in (tableau_ticker, microsoft_ticker, facebook_ticker)
)

In [320]:
# Company name -> ticker symbol; insertion order fixes the processing order below.
tech_stocks = dict(tableau='DATA', microsoft='MSFT', facebook='FB')
tech_tickers = [*tech_stocks.values()]

In [28]:
# Request the quarterly balance-sheet statements for every ticker in one call.
# The result is indexed further down as
# balance_sheet_data_qt['balanceSheetHistoryQuarterly'][ticker].
balance_sheet_data_qt = YahooFinancials(tech_tickers).get_financial_stmts('quarterly', 'balance')

In [154]:
# Flatten the quarterly balance-sheet response into one dict per ticker and
# quarter. The key order (ticker, date, cash, longTermDebt, totalAssets,
# totalLiab) is relied on by the SQL loading cell, so it must stay as is.
bal_record = []
for t in tech_tickers:
    company_data = balance_sheet_data_qt['balanceSheetHistoryQuarterly'][t]

    for quar in company_data:
        # The first key of each entry is used as the report date; its value
        # holds the line items for that quarter.
        quardate = list(quar.keys())[0]
        items = quar[quardate]

        if 'longTermDebt' in items:
            long_term_debt = items['longTermDebt']
        elif 'totalLiab' in items and 'totalCurrentLiabilities' in items:
            # Approximate long-term debt as the non-current part of liabilities.
            long_term_debt = items['totalLiab'] - items['totalCurrentLiabilities']
        else:
            # Not enough data for the approximation: default to 0 like the
            # other fields instead of raising KeyError.
            long_term_debt = 0

        bal_record.append({
            'ticker': t,
            'date': quardate,
            'cash': items.get('cash', 0),
            'longTermDebt': long_term_debt,
            'totalAssets': items.get('totalAssets', 0),
            'totalLiab': items.get('totalLiab', 0),
        })
bal_record

[{'ticker': 'DATA',
  'date': '2018-06-30',
  'cash': 611091000,
  'longTermDebt': 69301000,
  'totalAssets': 1473257000,
  'totalLiab': 525295000},
 {'ticker': 'DATA',
  'date': '2018-03-31',
  'cash': 623994000,
  'longTermDebt': 75598000,
  'totalAssets': 1430656000,
  'totalLiab': 520783000},
 {'ticker': 'DATA',
  'date': '2017-12-31',
  'cash': 627878000,
  'longTermDebt': 82443000,
  'totalAssets': 1398795000,
  'totalLiab': 645172000},
 {'ticker': 'DATA',
  'date': '2017-09-30',
  'cash': 790945000,
  'longTermDebt': 78561000,
  'totalAssets': 1319298000,
  'totalLiab': 566696000},
 {'ticker': 'MSFT',
  'date': '2018-06-30',
  'cash': 11946000000,
  'longTermDebt': 77810000000,
  'totalAssets': 258848000000,
  'totalLiab': 176130000000},
 {'ticker': 'MSFT',
  'date': '2018-03-31',
  'cash': 9221000000,
  'longTermDebt': 79231000000,
  'totalAssets': 245497000000,
  'totalLiab': 166258000000},
 {'ticker': 'MSFT',
  'date': '2017-12-31',
  'cash': 12859000000,
  'longTermDebt': 78

In [94]:
# Field names of a balance-sheet record, read from the second record.
column_headers = [*bal_record[1]]
print(column_headers)

['ticker', 'date', 'cash', 'longTermDebt', 'totalAssets', 'totalLiab']


### Scrape content from Wikipedia pages

In [271]:
# Wikipedia article URL for each company, keyed like tech_stocks.
url_dict = {
    'tableau': 'https://en.wikipedia.org/wiki/Tableau_Software',
    'microsoft': 'https://en.wikipedia.org/wiki/Microsoft',
    'facebook': 'https://en.wikipedia.org/wiki/Facebook',
}

In [283]:
# Browser-like User-Agent header sent with every page request in this notebook.
ua_header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
# Single-page request for the Tableau article. The URL is read from url_dict
# because no `tableau_url` variable is defined in the notebook.
req = request.Request(url_dict['tableau'], headers=ua_header)

In [284]:
# Send the prepared request and keep the open HTTP response in `content`.
content = request.urlopen(req)

In [351]:
def words_cleaner(text, stopwords=None):
    """Split text into lower-cased, letters-only words with stopwords removed.

    Parameters
    ----------
    text : str
        Raw text; it is split on whitespace.
    stopwords : container of str, optional
        Words to drop. When omitted, the notebook-level ``stop`` collection is
        used, which must already be defined in the kernel.

    Returns
    -------
    list of str
        Cleaned words in their original order. Words that are empty after
        stripping non-letter characters are dropped.
    """
    if stopwords is None:
        stopwords = stop  # expected to be defined elsewhere in the notebook

    clean_words = []
    for w in text.split():
        # Remove every character that is not an ASCII letter (digits,
        # punctuation, ...); the rest of the word is kept. The raw string
        # avoids the invalid "\s" escape warning.
        w = re.sub(r'[^a-zA-Z\s]+', '', w.lower())
        # Skip words that became empty as well as stopwords.
        if w and w not in stopwords:
            clean_words.append(w)

    return clean_words


# Quick check of the cleaner: number of words left after cleaning `text`.
# NOTE(review): `text` is not assigned in any cell shown here; confirm where it comes from.
len(words_cleaner(text))

361

In [321]:
# Look up the ticker symbol stored for one company name.
tech_stocks['tableau']

'DATA'

In [352]:
# Build one record per (ticker, word) for the ten most frequent cleaned words
# found in the <p> elements of each company's Wikipedia page.
common_words = []
for key, url in url_dict.items():
    req = request.Request(url, headers=ua_header)
    # Parse inside the with-block so the HTTP response is closed as soon as
    # the page has been read.
    with request.urlopen(req) as content:
        body_soup = BeautifulSoup(content, 'html.parser').findAll('p')

    body_text = ''.join(p.get_text() for p in body_soup)
    words_list = words_cleaner(body_text)

    # Word -> number of occurrences.
    d = {}
    for w in words_list:
        d[w] = d.get(w, 0) + 1

    # The sort is stable, so equally frequent words keep first-seen order.
    top10words = sorted(d.items(), key=lambda x: x[1], reverse=True)[:10]

    for word, count in top10words:
        common_words.append({
            'ticker': tech_stocks[key],
            'word': word,
            'wordcount': count,
        })

common_words

[{'ticker': 'DATA', 'word': 'tableau', 'wordcount': 14},
 {'ticker': 'DATA', 'word': 'million', 'wordcount': 10},
 {'ticker': 'DATA', 'word': 'data', 'wordcount': 9},
 {'ticker': 'DATA', 'word': 'company', 'wordcount': 4},
 {'ticker': 'DATA', 'word': 'products', 'wordcount': 4},
 {'ticker': 'DATA', 'word': 'policy', 'wordcount': 4},
 {'ticker': 'DATA', 'word': 'software', 'wordcount': 3},
 {'ticker': 'DATA', 'word': 'visualization', 'wordcount': 3},
 {'ticker': 'DATA', 'word': 'intelligence', 'wordcount': 3},
 {'ticker': 'DATA', 'word': 'databases', 'wordcount': 3},
 {'ticker': 'MSFT', 'word': 'microsoft', 'wordcount': 164},
 {'ticker': 'MSFT', 'word': 'company', 'wordcount': 50},
 {'ticker': 'MSFT', 'word': 'windows', 'wordcount': 49},
 {'ticker': 'MSFT', 'word': 'new', 'wordcount': 28},
 {'ticker': 'MSFT', 'word': 'billion', 'wordcount': 20},
 {'ticker': 'MSFT', 'word': 'software', 'wordcount': 18},
 {'ticker': 'MSFT', 'word': 'operating', 'wordcount': 18},
 {'ticker': 'MSFT', 'word'

In [336]:
# Map each company name to its ten most frequent cleaned words, as
# (word, count) pairs, from the <p> elements of its Wikipedia page.
top10dict = {}
for key, url in url_dict.items():
    req = request.Request(url, headers=ua_header)
    # Parse inside the with-block so the HTTP response is closed as soon as
    # the page has been read.
    with request.urlopen(req) as content:
        body_soup = BeautifulSoup(content, 'html.parser').findAll('p')

    body_text = ''.join(p.get_text() for p in body_soup)
    words_list = words_cleaner(body_text)

    # Word -> number of occurrences.
    d = {}
    for w in words_list:
        d[w] = d.get(w, 0) + 1

    # The sort is stable, so equally frequent words keep first-seen order.
    top10dict[key] = sorted(d.items(), key=lambda x: x[1], reverse=True)[:10]

pprint(top10dict)

{'facebook': [('facebook', 427),
              ('users', 144),
              ('data', 67),
              ('million', 55),
              ('social', 50),
              ('company', 49),
              ('us', 47),
              ('announced', 46),
              ('billion', 44),
              ("facebook's", 44)],
 'microsoft': [('microsoft', 164),
               ('company', 50),
               ('windows', 49),
               ('new', 28),
               ('billion', 20),
               ('software', 18),
               ('operating', 18),
               ('released', 18),
               ("microsoft's", 17),
               ('us', 16)],
 'tableau': [('tableau', 14),
             ('million', 10),
             ('data', 9),
             ('company', 4),
             ('products', 4),
             ('policy', 4),
             ('software', 3),
             ('visualization', 3),
             ('intelligence', 3),
             ('databases', 3)]}


In [366]:
# Show the (company, top-10 word list) pairs as a flat items view.
top10dict.items()

dict_items([('tableau', [('tableau', 14), ('million', 10), ('data', 9), ('company', 4), ('products', 4), ('policy', 4), ('software', 3), ('visualization', 3), ('intelligence', 3), ('databases', 3)]), ('microsoft', [('microsoft', 164), ('company', 50), ('windows', 49), ('new', 28), ('billion', 20), ('software', 18), ('operating', 18), ('released', 18), ("microsoft's", 17), ('us', 16)]), ('facebook', [('facebook', 427), ('users', 144), ('data', 67), ('million', 55), ('social', 50), ('company', 49), ('us', 47), ('announced', 46), ('billion', 44), ("facebook's", 44)])])

### Establish connection to SQL and load values into table

In [90]:
# Draft insert template.
# NOTE(review): not referenced by any cell shown here; the loading cells build their own statements.
fin_insert = 'insert into financials ? values ?'

In [92]:
# Open the database connection through the SQLADW DSN. The login is read from
# the environment so that no password is stored in the notebook; if
# SQLADW_PWD is not set, the password is requested interactively.
sql_uid = os.environ.get('SQLADW_UID', 'pylogin')
sql_pwd = os.environ.get('SQLADW_PWD') or getpass('Password for {}: '.format(sql_uid))
cnxn = pyodbc.connect('DSN=SQLADW;UID={};PWD={}'.format(sql_uid, sql_pwd))
cur = cnxn.cursor()

In [137]:
# String-valued copy of one balance-sheet record, used for a manual insert
# test. bal_record[1] is the record shown in this cell's saved output.
# Building a new dict means the cell runs on a fresh kernel and leaves the
# numeric values in bal_record unchanged.
testrow = {k: str(v) for k, v in bal_record[1].items()}
testrow

{'ticker': 'DATA',
 'date': '2018-03-31',
 'cash': '623994000',
 'longTermDebt': '75598000',
 'totalAssets': '1430656000',
 'totalLiab': '520783000'}

In [344]:
#Load data into financials table
# Values are bound as query parameters ("?" markers), so quoting and type
# conversion are handled by the driver, not by string formatting. Only the
# column names, which are the notebook-defined record keys, go into the SQL text.
for row in bal_record:
    cols = list(row.keys())
    vals = list(row.values())

    fin_insert_qry = 'Insert Into financials ({}) Values ({})'.format(
        ', '.join(cols), ', '.join('?' * len(cols)))

    cur.execute(fin_insert_qry, vals)
cur.commit()

In [141]:
# Hand-written single-row insert for a manual test of the statement syntax.
# The column and value lists are parenthesised and the string and date values
# are quoted so that the statement is valid SQL.
# NOTE(review): this row (DATA, 2018-03-31) is also part of bal_record, so
# running this after the bulk load inserts it a second time.
fin_insert_qry = "Insert Into financials (ticker, date, cash, longTermDebt, totalAssets, totalLiab) Values ('DATA', '2018-03-31', 623994000, 75598000, 1430656000, 520783000)"

"Insert Into financials (ticker, date, cash, longTermDebt, totalAssets, totalLiab) Values ('DATA', 2018-03-31, 623994000, 75598000, 1430656000, 520783000)"

In [169]:
# Run the statement currently held in fin_insert_qry and commit it.
cur.execute(fin_insert_qry)
cur.commit()

In [354]:
#Load data into fin_top_words table
# Values are bound as query parameters ("?" markers), so quoting is handled
# by the driver, not by string formatting. Only the column names, which are
# the notebook-defined record keys, go into the SQL text.
for row in common_words:
    cols = list(row.keys())
    vals = list(row.values())
    words_insert_qry = 'Insert Into fin_top_words ({}) Values ({})'.format(
        ', '.join(cols), ', '.join('?' * len(cols)))

    cur.execute(words_insert_qry, vals)
cur.commit()

In [355]:
# Display the company -> ticker mapping for reference.
tech_stocks

{'tableau': 'DATA', 'microsoft': 'MSFT', 'facebook': 'FB'}

In [356]:
# Display the company -> Wikipedia URL mapping for reference.
url_dict

{'tableau': 'https://en.wikipedia.org/wiki/Tableau_Software',
 'microsoft': 'https://en.wikipedia.org/wiki/Microsoft',
 'facebook': 'https://en.wikipedia.org/wiki/Facebook'}

In [360]:
# Public website of each company, keyed like tech_stocks and url_dict.
company_websites = dict(
    tableau='https://www.tableau.com/',
    microsoft='https://www.microsoft.com/en-gb/',
    facebook='https://en-gb.facebook.com/',
)

In [361]:
# Display the company -> website mapping for reference.
company_websites

{'tableau': 'https://www.tableau.com/',
 'microsoft': 'https://www.microsoft.com/en-gb/',
 'facebook': 'https://en-gb.facebook.com/'}

In [363]:
# One descriptive record per company, in tech_stocks order. The key order
# (ticker, company_name, website, wiki_page) is used by the loading cell.
company_info = [
    {
        'ticker': ticker,
        'company_name': comp.title(),
        'website': company_websites[comp],
        'wiki_page': url_dict[comp],
    }
    for comp, ticker in tech_stocks.items()
]

company_info

[{'ticker': 'DATA',
  'company_name': 'Tableau',
  'website': 'https://www.tableau.com/',
  'wiki_page': 'https://en.wikipedia.org/wiki/Tableau_Software'},
 {'ticker': 'MSFT',
  'company_name': 'Microsoft',
  'website': 'https://www.microsoft.com/en-gb/',
  'wiki_page': 'https://en.wikipedia.org/wiki/Microsoft'},
 {'ticker': 'FB',
  'company_name': 'Facebook',
  'website': 'https://en-gb.facebook.com/',
  'wiki_page': 'https://en.wikipedia.org/wiki/Facebook'}]

In [365]:
#Load data into fin_company_info table
# Values are bound as query parameters ("?" markers), so quoting is handled
# by the driver, not by string formatting. Only the column names, which are
# the notebook-defined record keys, go into the SQL text.
for row in company_info:
    cols = list(row.keys())
    vals = list(row.values())
    info_insert_qry = 'Insert Into fin_company_info ({}) Values ({})'.format(
        ', '.join(cols), ', '.join('?' * len(cols)))

    cur.execute(info_insert_qry, vals)
cur.commit()