# Master File

**Calculating Correlation for Coca-Cola**

In this file, you will find all the functions from the Google Trends Data & Stock Analysis files as well as our calculations to obtain the correlation coefficient. We intentionally left out any extensive descriptions of the functions from this notebook to keep it uncluttered. If you would like to see our theory behind these functions, more in-depth explanations, and test cases, please visit our other two notebooks.

At the bottom of the file, we will use these functions to calculate the correlation between our stock returns and trends data for n lags.

**Import all necessary packages:**

In [None]:
import datetime as DT
import urllib
import urllib.request
from collections import Counter

import lxml
import numpy as np
import pandas as pd
import pytrends
import requests
import requests_cache
from bs4 import BeautifulSoup
from pytrends.request import TrendReq

# for plotting & manipulating datetimes
from matplotlib import mlab as mlab
from matplotlib import pyplot as plt
from matplotlib.dates import DateFormatter
from matplotlib.dates import date2num
# Use the 'ggplot' style sheet for every figure drawn by this notebook.
plt.style.use('ggplot')

# Install a requests cache named "cache" so repeated HTTP requests can be
# answered from the cache instead of being re-sent.
requests_cache.install_cache("cache")
#%matplotlib inline # only for python 3

# required info
# NOTE(review): credentials are hard-coded (masked here); consider reading them
# from the environment rather than committing real values to the notebook.
google_username = "***@gmail.com"
google_password = "***"

# Login to Google. Only need to run this once, the rest of requests will use the same session.
# `pytrend` is the module-level session object used by get_term_df2 below.
pytrend = TrendReq(google_username, google_password, custom_useragent=None)

**All necessary functions:**

From Google Trends Data file:

In [None]:
# function to quickly wrap a search term in a list
def listit(t):
    """Return a new one-element list whose only item is ``t``.

    ``t`` is stored as-is (it is not copied or flattened), so passing a list
    yields a list of lists.
    """
    return [t]

In [None]:
# function to get each company's data frame separately - includes categories
def get_term_df2(term, cat_num, geo='US', timeframe='2014-01-05 2017-03-11'):
    """Query Google Trends for ``term`` and return the interest-over-time result.

    Uses the module-level ``pytrend`` session, so the login cell must have
    been run first.

    Args:
        term: Keyword list forwarded as ``kw_list``.
        cat_num: Google Trends category id forwarded as ``cat``.
        geo: Region code forwarded as ``geo``. Defaults to ``'US'``, the value
            that used to be hard-coded.
        timeframe: Date range string forwarded as ``timeframe``. Defaults to
            ``'2014-01-05 2017-03-11'``, the value that used to be hard-coded.

    Returns:
        Whatever ``pytrend.interest_over_time()`` returns for the payload
        (presumably a pandas DataFrame -- confirm against the pytrends docs).
    """
    pytrend.build_payload(kw_list=term, cat=cat_num, geo=geo, timeframe=timeframe)
    return pytrend.interest_over_time()

In [None]:
# function to obtain correlation coefficient for n-lags
def calc_corr(n_lag, series=None, other=None):
    """Return the correlation of ``series`` against ``other`` shifted by ``n_lag`` rows.

    For ``n_lag > 0`` row ``k + n_lag`` of ``series`` is paired with row ``k``
    of ``other``, i.e. ``other`` leads ``series`` by ``n_lag`` rows.

    Args:
        n_lag: Number of rows to lag by. ``0`` correlates the two series
            directly (index-aligned, as ``Series.corr`` does).
        series: pandas Series to take the later rows from. Defaults to the
            notebook-level ``result``.
        other: pandas Series to take the earlier rows from. Defaults to the
            notebook-level ``l``.

    Returns:
        The correlation coefficient, or ``None`` (after printing
        ``"Not same length"``) when the two lagged slices differ in length.
    """
    # Fall back to the notebook-level series when none are passed explicitly.
    if series is None:
        series = result
    if other is None:
        other = l

    if n_lag == 0:
        return series.corr(other)

    later = series.iloc[n_lag:]
    earlier = other.iloc[:-n_lag]
    if len(later) != len(earlier):
        print("Not same length")
        return None

    # Series.corr aligns its operands on index labels, which would re-pair
    # each row with its own label and silently undo the lag. Dropping the
    # labels makes the pairing positional: later[k] <-> earlier[k].
    return later.reset_index(drop=True).corr(earlier.reset_index(drop=True))

From Stock Analysis file:

In [None]:
# function to get historical stock data from Yahoo Finance
def getYahooFinanceStockData(stock = '^GSPC', fromMonth = '01', fromDay = '01', fromYear = '1960'):
    """Download the CSV price history for ``stock`` and return dates and adjusted closes.

    Args:
        stock: Ticker symbol placed in the ``s`` query parameter.
        fromMonth: Start month string placed in the ``a`` query parameter.
        fromDay: Start day string placed in the ``b`` query parameter.
        fromYear: Start year string placed in the ``c`` query parameter.

    Returns:
        A tuple ``(closing_dates, adjustedClosingPrices)``: column 0 of the
        CSV as an array of strings, and column 6 converted to floats. Rows
        keep the order in which the server returned them.

    NOTE(review): Yahoo is reported to have retired the ichart CSV endpoint
    in 2017 -- confirm this URL still responds before relying on it.
    NOTE(review): the ichart API is believed to treat ``a`` as a zero-based
    month (0 = January) -- confirm that callers pass the intended value.
    """
    # build the url (going back in time up to the specified start date)
    url = 'http://ichart.finance.yahoo.com/table.csv?s=' + stock + '&a=' + fromMonth + '&b=' + fromDay + '&c=' + fromYear

    # read the full contents as a string; the context manager closes the
    # HTTP response even if reading or decoding fails
    with urllib.request.urlopen(url) as response:
        fulltext = response.read().decode('utf-8')

    # split the text into lines, drop the header line, and split each
    # remaining line on commas into a 2-D array of strings
    rows = [line.split(',') for line in fulltext.splitlines()[1:]]
    data = np.array(rows)

    # note, if you wanted to return reported closing values instead of
    # adjusted closing values, use column data[:,4] instead of data[:,6].
    # Built-in float replaces the np.float alias, which NumPy has removed.
    adjustedClosingPrices = data[:, 6].astype(float)
    closing_dates = data[:, 0]

    return closing_dates, adjustedClosingPrices

In [None]:
# calculates the lognormal return from a specified stock over n days
def calcLogNormalStockReturns(stockPrices = np.empty(0), n_days=1):
    """Return the n-day log returns ``ln(price[i] / price[i + n_days])``.

    Entry ``i`` compares the price at position ``i`` with the price
    ``n_days`` positions later in the sequence, so the last ``n_days``
    prices have no partner and produce no output.

    Args:
        stockPrices: Sequence of prices; values are converted to float, so
            numeric strings are accepted.
        n_days: Offset between the two prices compared; must satisfy
            ``0 <= n_days <= len(stockPrices)``.

    Returns:
        A float array of length ``len(stockPrices) - n_days``.

    Raises:
        ValueError: If ``n_days`` is negative or exceeds the number of prices.

    Note:
        The division is done by NumPy, so a zero price yields ``inf``/``nan``
        (with a RuntimeWarning) rather than raising ZeroDivisionError.
    """
    prices = np.asarray(stockPrices, dtype=float)
    if n_days < 0 or n_days > len(prices):
        raise ValueError(
            "n_days must be between 0 and len(stockPrices) (%d), got %r"
            % (len(prices), n_days)
        )

    count = len(prices) - n_days
    # One vectorized ratio replaces the per-element Python loop.
    return np.log(prices[:count] / prices[n_days:])

In [None]:
# calculates the percent return from a specified stock over n days
def calcPercentStockReturns(stockPrices = np.empty(0), n_days=1):
    """Return the n-day percent returns between prices ``n_days`` positions apart.

    Entry ``i`` is ``(price[i] - price[i + n_days]) / price[i + n_days] * 100``,
    so the last ``n_days`` prices have no partner and produce no output.

    Args:
        stockPrices: Sequence of prices; values are converted to float, so
            numeric strings are accepted.
        n_days: Offset between the two prices compared; must satisfy
            ``0 <= n_days <= len(stockPrices)``.

    Returns:
        A float array of length ``len(stockPrices) - n_days``.

    Raises:
        ValueError: If ``n_days`` is negative or exceeds the number of prices.

    Note:
        The division is done by NumPy, so a zero reference price yields
        ``inf``/``nan`` (with a RuntimeWarning) rather than raising
        ZeroDivisionError.
    """
    prices = np.asarray(stockPrices, dtype=float)
    if n_days < 0 or n_days > len(prices):
        raise ValueError(
            "n_days must be between 0 and len(stockPrices) (%d), got %r"
            % (len(prices), n_days)
        )

    count = len(prices) - n_days
    current = prices[:count]
    reference = prices[n_days:]
    # One vectorized expression replaces the per-element Python loop.
    return (current - reference) / reference * 100

In [None]:
# insert avg returns per week & reorder

In [None]:
# flag the worst pWorst percent of a series of stock returns with a 0/1 mask
def worstReturns(stock_returns = np.empty(0), pWorst = 5):
    """Return an integer mask marking returns at or below the ``pWorst`` percentile.

    Args:
        stock_returns: Sequence of returns to classify.
        pWorst: Percentile (0-100) passed to ``np.percentile`` as the cutoff.

    Returns:
        An int array the same length as ``stock_returns`` holding 1 where the
        return is less than or equal to the cutoff and 0 elsewhere.
    """
    cutoff = np.percentile(stock_returns, pWorst)
    # Element-wise comparison builds the whole mask at once.
    return (np.asarray(stock_returns) <= cutoff).astype(int)

**Calculating Correlation:**

Now we have reached the exciting part where we actually test our code with the Coca-Cola company to calculate the correlation between stock returns and search trends data.

In [None]:
##### CODE TESTING WITH COCA-COLA #####
# Downloads the price history for each ticker in `stocks` and computes its
# n-day percent returns. The results are left in the module-level names
# `closing_dates`, `stock_prices` and `stock_returns` for the cells below.

stocks=['KO'] # can add more stocks if wanted
# offset (in rows of the price series) between the two prices of each return
n_days = 5
# start date, passed as strings to getYahooFinanceStockData
month = '1'
day = '1'
year = '2014'

# NOTE(review): only this header is printed here; no results are printed in this cell.
print('\nTest results for stock PERCENT returns (similar, less likely to be used):')
# Each iteration overwrites the previous one, so with several tickers only the
# last ticker's data is kept.
for stock in stocks:  
    closing_dates, stock_prices = getYahooFinanceStockData(stock, month, day, year)
    stock_returns = calcPercentStockReturns(stock_prices, n_days)

In [None]:
# Build the 0/1 mask of the worst 5% of returns, then plot and print them.

# 1 where the return is at or below the 5th percentile, 0 elsewhere
bottom5Mask = worstReturns(stock_returns, 5)
# Keep only the returns / dates whose mask entry is 1. closing_dates has
# n_days more entries than stock_returns; zip stops at the shorter input, so
# the trailing n_days dates are never paired with a mask entry.
bottom5Returns = np.array([r for r, mask in zip(stock_returns, bottom5Mask) if mask == 1])
bottom5Dates = np.array([d for d, mask in zip(closing_dates, bottom5Mask) if mask == 1])
print('number of zero or one flags for worst 5% of returns:', len(bottom5Mask))
print('number of returns in the bottom 5% (where mask == 1):', len(bottom5Returns))
print('number of dates corresponding to the bottom 5% (where mask == 1):', len(bottom5Dates), '\n')

# Plot it...
# Pair each flagged date (parsed from its 'YYYY-MM-DD' string) with its return.
pltdata = []
for i in range(len(bottom5Returns)):
    pltdata.append((DT.datetime.strptime(bottom5Dates[i], "%Y-%m-%d"), bottom5Returns[i]))
# matplotlib plots dates as floats, so convert the datetimes with date2num
x = [date2num(date) for (date, value) in pltdata]
y = [value for (date, value) in pltdata]
fig, ax = plt.subplots()
# 'ro' draws red circle markers with no connecting line
ax.plot(x,y, 'ro')
ax.xaxis.set_major_formatter(DateFormatter('%d %b %Y')) #('%b %d %Y'))
ax.xaxis_date()  #tell matplotlib to interpret the x-axis values as dates
plt.gcf().autofmt_xdate(rotation=45) #rotate the x labels
plt.title('Bottom 5% of returns')
plt.ylabel('returns')
plt.xlabel('dates prior to the present')
plt.show()

# Display it...
# One line per flagged return: date, tab, return value.
print('Here are the 5% worst returns from the dataset:')
for i in range(len(bottom5Returns)): print(bottom5Dates[i], '\t', bottom5Returns[i])
# Print at most the first 50 mask entries, one per line.
print('\nHere is the start of the binary bitmask that identifyies the locations of those worst returns:')
for i in range(min(len(bottom5Mask), 50)): print(bottom5Mask[i])