# Import Libraries

In [224]:
import requests
from requests.auth import HTTPDigestAuth
import json
import pandas as pd
from pandas.tseries.offsets import *
from pandas.tseries.holiday import get_calendar
from datetime import datetime

import re

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


import matplotlib.gridspec as gridspec

from nltk.stem.snowball import SnowballStemmer
    
import numpy as np
from time import sleep,clock,time
from io import StringIO

from geopy.distance import vincenty

from facepy import utils
from facepy import GraphAPI


# Brand Data

In [225]:
# Query the Iowa liquor sales dataset for total dollars and liters sold per
# item description (im_desc), summed over every row in the dataset.
url='https://data.iowa.gov/resource/spsw-4jax.json'

appToken='<token>'
appString='&$$app_token=' + appToken
selectQuery='?$select=im_desc,sum(sale_dollars),sum(sale_liters)'
groupQuery='&$group=im_desc'

limitQuery='&$limit=50000'
# `offset` used to be read here without ever being assigned in this cell, so a
# fresh kernel failed with NameError. Define it explicitly (0 = first page) and
# keep it an int instead of rebinding it to the query-string fragment, so the
# cell can be re-run without `offset>0` comparing a str to an int.
offset=0
offsetQuery='&$offset={}'.format(offset) if offset>0 else ''
query=url+selectQuery+groupQuery+limitQuery+appString+offsetQuery

myResponse = requests.get(query,verify=True)
#print (myResponse.status_code)

jData=''
# For successful API call, response code will be 200 (OK)

try:
    if(myResponse.ok):
        # Every '/' in the raw payload is replaced by a space before parsing.
        # NOTE(review): presumably to normalise item descriptions - confirm.
        jData = json.loads(myResponse.content.decode('utf-8').replace('/',' '))
        if len(jData) >= 50000:
            # The old message formatted an undefined `startdate`; the resulting
            # NameError was swallowed by a bare except and the warning was lost.
            print("The response contains {0} properties - there may be additional data.".format(len(jData)))
    else:
        print(myResponse.status_code)
        print(myResponse.headers)

except ValueError:
    # Undecodable bytes or malformed JSON (both are ValueError subclasses).
    # Anything else is a programming error and should surface.
    print(myResponse.status_code)
    print(myResponse.headers)


In [227]:
# Flatten the JSON records returned by the brand query into a DataFrame.
branddf = pd.io.json.json_normalize(jData)

# Upper-case the descriptions, then convert both aggregated totals to numbers.
branddf['im_desc'] = branddf['im_desc'].str.upper()
branddf[['sum_sale_dollars', 'sum_sale_liters']] = (
    branddf[['sum_sale_dollars', 'sum_sale_liters']].apply(pd.to_numeric)
)

# Key words to strip from the item descriptions (packaging / proof qualifiers).
removelist = [
    'MINI', 'PET', 'TRAVELER', 'ORIGINAL', '100', 'PRF',
    'PROOF', '80', 'SQUARE', 'FLAT', ',', 'TRAVELLER',
]

def removewords(string, wordlist):
    """Delete every occurrence of each entry of `wordlist` from `string`.

    Entries are processed in list order and matched as plain substrings (not
    whole words), so '80' also removes the middle of '1800'. The result has
    leading/trailing whitespace removed and inner whitespace runs collapsed
    to single spaces.
    """
    cleaned = string
    for token in wordlist:
        if token not in cleaned:
            continue
        cleaned = cleaned.replace(token, '').strip()
    return " ".join(cleaned.split())

# Derive a cleaned item name from each description.
branddf['item'] = branddf['im_desc'].apply(removewords, args=(removelist,))

# Descriptions that clean to the same item name are summed together, then the
# items are ranked by liters sold (largest first).
branddf = (
    branddf[['item', 'sum_sale_dollars', 'sum_sale_liters']]
    .groupby(['item'])[['sum_sale_liters', 'sum_sale_dollars']]
    .agg(np.nansum)
    .reset_index()
    .sort_values(by='sum_sale_liters', ascending=False)
)
print(branddf.head(20))
branddf.to_csv('iowa_liquor_sales_by_brand.csv', index=False)

                               item  sum_sale_liters  sum_sale_dollars
511                    BLACK VELVET      6007985.500       55825349.57
1883                  HAWKEYE VODKA      4210411.575       26846845.24
836       CAPTAIN MORGAN SPICED RUM      3125386.825       52325290.73
1579             FIVE O'CLOCK VODKA      2521899.075       16294308.93
3658                 SMIRNOFF VODKA      1794435.875       26271375.72
405                    BARTON VODKA      1753339.875       10520115.17
2123  JACK DANIELS OLD #7 BLACK LBL      1749226.850       46366765.69
1556      FIREBALL CINNAMON WHISKEY      1670095.400       27520516.43
2658                MCCORMICK VODKA      1538029.200       10604848.68
3099                 PHILLIPS VODKA      1482907.500        9565728.90
343            BACARDI SUPERIOR RUM      1255692.050       17475283.65
3484    SEAGRAMS 7 CROWN BL WHISKEY      1185335.475       13291794.46
145       ADMIRAL NELSON SPICED RUM      1141308.625       11964590.14
118   

In [132]:
# One timestamp every 7 days, starting 2012-01-01 and not going past 2016-05-31.
daterange = pd.date_range(start=datetime(2012, 1, 1), end=datetime(2016, 5, 31), freq='7D')

# Brand search terms, in the order they are downloaded.
topbrands = [
    'Black Velvet', 'Hawkeye', 'Captain Morgan', 'Five', 'Smirnoff',
    'Jack Daniels', 'Barton', 'Fireball', 'McCormick', 'Phillips', 'Bacardi',
    'Seagrams', 'Admiral Nelson', 'Absolut', 'Crown Royal', 'Paramount',
    'Canadian', 'Jim Beam', 'Five Star', 'Jagermeister',
]
# The Hawkeye brand was processed separately, so it is left out here.
topbrands = [name for name in topbrands if name != 'Hawkeye']

In [133]:
def getItemData(startdate, brand, offset=0):
    """Download one week of sales for a brand, grouped by item, city and county.

    Parameters
    ----------
    startdate : first day of the week; it is added to a ``pd.Timedelta`` and
        formatted with ``.isoformat()``, so a ``pd.Timestamp`` works.
    brand : str
        Substring matched against ``im_desc`` with a SoQL ``like`` filter.
    offset : int, default 0
        When greater than zero it is sent as the ``$offset`` paging parameter.

    Returns
    -------
    pandas.DataFrame
        The normalised JSON records plus a ``weekStarting`` column set to
        ``startdate``. When the request fails, the body cannot be parsed, or no
        rows come back, the frame has no rows (only ``weekStarting``), so the
        caller can concatenate it with the other weeks.
    """
    # The window covers startdate and the six following days.
    enddate=startdate + pd.Timedelta('6 days')
    url='https://data.iowa.gov/resource/spsw-4jax.json'

    appToken='<app token>'
    appString='&$$app_token=' + appToken
    selectQuery='?$select=im_desc,store_location_city,sum(sale_dollars),sum(sale_liters),county'
    groupQuery='&$group=im_desc,store_location_city,county'
    whereQuery="&$where=date between '" + startdate.isoformat() + "' and '" + enddate.isoformat() + "'"

    # '%25' is a URL-encoded '%', the wildcard on both sides of the brand name.
    whereBrand=" AND im_desc like '%25" + brand + "%25'"

    limitQuery='&$limit=50000'
    # Build the paging fragment under its own name; the `offset` argument is
    # no longer rebound from an int to a string.
    offsetQuery='&$offset={}'.format(offset) if offset>0 else ''
    query=url+selectQuery+groupQuery+whereQuery+whereBrand+limitQuery+appString+offsetQuery

    myResponse = requests.get(query,verify=True)
    #print (myResponse.status_code)

    jData=[]
    # For successful API call, response code will be 200 (OK)

    try:
        if(myResponse.ok):
            # Every '/' in the raw payload is replaced by a space before parsing.
            jData = json.loads(myResponse.content.decode('utf-8').replace('/',' '))
            if len(jData) >= 50000:
                print("The response {0} contains {1} properties - there may be additional data.".format(startdate,len(jData)))
        else:
            print(myResponse.status_code)
            print(myResponse.headers)

    except ValueError:
        # Undecodable bytes or malformed JSON (both are ValueError subclasses);
        # other exceptions are left to propagate instead of being hidden.
        print(myResponse.status_code)
        print(myResponse.headers)

    # Previously a failed request fed the placeholder '' to json_normalize.
    # With nothing to normalise, return an empty frame instead.
    if jData:
        df = pd.io.json.json_normalize(jData)
    else:
        df = pd.DataFrame()
    df['weekStarting'] = startdate
    return df

def removewords(string, wordlist):
    """Delete every occurrence of each entry of `wordlist` from `string`.

    Entries are processed in list order and matched as plain substrings (not
    whole words), so '80' also removes the middle of '1800'. The result has
    leading/trailing whitespace removed and inner whitespace runs collapsed
    to single spaces.
    """
    cleaned = string
    for token in wordlist:
        if token not in cleaned:
            continue
        cleaned = cleaned.replace(token, '').strip()
    return " ".join(cleaned.split())

In [134]:
# Key words to strip from the item descriptions.
removelist=['MINI','PET','TRAVELER','ORIGINAL','100','PRF','PROOF','80','SQUARE','FLAT',',','TRAVELLER']

# For every brand: download each week, clean the columns, sum duplicate
# item/city/county rows per week and write one CSV per brand.
for brand in topbrands:
    print(brand)
    #First, gather all the data for this specific brand:
    weekly_frames = []
    for date in daterange:
        print(date)
        weekly_frames.append(getItemData(date,brand))
    # Concatenate once instead of calling DataFrame.append per week: append
    # copies the whole frame every time and was removed in pandas 2.0.
    branddf = pd.concat(weekly_frames)

    # Alias of the raw per-brand frame; not read again in this cell.
    # NOTE(review): kept in case a later cell uses it - confirm.
    ildfsj = branddf

    #clean up data types
    branddf['weekStarting'] = pd.to_datetime(branddf['weekStarting'])

    branddf['sum_sale_dollars']= pd.to_numeric(branddf['sum_sale_dollars'])
    branddf['sum_sale_liters']= pd.to_numeric(branddf['sum_sale_liters'])
    branddf['store_location_city']=branddf['store_location_city'].str.upper()
    branddf['im_desc']=branddf['im_desc'].str.upper()

    #remove unneeded key words from the item descriptions

    branddf['item'] = branddf['im_desc'].apply(lambda x: removewords(x,removelist))
    branddf['county']=branddf['county'].str.upper()

    branddf = branddf.sort_values('weekStarting')

    #Fix any city or item description duplicates
    ildf = branddf[['weekStarting','item','store_location_city','county','sum_sale_dollars','sum_sale_liters']].groupby(['weekStarting','item','store_location_city','county'])[['sum_sale_liters','sum_sale_dollars']].agg(np.nansum)
    ildfc = ildf.reset_index()

    # The path used to be written 'brand_data\' + ..., where the backslash
    # escapes the closing quote and the cell fails with a SyntaxError. A
    # forward slash is a valid separator on Windows as well as POSIX.
    ildfc.to_csv('brand_data/' + brand + '_iowa_liquor_sales_brand.csv',index=False)

Black Velvet
2012-01-01 00:00:00
2012-01-08 00:00:00
2012-01-15 00:00:00
2012-01-22 00:00:00
2012-01-29 00:00:00
2012-02-05 00:00:00
2012-02-12 00:00:00
2012-02-19 00:00:00
2012-02-26 00:00:00
2012-03-04 00:00:00
2012-03-11 00:00:00
2012-03-18 00:00:00
2012-03-25 00:00:00
2012-04-01 00:00:00
2012-04-08 00:00:00
2012-04-15 00:00:00
2012-04-22 00:00:00
2012-04-29 00:00:00
2012-05-06 00:00:00
2012-05-13 00:00:00
2012-05-20 00:00:00
2012-05-27 00:00:00
2012-06-03 00:00:00
2012-06-10 00:00:00
2012-06-17 00:00:00
2012-06-24 00:00:00
2012-07-01 00:00:00
2012-07-08 00:00:00
2012-07-15 00:00:00
2012-07-22 00:00:00
2012-07-29 00:00:00
2012-08-05 00:00:00
2012-08-12 00:00:00
2012-08-19 00:00:00
2012-08-26 00:00:00
2012-09-02 00:00:00
2012-09-09 00:00:00
2012-09-16 00:00:00
2012-09-23 00:00:00
2012-09-30 00:00:00
2012-10-07 00:00:00
2012-10-14 00:00:00
2012-10-21 00:00:00
2012-10-28 00:00:00
2012-11-04 00:00:00
2012-11-11 00:00:00
2012-11-18 00:00:00
2012-11-25 00:00:00
2012-12-02 00:00:00
2012-12

# Twitter Data

In [222]:
# Load the table of brands and their Twitter handles from disk, then display it.
fbavailable = pd.read_csv(filepath_or_buffer='brand_twitter_id.csv')
fbavailable

Unnamed: 0,Brand,TwitterID
0,Captain Morgan,captainmorganus
1,Smirnoff,smirnoffus
2,Jack Daniels,jackdaniels_us
3,Fireball,fireballwhisky
4,Bacardi,bacardi
5,Seagrams,seagramsginusa
6,Admiral Nelson,admiralnelsons
7,Absolut,absolutvodka
8,Crown Royal,crownroyal
9,Jagermeister,jagermeisterusa


In [204]:
def getfirstday(date):
    """Return the Sunday that starts the week containing `date`.

    The weekly liquor data is keyed by Sunday dates, so tweets are mapped onto
    the same key. `date` may be anything `pd.Timestamp` accepts (Timestamp,
    datetime, ISO string); it is converted once up front, so string inputs now
    work for every weekday rather than only when the date is already a Sunday.
    Any time-of-day component is preserved.

    Returns
    -------
    pandas.Timestamp
    """
    stamp = pd.Timestamp(date)
    # pandas numbers Monday=0 ... Sunday=6, so Sunday is 0 days back, Monday 1,
    # ... Saturday 6.
    days_since_sunday = (stamp.dayofweek + 1) % 7
    return stamp - pd.Timedelta(days=days_since_sunday)

In [214]:
import twitter

# Twitter API credentials (placeholders - fill in before running).
twitterConsumerKey = '<ConsumerKey>'
twitterConsumerSecret = '<ConsumerSecret>'
twitterAccessToken = '<AccessToken>'
twitterAccessTokenSecret = '<TokenSecret>'

# Paging note: take the lowest tweet ID retrieved so far and pass it as max_id
# on the next request (that tweet is then returned again), or pass the lowest
# ID minus 1 (64-bit integer subtraction) to avoid the duplicate.

api = twitter.Api(
    consumer_key=twitterConsumerKey,
    consumer_secret=twitterConsumerSecret,
    access_token_key=twitterAccessToken,
    access_token_secret=twitterAccessTokenSecret,
)




In [221]:
def _tweets_to_frame(statuses):
    """Turn a list of status objects into a frame with tweet/date/weekStarting columns, sorted by date."""
    rows = []
    for s in statuses:
        #strip urls - they don't help us at all...
        # NOTE: the pattern removes everything from 'http:'/'https:' to the end
        # of that line, i.e. any text following the URL goes too.
        text = re.sub(r'https?:\/*.*', '', s.text, flags=re.MULTILINE)
        text = re.sub('\n', ' ', text, flags=re.MULTILINE)
        # One row per tweet. The previous dict keyed by created_at kept only
        # one of several tweets posted within the same second.
        rows.append((s.created_at, text.strip()))
    frame = pd.DataFrame(rows, columns=['created_at', 'tweet'])
    frame['date'] = pd.DatetimeIndex(pd.to_datetime(frame['created_at'])).normalize()
    frame['weekStarting'] = frame['date'].apply(getfirstday)
    return frame[['tweet', 'date', 'weekStarting']].sort_values(by='date')


def getTwitterFeed(liquor):
    """Download a brand's tweets, paging backwards through its timeline.

    Parameters
    ----------
    liquor : mapping / Series
        Only ``liquor['TwitterID']`` (the screen name) is read.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet`` (text with URLs stripped and newlines replaced by
        spaces), ``date`` (creation timestamp normalised to midnight) and
        ``weekStarting`` (``getfirstday(date)``). An account with no tweets
        yields an empty frame with the same columns instead of raising.

    Paging stops when the oldest date collected is no later than 2012-01-03,
    when 3200 rows have been collected, or when the API returns no tweets.
    NOTE(review): 3200 is presumably the timeline depth the API exposes and
    2012-01-03 the start of the sales data - confirm.
    """
    feedID = liquor['TwitterID']

    statuses = api.GetUserTimeline(screen_name=feedID,count=200,trim_user=True)
    tweetdf = _tweets_to_frame(statuses)
    if not statuses:
        # Nothing to page through; np.min([]) used to raise here.
        return tweetdf
    # Ask for tweets strictly older than the oldest one already seen.
    next_max_id = min(s.id for s in statuses) - 1

    mindate = pd.to_datetime('2012-01-03')
    currentdate = tweetdf['date'].min()
    while (currentdate > mindate and len(tweetdf) < 3200):
        statuses = api.GetUserTimeline(screen_name=feedID,count=200,trim_user=True,max_id=next_max_id)
        if not statuses:
            break
        next_max_id = min(s.id for s in statuses) - 1
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        tweetdf = pd.concat([tweetdf, _tweets_to_frame(statuses)])
        currentdate = tweetdf['date'].min()
        # Short pause between requests.
        sleep(0.2)
        print('{0} total entries at {1}'.format(len(tweetdf),currentdate))
    return tweetdf

In [223]:
# Download the timeline of every brand listed in fbavailable and write it to
# its own CSV file.
for index,liquor in fbavailable.iterrows():
    brand = liquor['Brand']
    print(brand)

    brandtweetdf = getTwitterFeed(liquor)
    # The path used to be written 'brand_data\' + ..., where the backslash
    # escapes the closing quote and the cell fails with a SyntaxError. A
    # forward slash is a valid separator on Windows as well as POSIX.
    brandtweetdf.to_csv('brand_data/' + brand + '_brand_tweets.csv',index=False)

Captain Morgan
400 total entries at 2015-03-15 00:00:00
599 total entries at 2014-09-10 00:00:00
798 total entries at 2014-07-02 00:00:00
997 total entries at 2014-05-03 00:00:00
1197 total entries at 2014-03-15 00:00:00
1393 total entries at 2013-12-30 00:00:00
1576 total entries at 2013-08-09 00:00:00
Smirnoff
400 total entries at 2014-06-13 00:00:00
600 total entries at 2013-12-20 00:00:00
800 total entries at 2013-07-23 00:00:00
1000 total entries at 2013-04-23 00:00:00
1200 total entries at 2013-01-14 00:00:00
1400 total entries at 2012-11-18 00:00:00
1560 total entries at 2012-01-03 00:00:00
Jack Daniels
400 total entries at 2014-11-23 00:00:00
599 total entries at 2014-06-12 00:00:00
799 total entries at 2014-03-22 00:00:00
999 total entries at 2013-11-14 00:00:00
1198 total entries at 2013-05-19 00:00:00
1397 total entries at 2013-04-30 00:00:00
1597 total entries at 2013-04-15 00:00:00
1793 total entries at 2013-04-07 00:00:00
1991 total entries at 2011-08-08 00:00:00
Fireball