In [1]:
import bs4 
import re 
import os 
import sys
import json
import requests
import datetime
import urllib
from dateutil import parser
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np 
import sqlite3
# Use the fully-qualified option keys. Short names are resolved by pattern
# matching, and pandas raises OptionError when a pattern matches more than one
# registered option (e.g. 'max_columns' also matches a Styler option on newer
# pandas releases).
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', 100)

## Functions

### Fun fact: DealNews uses an API

In [2]:
def get_dealnews_feed(link, timeout=30):
    """Request a DealNews API URL and return the decoded JSON body.

    Parameters
    ----------
    link : str
        Full URL to request.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the notebook indefinitely.

    Returns
    -------
    object
        Whatever ``json.loads`` produces from the response body.

    Raises
    ------
    AssertionError
        If the response status code is not 200 (same exception type the
        original ``assert`` raised, but not stripped under ``python -O``).
    """
    # NOTE(review): the fallback credential is committed in the notebook.
    # Setting DEALNEWS_AUTHORIZATION overrides it without editing the source.
    auth_value = os.environ.get('DEALNEWS_AUTHORIZATION', 'DN jxqfz29pbv9xpWSYnmJX')
    headers = {'Authorization': auth_value}
    dealnews = requests.get(link, headers=headers, timeout=timeout)
    if dealnews.status_code != 200:
        raise AssertionError(
            f"Status Code={dealnews.status_code}...Error:\n\n{dealnews.content}"
        )
    return json.loads(dealnews.content)

### Understanding API Options

In [3]:
def _parse_api_deal_types(raw_api_response):
    ## Deal Types
    keeper_elems = ['name', 'count', 'url', 'short_name']
    groups = raw_api_response['deal_types']
    key_group_info = []
    for group in groups:
        key_group_info.append([group[x] for x in keeper_elems])
    deal_type_df = pd.DataFrame(key_group_info, columns = keeper_elems)
    deal_type_df = deal_type_df.rename({'url': 'id_number'}, axis =1)
    deal_type_df['id_name'] = 'deal_type_id'
    return(deal_type_df)

In [4]:
def _parse_api_categories(raw_api_response):
    ## Categories
    keeper_elems = ['name', 'count', 'category_id', 'short_name']
    groups = raw_api_response['categories']
    key_group_info = []
    for group in groups:
        key_group_info.append([group[x] for x in keeper_elems])
    category_df = pd.DataFrame(key_group_info, columns = keeper_elems)
    category_df = category_df.rename({'category_id': 'id_number'}, axis =1)
    category_df['id_name'] = 'category_id'
    return(category_df)

In [5]:
def _parse_api_brands(raw_api_response):
    ## Brands
    keeper_elems = ['name', 'count', 'brand_id', 'short_name']
    groups = raw_api_response['brands']
    key_group_info = []
    for group in groups:
        key_group_info.append([group[x] for x in keeper_elems])
    brand_df = pd.DataFrame(key_group_info, columns = keeper_elems)
    brand_df = brand_df.rename({'brand_id': 'id_number'}, axis =1)
    brand_df['id_name'] = 'brand_id'
    return(brand_df)

In [6]:
def _parse_api_vendors(raw_api_response):
    ## Vendors
    keeper_elems = ['name', 'count', 'vendor_id', 'short_name']
    groups = raw_api_response['vendors']
    key_group_info = []
    for group in groups:
        key_group_info.append([group[x] for x in keeper_elems])
    vendor_df = pd.DataFrame(key_group_info, columns = keeper_elems)
    vendor_df = vendor_df.rename({'vendor_id': 'id_number'}, axis =1)
    vendor_df['id_name'] = 'vendor_id'
    return(vendor_df)

In [7]:
def _parse_api_facet_groups(raw_api_response):
    ## Facet Groups 
    keeper_elems = ['name', 'count', 'facet_id', 'short_name']
    facet_group = raw_api_response['facet_groups']
    key_group_info = []
    # This one has a group within each group (2 levels before getting to the mapping)
    for facet in facet_group:
        groups = facet['facets']
        facet_category_name = facet['name']
        for group in groups:
            group_list = [facet_category_name + ' : ' + group[x] if x == 'name' 
                          else group[x] for x in keeper_elems]
            key_group_info.append(group_list)
    facet_df = pd.DataFrame(key_group_info, columns = keeper_elems)
    facet_df = facet_df.rename({'facet_id': 'id_number'}, axis =1)
    facet_df['id_name'] = 'facet_id'
    return(facet_df)

### Summarize Options 

In [8]:
def get_summary_of_api_options(df):
    """Summarise one option table as a labelled ``pd.Series``.

    Reports the sum of ``count``, the number of rows, and the ``name``,
    ``count`` and ``id_number`` of the row with the largest ``count``.
    An empty frame raises ``IndexError`` because there is no top row.
    """
    # The first row after a descending sort is the most frequent option.
    top_row = df.sort_values(by='count', ascending=False).iloc[0]
    summary = {
        'Offers Sum': df['count'].sum(),
        'Group Count': df.shape[0],
        'Most Common Name': top_row['name'],
        'Most Common Count': top_row['count'],
        'Most Common ID': top_row['id_number'],
    }
    return pd.Series(summary)

## Begin Running It

### Parse API Categories with Counts

In [9]:
# Open the local SQLite file and keep a cursor for ad-hoc queries.
con = sqlite3.connect('dealnews.db')
cursor = con.cursor()

# Each row of the query result is a 1-tuple holding a table name.
table_rows = cursor.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
tables = [table_name for (table_name,) in table_rows]
tables

['Category Info', 'Category Summary', 'Dealnews Items']

In [10]:
# One sample request whose response is used to list the available option
# groups (deal types, vendors, facets, brands, categories) in the next cell.
# Facet id 1780 appears as "Popularity: 3/5" in the facet table shown later
# in this notebook.
link = 'https://api.dealnews.com/content?facet_ids=1780&count=70'
raw_api_response = get_dealnews_feed(link)

In [11]:
# Turn each option section of the sample response into its own table.
deal_types_df = _parse_api_deal_types(raw_api_response)
vendor_df = _parse_api_vendors(raw_api_response)
facet_df = _parse_api_facet_groups(raw_api_response)
brand_df = _parse_api_brands(raw_api_response)
category_df = _parse_api_categories(raw_api_response)

# deal_types_df is deliberately left out of the combined table
# (original note: "not really helpful with the API calls").
df_type = dict(
    vendor=vendor_df,
    facet_group=facet_df,
    brand=brand_df,
    category=category_df,
)

# Concatenating a dict uses its keys as the outer index level.
all_dfs = pd.concat(df_type)
all_dfs.head(30)

Unnamed: 0,Unnamed: 1,name,count,id_number,short_name,id_name
vendor,0,Amazon,794,313,Amazon,vendor_id
vendor,1,eBay,144,50,eBay,vendor_id
vendor,2,Home Depot,68,958,Home Depot,vendor_id
vendor,3,PepsiCo via Amazon,46,46889,PepsiCo via Amazon,vendor_id
vendor,4,Macy's,34,288,Macy's,vendor_id
vendor,5,Ace Hardware,34,1320,Ace Hardware,vendor_id
vendor,6,Sam's Club,32,857,Sam's Club,vendor_id
vendor,7,Nordstrom Rack,26,41081,Nordstrom Rack,vendor_id
vendor,8,Kohl's,22,1009,Kohl's,vendor_id
vendor,9,Nike,22,1186,Nike,vendor_id


In [12]:
# One summary row per option group; 'level_0' is the outer index level
# (vendor / facet_group / brand / category) exposed by reset_index().
summary1 = (
    all_dfs
    .reset_index()
    .groupby('level_0')
    .apply(get_summary_of_api_options)
    .rename_axis('API Category Name')
)
summary1

Unnamed: 0_level_0,Offers Sum,Group Count,Most Common Name,Most Common Count,Most Common ID
API Category Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
brand,1094,524,Apple,26,13
category,2293,29,Home & Garden,832,196
facet_group,9036,33,Popularity Rank : Popularity: 1/5,3469,1786
vendor,2126,348,Amazon,794,313


### Save to Database

In [13]:
# Stamp both tables with the same pull time. Computing the timestamp once
# keeps 'Category Info' and 'Category Summary' joinable on _dt_pulled even
# when the cell runs across a minute boundary.
dt_pulled = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

all_dfs['_dt_pulled'] = dt_pulled
all_dfs.to_sql('Category Info', con, if_exists='append')

summary1['_dt_pulled'] = dt_pulled
summary1.to_sql('Category Summary', con, if_exists='append')

  method=method,


## Parse One API Section Request

In [14]:

def parse_easy_elems(content):
    """Copy the flat, directly usable fields of one deal into a new dict.

    Every listed key must be present in ``content``; a missing key raises
    ``KeyError``. Keys not listed are ignored.
    """
    wanted_keys = (
        'id',
        'headline',
        'brief_headline',
        'summary',
        'secondary_summary',
        'key_attribute',
        'brief_notes',
        'url',
        'display_date',
        'publish_datetime_ts',
        'update_datetime_ts',
        'expiration_datetime_ts',
        'last_verified_datetime_ts',
        'editors_choice',
        'sponsored',
        'expired',
        'expires_today',
        'exclusive',
        'searchable',
        'hotness',
        'call_out',
        'call_out_comparison',
        'sub_call_out',
    )
    picked = {}
    for key in wanted_keys:
        picked[key] = content[key]
    return picked

def parse_time_fields(content):
    """Format the four ``*_datetime_ts`` fields as ``'%Y-%m-%d %H:%M:%S'`` text.

    Values are passed to ``datetime.datetime.fromtimestamp`` (local time).
    A missing or ``None`` value is treated as timestamp ``0``.
    """
    formatted = {}
    for field in ('publish_datetime_ts',
                  'update_datetime_ts',
                  'expiration_datetime_ts',
                  'last_verified_datetime_ts'):
        raw_ts = content.get(field)
        if raw_ts is None:
            raw_ts = 0
        moment = datetime.datetime.fromtimestamp(raw_ts)
        formatted[field] = moment.strftime('%Y-%m-%d %H:%M:%S')
    return formatted

def parse_coupon_code(content):
    """Return ``{"Coupon Code": <first code>}``, or ``None`` when the list is empty."""
    codes = content['coupon_code']
    if len(codes) == 0:
        first_code = None
    else:
        first_code = codes[0]
    return {"Coupon Code": first_code}

def _parse_category(cat_dict):
    keeper_elements = ['category_id', 'name'
                      'path', 'ancestor_list' ]
    cat_elems = {x:cat_dict.get(x) for x in keeper_elements}
    return(cat_elems)

def parse_2_categories(content):
    """Flatten the first two categories of a deal into suffixed keys.

    Each field returned by ``_parse_category`` is emitted twice, suffixed with
    ``_0`` and ``_1``. When fewer than two categories are present the missing
    slots are parsed from an empty dict.

    Unlike the original, the deal's own ``categories`` list is never modified,
    and an empty (or missing / ``None``) list is padded to two slots instead of
    raising ``IndexError``.
    """
    # Work on a copy so padding does not leak into the caller's data.
    cat_list = list(content.get('categories') or [])[:2]
    cat_list.extend({} for _ in range(2 - len(cat_list)))

    out_dict = {}
    for dict_num, category in enumerate(cat_list):
        for key, value in _parse_category(category).items():
            out_dict[key + '_' + str(dict_num)] = value
    return out_dict

def parse_vendor(content):
    """Return the deal's vendor as ``{'vendor_id': ..., 'name': ...}``.

    Both keys must exist under ``content['vendor']``; otherwise ``KeyError``.
    """
    vendor = content['vendor']
    slim_vendor = {}
    for field in ('vendor_id', 'name'):
        slim_vendor[field] = vendor[field]
    return slim_vendor

def _create_img_path(content):
    # Create Filename
    headline = content.get('headline')
    id1 = content.get('id')
    file_name = headline + '_' + str(id1) + '.jpg'
    file_name = file_name.replace('/', '')
    # Create Directory structure 
    date = str(datetime.datetime.now().date())
    base_path = '/mnt/volume-nyc3-01/Dealnews_Images/'
    path = base_path + date + '/'
    os.makedirs(path, exist_ok=True)
    #File Save location 
    save_location = path + file_name
    return(save_location)
    
def parse_image(content):
    """Download the deal's ``XXL`` image and report where it was saved.

    Returns ``{'Image path': <path>}`` after downloading, or
    ``{'Image path': None}`` when the deal has no ``images`` / ``XXL`` / ``url``
    entry. The original chained ``.get()`` calls raised ``AttributeError`` in
    that case and aborted the whole pull.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly instead of relying on another library having done so.
    import urllib.request

    images = content.get('images') or {}
    image_link = (images.get('XXL') or {}).get('url')
    if not image_link:
        return {'Image path': None}

    save_path = _create_img_path(content)
    urllib.request.urlretrieve(image_link, save_path)
    return {'Image path': save_path}


In [15]:
def parse_dn_item(content):
    """Combine every per-field parser into one flat record for a deal.

    Parsers run in a fixed order and later results overwrite earlier keys, so
    the formatted ``*_datetime_ts`` strings from ``parse_time_fields`` replace
    the raw values copied by ``parse_easy_elems``. ``parse_image`` runs last
    and downloads the deal's image as a side effect.
    """
    parsers = (
        parse_easy_elems,
        parse_time_fields,
        parse_coupon_code,
        parse_2_categories,
        parse_vendor,
        parse_image,
    )
    all_details = {}
    for parse in parsers:
        all_details.update(parse(content))
    return all_details

In [16]:
def _get(colname):
    """Return the zero-based position of ``colname`` among ``popularity``'s columns.

    Reads the module-level ``popularity`` frame at call time; raises
    ``ValueError`` when the column is absent.
    """
    column_names = list(popularity.columns)
    return column_names.index(colname)

In [17]:
# Keep only the facets whose qualified name mentions the popularity group.
is_popularity_facet = facet_df['name'].str.contains('Popularity Rank')
popularity = facet_df[is_popularity_facet]
# TODO: paging is not implemented. An earlier draft sketched
# `1 + int(count / 70)` requests per facet; the fetch loop below issues a
# single count=70 request for each one.
popularity

Unnamed: 0,name,count,id_number,short_name,id_name
0,Popularity Rank : Popularity: 5/5,140,1774,Popularity: 5/5,facet_id
1,Popularity Rank : Popularity: 4/5,583,1777,Popularity: 4/5,facet_id
2,Popularity Rank : Popularity: 3/5,2128,1780,Popularity: 3/5,facet_id
3,Popularity Rank : Popularity: 2/5,1151,1783,Popularity: 2/5,facet_id
4,Popularity Rank : Popularity: 1/5,3469,1786,Popularity: 1/5,facet_id


In [18]:
# Latest stored publish time per feed, as recorded by previous runs.
last_published_item_raw = cursor.execute("""
select `API Feed`, max(publish_datetime_ts) 
from `Dealnews Items`
group by `API Feed`
""").fetchall()

# Seed every feed with a date older than any deal, so a feed with no stored
# rows yet is still present in the lookup instead of raising KeyError in the
# fetch loop.
last_published_dict = {
    name: str(datetime.datetime(1980, 1, 1))
    for name in popularity['name'].tolist()
}
# Overlay what the database knows. A NULL max is skipped because strptime
# cannot parse None.
last_published_dict.update({
    feed: str(datetime.datetime.strptime(latest, '%Y-%m-%d %H:%M:%S'))
    for feed, latest in last_published_item_raw
    if latest is not None
})
last_published_dict

{'Popularity Rank : Popularity: 1/5': '2021-06-21 17:48:21',
 'Popularity Rank : Popularity: 2/5': '2021-06-21 18:00:00',
 'Popularity Rank : Popularity: 3/5': '2021-06-21 18:11:22',
 'Popularity Rank : Popularity: 4/5': '2021-06-21 18:07:46',
 'Popularity Rank : Popularity: 5/5': '2021-06-21 18:02:38'}

In [22]:
# NOTE(review): this cell carries execution count 22 although it sits before
# cell 19, and no cell visible here reads this variable; _create_img_path
# uses its own path rather than this one.
base_path = '/mnt/volume-nyc3-01'

In [19]:
# Collect one pd.Series per deal that is newer than what is already stored.
page_items = []
for _, row in popularity.iterrows():
    # Set up
    print("On group: \n", row)
    # Label-based access: integer keys on a label-indexed Series are treated
    # as positions only through a deprecated fallback.
    temp_name = row['name']
    id_num = row['id_number']
    temp_link = f'https://api.dealnews.com/content?facet_ids={id_num}&count=70'
    raw_api_response = get_dealnews_feed(temp_link)
    # Feeds never stored before compare against '' so every item counts as new.
    last_published_item = last_published_dict.get(temp_name, '')

    # Parse each item
    for content1 in tqdm(raw_api_response['content']):
        # Check the publish time first: the full parse downloads the deal's
        # image, which is wasted work for items that are not stored. Both
        # sides are '%Y-%m-%d %H:%M:%S' strings, so text order is time order.
        item_publish_time = parse_time_fields(content1)['publish_datetime_ts']
        if item_publish_time <= last_published_item:
            continue

        elems = parse_dn_item(content1)
        elems['API Feed'] = temp_name
        elems['API id_number'] = id_num
        page_items.append(pd.Series(elems))

On group: 
 name          Popularity Rank : Popularity: 5/5
count                                       140
id_number                                  1774
short_name                      Popularity: 5/5
id_name                                facet_id
Name: 0, dtype: object


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=70.0), HTML(value='')))


On group: 
 name          Popularity Rank : Popularity: 4/5
count                                       583
id_number                                  1777
short_name                      Popularity: 4/5
id_name                                facet_id
Name: 1, dtype: object


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=70.0), HTML(value='')))


On group: 
 name          Popularity Rank : Popularity: 3/5
count                                      2128
id_number                                  1780
short_name                      Popularity: 3/5
id_name                                facet_id
Name: 2, dtype: object


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=70.0), HTML(value='')))


On group: 
 name          Popularity Rank : Popularity: 2/5
count                                      1151
id_number                                  1783
short_name                      Popularity: 2/5
id_name                                facet_id
Name: 3, dtype: object


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=70.0), HTML(value='')))


On group: 
 name          Popularity Rank : Popularity: 1/5
count                                      3469
id_number                                  1786
short_name                      Popularity: 1/5
id_name                                facet_id
Name: 4, dtype: object


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=70.0), HTML(value='')))




In [20]:
# Nothing is written when no new deals were collected.
if page_items:
    # Each Series becomes a column; transposing yields one row per deal.
    dn_today = pd.concat(page_items, axis=1).transpose()
    print(f"Saving {dn_today.shape[0]} items") 
    pulled_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    dn_today['_dt_pulled'] = pulled_at
    dn_today.to_sql(name='Dealnews Items', con=con, index=False, if_exists='append')

In [21]:
# Release the SQLite connection opened with sqlite3.connect('dealnews.db').
con.close()