# WebShrinker API
Get category information for the unique TPs in the TPs list.

In [1]:
# Import
try:
    from urllib import urlencode
except ImportError:
    from urllib.parse import urlencode

import csv
import hashlib
import json
import os
from base64 import urlsafe_b64encode
from random import sample

import numpy as np
import pandas as pd
import requests

In [2]:
# Function for making a query to the Webshrinker API
def webshrinker_categories_v3(access_key, secret_key, url=b"", params=None):
    """Build a signed request URL for the Webshrinker categories v3 endpoint.

    Args:
        access_key: API access key; sent as the ``key`` query parameter.
        secret_key: API secret key; only used to compute the request hash.
        url: Target URL/domain as ``bytes``; it is URL-safe base64 encoded
            into the request path.
        params: Optional mapping of extra query parameters. It is copied,
            so the caller's mapping is never modified.

    Returns:
        The full ``https://api.webshrinker.com/...`` URL including the
        ``hash`` parameter (MD5 hex digest of ``"<secret_key>:<request>"``).
    """
    # Work on a copy: writing 'key' into the argument would leak state into
    # the caller's dict (or into a shared default) across calls.
    query = dict(params) if params else {}
    query['key'] = access_key

    encoded_target = urlsafe_b64encode(url).decode('utf-8')
    # doseq=True expands sequence values into repeated query parameters
    request = "categories/v3/{}?{}".format(encoded_target, urlencode(query, True))
    request_to_sign = "{}:{}".format(secret_key, request).encode('utf-8')
    signed_request = hashlib.md5(request_to_sign).hexdigest()

    return "https://api.webshrinker.com/{}&hash={}".format(request, signed_request)

In [3]:
# Load globally unique TPs
# newline='' is what the csv module expects, so that it can handle line
# endings (and newlines embedded in quoted fields) itself.
with open('/home/ubuntu/data/processed/TPs/TPs_unique.csv', newline='') as f:
    reader = csv.reader(f)
    # Blank lines are returned as empty lists; drop them so that every
    # remaining row has a first column to read the TP from.
    list_TPs = [row for row in reader if row]

print('COMPLETED')

COMPLETED


In [None]:
# Define API key
# Credentials are read from the environment so that real keys never have to
# be saved inside the notebook; the placeholders are used only when the
# variables are not set.
access_key = os.environ.get("WEBSHRINKER_ACCESS_KEY", "XXX")
secret_key = os.environ.get("WEBSHRINKER_SECRET_KEY", "YYY")

# Define path for export
f_path = '/home/ubuntu/data/datasets_for_enrichment/categorization/'

# Initialize variables
results = []          # one response-shaped dict per TP (API answer or placeholder)
TPs_json_data = {}    # TP -> first entry of that TP's 'data' list
TP_data = []          # flattened rows, filled later by the export step
req_number = 0        # number of TPs processed; used in the export file names

# Seconds to wait for a single API call before treating it as failed
REQUEST_TIMEOUT = 30

# Extra message printed ahead of the JSON body for the known error statuses
STATUS_MESSAGES = {
    400: "Bad or malformed HTTP request",
    401: "Unauthorized - check your access and secret key permissions",
    402: "Account request limit reached",
}

# Loop through all TPs
for TP in list_TPs:

    # A blank CSV line is read as an empty list: nothing to look up
    if not TP:
        continue

    req_number += 1

    # Encode TP
    url = bytes(TP[0], encoding='utf-8')

    # Make API request
    api_url = webshrinker_categories_v3(access_key, secret_key, url)
    status_code = None
    data = None
    try:
        response = requests.get(api_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # A network failure for one TP must not discard everything collected
        # so far; the TP is recorded with the placeholder category below.
        print("Request failed for {}: {}".format(TP[0], err))
    else:
        status_code = response.status_code
        try:
            data = response.json()
        except ValueError:
            # Body was not valid JSON (e.g. an HTML error page)
            data = None

    if status_code is None:
        # Failure already reported above
        pass
    elif status_code in (200, 202) or status_code in STATUS_MESSAGES:
        # 200: categories returned
        # 202: the website is being visited and the categories will be updated shortly
        if status_code in STATUS_MESSAGES:
            print(STATUS_MESSAGES[status_code])
        if data is not None:
            print(json.dumps(data, indent=4, sort_keys=True))
    else:
        # General error occurred
        print("A general error occurred, try the request again")

    # If data successfully returned (non-empty 'data' list in a JSON object)
    entries = data.get('data') if isinstance(data, dict) else None
    if isinstance(entries, list) and entries:
        results.append(data)
        TPs_json_data[TP[0]] = entries[0]
    # If no data returned
    else:
        # Placeholder shaped like a real response so the export step can
        # treat every TP the same way.
        special_data = {
            'data': [{
                'categories': [{'id': 400, 'label': 'Uncategorized_Error', 'parent': 400,
                                'score': 100, 'confident': True}],
                'url': TP[0],
            }]
        }
        results.append(special_data)
        TPs_json_data[TP[0]] = special_data['data'][0]

name = str(req_number)

print('COMPLETED')

In [6]:
# Export all data as JSON
# json.dump writes straight into the file without building the whole document
# as one string first; the with-block closes the file on exit, so no explicit
# close() is needed.
with open(f_path + 'json/allDataCategories_' + name + '.json', "w") as jsonFilehandle:
    json.dump(TPs_json_data, jsonFilehandle)

print('JSON export completed')

# Prepare data to be exported as CSV and TXT
# Start from an empty list so that re-running this cell does not append the
# same rows a second time.
TP_data = []

# Per-category values, in the order they are appended to each row
CATEGORY_FIELDS = ('id', 'label', 'parent', 'score', 'confident')

for TP_webShrinker in results:
    # Only the first entry of each response's 'data' list is exported
    entry = TP_webShrinker['data'][0]

    # Row layout: url, then the five category fields repeated per category
    TP_info = [entry['url']]
    for category in entry['categories']:
        TP_info.extend(category[field] for field in CATEGORY_FIELDS)
    TP_data.append(TP_info)

# Export
txt_target = f_path + 'txt/webShrinker_' + name + '.txt'
with open(txt_target, 'w') as filehandle:
    # One line per TP: the string form of its row list, newline-terminated
    filehandle.writelines(str(listitem) + '\n' for listitem in TP_data)

print('TXT export completed')

csv_target = f_path + 'csv/webShrinker_' + name + '.csv'
with open(csv_target, mode='w', newline='') as myfile:
    # Every field is quoted, one CSV record per TP row
    csv.writer(myfile, quoting=csv.QUOTE_ALL).writerows(TP_data)

print('CSV export completed')

JSON export completed
TXT export completed
CSV export completed
