In [3]:
from datetime import datetime
import json
import pandas as pd
import re
import requests
from tqdm import tqdm_notebook as progress

from twitter import *
#import twcred

In [4]:
filename = 'soc2010indexversion5.116august2016.xls'
xl = pd.ExcelFile(filename)

FileNotFoundError: [Errno 2] No such file or directory: 'soc2010indexversion5.116august2016.xls'

In [7]:
for name in xl.sheet_names:
    print(name)

Correction Notice
Information
FILE SPEC
SOC2010 Structure
SOC2010 Full Index V5
All New Entries 
All Amended Entries 


In [8]:
sheet = pd.read_excel(filename, 'SOC2010 Full Index V5')

In [9]:
sheet_row_count = sheet.shape[0]
sheet_num_relevant_rows = sheet_row_count - 29 # - 29 because the first bunch of rows are special records without a job title

In [10]:
sheet.loc[[29]] #this should be the first relevant row (Abbot)

Unnamed: 0,REC NO,RECNOEXT,D,E,SOC90,SOC 2000,SOC 2010,INDEXOCC,IND,ADD,SEE,NEW,VERSNO
29,32,0,,,292,2444,2444,Abbot,,,,,


In [11]:
sheet_relevant = sheet.tail(sheet_num_relevant_rows) # read all the records except the first bunch of irrelevant ones

In [12]:
sheet_relevant.tail() # this should be the last bunch of jobs

Unnamed: 0,REC NO,RECNOEXT,D,E,SOC90,SOC 2000,SOC 2010,INDEXOCC,IND,ADD,SEE,NEW,VERSNO
28743,26160,0,,,900,9111,9111,Yardsman,(farming),,,,
28744,26160,250,,,902,6139,6139,Yardsman,(livery stable),,,,
28745,26160,500,,,889,9139,9139,Yardsman,(vulcanised fibre board mfr),,,*,
28746,26160,750,M,,990,9149,9260,Yardsman,,,,*,
28747,26161,0,,,201,2112,2112,Zoologist,,,,*,


In [13]:
skip_count = 0 # will be the number of rows/records skipped

soc_data = {}
for row in sheet_relevant.itertuples():
    rec_no = row[0]
    soc_data[rec_no] = {}
    if type(row[7]) == int: #sometimes they aren't integers, but numbers saved as text or a weird '}}}}' which probably means go to another row
        soc_data[rec_no]['SOC2010Code'] = row[7]
    elif type(row[7]) == str:
#         print(row[7])
        try:
            soc_data[rec_no]['SOC2010Code'] = int(row[7])
        except:
#             print(row[7])
            skip_count +=1
    soc_data[rec_no]['UnformattedTitle'] = row[8]
    if pd.notnull(row[9]):
        soc_data[rec_no]['Industry'] = row[9]
    if pd.notnull(row[10]):
        soc_data[rec_no]['AdditionalInfo'] = row[10]
            
            
print(f'{len(soc_data)} rows read correctly, {skip_count} rows skipped')

28719 rows read correctly, 227 rows skipped


In [14]:
# delete job titles that don't appear in soc 2010
keys_to_remove = []

for i in soc_data:
    if 'SOC2010Code' in soc_data[i].keys():
        continue
    else:
        keys_to_remove.append(i)

# print(keys_to_remove)
print(f'count of keys to remove: {len(keys_to_remove)}')

for i in keys_to_remove:
    del soc_data[i]

count of keys to remove: 227


In [15]:
# Now let's get the major, sub-major and minor group from the full SOC2010 code...
#... e.g. the soc code 1234: 1 is the major group, 2 is the sub-major group, and 123 is the minor group...
#... as explained in the ONS XLS
#... this will help us group the records by majorgroup later
for x in soc_data:
    soc_data[x]['MajorGroup'] = str(soc_data[x]['SOC2010Code'])[0]
    soc_data[x]['SubMajorGroup'] = str(soc_data[x]['SOC2010Code'])[0:2]
    soc_data[x]['MinorGroup'] = str(soc_data[x]['SOC2010Code'])[0:3]

In [16]:
def make_title(soc_string):
    split_string = soc_string.split(', ')
    split_string.reverse()
    return ' '.join(split_string)    

In [17]:
for i in soc_data:
    soc_data[i]['FormattedTitle'] = make_title(soc_data[i]['UnformattedTitle'])

In [20]:
# major_groups_titles = {'MANAGERS, DIRECTORS AND SENIOR OFFICIALS':[],
#                 'PROFESSIONAL OCCUPATIONS':[],
#                 'ASSOCIATE PROFESSIONAL AND TECHNICAL OCCUPATIONS':[],
#                 'ADMINISTRATIVE AND SECRETARIAL OCCUPATIONS':[],
#                 'SKILLED TRADES OCCUPATIONS':[],
#                 'CARING, LEISURE AND OTHER SERVICE OCCUPATIONS':[],
#                 'SALES AND CUSTOMER SERVICE OCCUPATIONS':[],
#                 'PROCESS, PLANT AND MACHINE OPERATIVES':[],
#                 'ELEMENTARY OCCUPATIONS':[],}

In [22]:
#working out which job titles require disambiguation (ie they appear twice)

all_titles = []
for i in soc_data:
    all_titles.append(soc_data[i]['UnformattedTitle'])


# add in a new key for each job title with a count of how many times it appears    
for i in progress(soc_data):
    title_count = all_titles.count(soc_data[i]['UnformattedTitle']) #the count of that job title
    soc_data[i]['Count'] = title_count

HBox(children=(IntProgress(value=0, max=28492), HTML(value='')))




In [23]:
def make_search(job):
    if 'AdditionalInfo' in job.keys() and 'Industry' in job.keys():
        context_string = (job['UnformattedTitle']) + ' ' + job['AdditionalInfo']
    elif 'Industry' in job.keys():
        context_string = job['Industry']
    elif 'AdditionalInfo' in job.keys():
        context_string = job['AdditionalInfo']       
    context_string = re.sub('[:(),\']|(mfr)', '', context_string) #regex to remove brackets, commas, apostrophes and colons
    return context_string

# make_search(soc_data[58])
for i in soc_data:
    if 'AdditionalInfo' in soc_data[i].keys() or 'Industry' in soc_data[i].keys():
        soc_data[i]['Context'] = make_search(soc_data[i])

In [24]:
# Now let's find out how many titles appear more than once, and also don't have context terms
xprobcount = 0
for i in soc_data:
    if soc_data[i]['Count'] > 0 and 'Context' not in soc_data[i].keys():
        xprobcount +=1
print (f'{xprobcount} titles dont have context to disambiguate')

16673 titles dont have context to disambiguate


In [25]:
# Okay, but actually, the titles that are going to be problematic are those that aren't specific to one MajorGroup...
# ... maybe ...
# ... So let's find out how many job titles are actaully found in more than one MajorGroup
# ... and if there aren't too many of those, we could simply exclude those job titles

In [26]:
major_group_counts = {1:0,
                      2:0,
                      3:0,
                      4:0,
                      5:0,
                      6:0,
                      7:0,
                      8:0,
                      9:0,}

for count in major_group_counts:
    for record in soc_data:
        if int(soc_data[record]['MajorGroup']) == count:
            major_group_counts[count] +=1

print (major_group_counts)

{1: 1596, 2: 2619, 3: 3100, 4: 1575, 5: 5193, 6: 937, 7: 884, 8: 9595, 9: 2993}


In [28]:
jobs_in_major_groups = {1:[],
                      2:[],
                      3:[],
                      4:[],
                      5:[],
                      6:[],
                      7:[],
                      8:[],
                      9:[],}

for k in jobs_in_major_groups:
    for record in soc_data:
        if int(soc_data[record]['MajorGroup']) == k:
            temp_dict = soc_data[record]
            jobs_in_major_groups[k].append(temp_dict)
#             temp_dict = {record:{'FormattedTitle':soc_data[record]['FormattedTitle']}}
#             if 'Context' in soc_data[record].keys():
#                 temp_dict[record]['Context'] = soc_data[record]['Context']
#             jobs_in_major_groups[k].append(temp_dict)

In [32]:
def count_unique_values(list_of_dicts, list_name):
    unique_values = set()    
    for i in list_of_dicts:
#         print(i.items())
        hashable = frozenset(i.items())
        unique_values.add(hashable)
    print(f'Unique values in {list_name}: {len(unique_values)}')



for i in jobs_in_major_groups:
    count_unique_values(jobs_in_major_groups[i], i)
    print(f'Total values in {i}: {len(jobs_in_major_groups[i])}')

Unique values in 1: 1596
Total values in 1: 1596
Unique values in 2: 2619
Total values in 2: 2619
Unique values in 3: 3100
Total values in 3: 3100
Unique values in 4: 1575
Total values in 4: 1575
Unique values in 5: 5193
Total values in 5: 5193
Unique values in 6: 937
Total values in 6: 937
Unique values in 7: 884
Total values in 7: 884
Unique values in 8: 9595
Total values in 8: 9595
Unique values in 9: 2993
Total values in 9: 2993


## Now let's use Audiences to find people who have those sets of job titles 

In [7]:
query = \
{
    "id":None,
    "query":{
        "operator":"AND",
        "children":[
            {
                "operator":"AND",
                "children":[
                    {
                        "field":"BIO",
                        "value":["testymctestface"],
                        "operator":"OR"
                    },
                    {
                        "field":"BIO",
                        "value":["bullibull"],
                        "operator":"OR"
                    }]
            },
            {
                "operator":"OR",
                "children":[
                    {
                        "field":"BIO",
                        "value":["blahabla"],
                        "operator":"OR"
                    },
                    {
                        "field":"BIO",
                        "value":["scrump"],
                        "operator":"OR"
                    }]
            }]
    }
}


In [33]:
def make_audiences_search(job_list):
    children = [] # each child is equivalent to a box on the UI and will also be one job title (sometimes with context)
    for i in job_list:
        job_info = {'operator':'AND', 'children':[{'operator':'OR', 'field':'BIO', 'value':[i['FormattedTitle']]}]} 
        if 'Context' in i.keys():
            job_info['children'].append({'operator':'OR', 'field':'BIO', 'value':[i['Context']]})
        children.append(job_info)
    # Now we've made the children bit, we also need to add the outside bit, containing the top level operator...
    #... and the ID of the search (null if its not been saved, I think!)
    query = {'id':'null', 'query':{'operator':'OR', 'children':children}}
    query_formatted = json.dumps(query)
    return query_formatted
        

#Let's test this out using the major group 7 as an example
payload = make_audiences_search(jobs_in_major_groups[7])

In [41]:
# Input the authorization from the web browser
# This will need to be updated with each session - it doesn't last long at all!
authorization = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6IjA2MGRlYWI3YTdhNTE0Mjk2NDg5NGFlNjUwMTI1Zjg0Y2VkNjkyNjgifQ.eyJlbWFpbCI6Im9ob2xtQGJyYW5kd2F0Y2guY29tIiwiZmlyc3ROYW1lIjoiT3NrYXIiLCJsYXN0TmFtZSI6IkhvbG0iLCJjbGllbnRJZCI6MTk5NzM5MjcwNSwiYXBpMkFjY2Vzc1Rva2VuIjoiZXlKaGJHY2lPaUpTVXpJMU5pSXNJblI1Y0NJNklrcFhWQ0lzSW10cFpDSTZJakEyTUdSbFlXSTNZVGRoTlRFME1qazJORGc1TkdGbE5qVXdNVEkxWmpnMFkyVmtOamt5TmpnaWZRLmV5SnliMnhsY3lJNld5SkNWMTlDUVZOSlExOVZVMFZTSWl3aVFsZGZRVVJOU1U1ZlZWTkZVaUpkTENKelkyOXdaU0k2ZTMwc0ltVjRjQ0k2TVRVME5qWXlOVFV3TUN3aVlYVmtJanBiSW1GMVpHbGxibU5sY3k1aWNtRnVaSGRoZEdOb0xtTnZiU0lzSW1Gd2FTNWljbUZ1WkhkaGRHTm9MbU52YlNKZExDSnBjM01pT2lKc2IyZHBiaTVpY21GdVpIZGhkR05vTG1OdmJTSXNJbk4xWWlJNklqSXdOemN6TmpRNU1pSjkubENVdWlQU3pLX3RtRzlHbGl0ZDlIemkwOUNCYTZtRjhpUjY0WHpRTTAxWndxU3Z0NFU0SVFWajI0R2ZNemljODhHdWxZQU4zS2hwbjRaaDJNUmJoMjVieU1BMWx0RUx3MElwVEFMd3FLcGtwNlZHUjhJVG1RbzRrU0tkRm9YSFdQZ080R0s2bDhkeThlRGNiYi1VRHJTUGV1RFpMRklDMUZ2LU9EYlJ3Y1ZpZ0lERVk2aEcteTZ4ckViNjdERWdWUW5MSmxtb3dpSmpodG5STWlWMUJNa3NoeGV1Z0IwbkpFMnJ6WXdyQmdBdWRiZ0N6bWFIS1hJNTdfdWxLRzFCZDU4Z2NWUmNyUUNJZnFtdWppcWdFeUU4NmUwamZzcG1lLXRyWldVTnFSTTNoWDZfYnhlMDhEYm5GSVZCbHBJc0RlSmZfQ3JDWGEyWTdvWk9mczVVTElUN2pGVkhGeHNuWml5b3oyVU53VWx3UUJvSFVNTWZieTZFaXhYYU41U1pvSGt0ZkRrY0ZGSWNhNDhHWUdxWU91aXZzRkxOYjdLUUxqQ0Z4SDl1cDJlT3Q2emljeEtVbm1GU1ZyYVNBeEotc2x4X01nRDI3cG8tcWI2UjNNWGppLVFqMDlxb3I0ZVVUSmQ0ektpVWtWSG5vUzVydHEyekNuVTFEUjRzSVhHXzlpVjgyYmpTVFl2UGlVVnZ3aWllc3dvaW5uLS1oRTBKVWFrMHFjb0tLNFVoUWhhamsxSTRpUVBmNF9xT1RYRkZaeWU2aFloNXpOYjlQNXdFMnVKYU05SjdYSGVXS2ppNldqRzc0dU1YMHVsRm9mYWJQS21lT1dKaEp1eFQ3cnE2VHBuTEhQYjlJbF9ta0hqOXRNSnZDMUlqSE9RZVlzbGpHUWpPU0I0SFhRN0EiLCJyb2xlcyI6W10sInNjb3BlIjp7fSwiZXhwIjoxNTQ2NjI1NTAwLCJhdWQiOiJhdWRpZW5jZXMuYnJhbmR3YXRjaC5jb20iLCJpc3MiOiJsb2dpbi5icmFuZHdhdGNoLmNvbSIsInN1YiI6IjIwNzczNjQ5MiJ9.GlOSdkpsmzBgi7hDshq1H_eyAIjogmvMf2xYJm_q8ccJ8I6QhkaS2vs4e4n4mJ5Na3CqobEUh-2d4bQh46NP58uLaXZZyXrNEe4n_qCeKyNWSkP2l6O746JP91ovR_Iy7QPkqAfbb-p7QiZUgsTJKHCojWQlmNllBabeIE2B3j3W8saV2BPyQZZSaYhBg1Yqa0phW7oIpwcXCCPe-NGCHnVrPmMFD3z5LYaoxfzPDcuKhaetIT0xR7bCIh7oXPtJ5XAPERppPxFKWlvu6IsgHr6CeFsxhqVWFrDpxjvmYMLhpUL97D6cAztBdQEZtvmxw6YM5Jvk-XivG6vsGyASsVFw_uAOeTwbgowkadlo8H6HDhc5d_917i7mqK-VXFkGGEMHQKZiZvzD1ty0ZsOUgHzNPiOtBG7hRYUhBcH3JOXJPP9N9F82PNTcTvHS7v379Y3Rf38UPsACtEj4F7C2a0XDXvlj_mPMAVm0AMs919VjOzfjR0QbEQKBWsblTM7KCVOOdi3xgo-NnlFK6GkGNrcBAd0BF0lip1cFwyzerMlbgWrGy41IYfXK5Oe2nq1mF-M8g61NbKgK5q2RiE0lw7RCIodnaJztMLXBxuwluDcrSGx69jOplq2fk82yxe8Hzl1ds1FkYGT4oHSBRnwbHvmIZdvvXYpyz-7ej8Ws2z4"

In [35]:
def create_audience(payload, authorization):
    url = 'https://audiences.brandwatch.com/api/audiences/audiences/preview?start=0&count=50&page=0&sort=influence&direction=desc'
    headers={'Authorization': f'bearer {authorization}', 'Content-Type': 'application/json'}, data=payload
    response = requests.post(url, headers={'Authorization': f'bearer {authorization}', 'Content-Type': 'application/json'}, data=payload)
    return response

# response_create = create_audience(payload, authorization)

In [36]:
# make a short job list to test the API without straining it

short_job_list = jobs_in_major_groups[7][:20]

In [42]:
# Now turn this into a JSON, in the same format as Audiences needs
def make_audiences_save_payload(job_list):
    user_input = input('Input Audience Name')
    now = datetime.now()
    now_string = now.strftime("%H:%M %B %d, %Y")
    audience_name = user_input + " Created through API on " + now_string
    children = [] # each child is equivalent to a box on the UI and will also be one job title (sometimes with context)
    for i in job_list:
        job_info = {'operator':'AND', 'children':[{'operator':'OR', 'field':'BIO', 'value':[i['FormattedTitle']]}]} 
        if 'Context' in i.keys():
            job_info['children'].append({'operator':'OR', 'field':'BIO', 'value':[i['Context']]})
        children.append(job_info)
    # Now we've made the children bit, we also need to add the outside bit, containing the top level operator...
    #... and the ID of the search (null if its not been saved, I think!)
    query = {'name':audience_name, 'query':{'operator':'OR', 'children':children}}
    query_formatted = json.dumps(query)
    char_count = str(len(query_formatted))
    return query_formatted, char_count
        
        
# save_payload = make_audiences_search(jobs_in_major_groups[7])
save_payload, character_count = make_audiences_save_payload(short_job_list)
# make_audiences_save_payload(jobs_in_major_groups[7])

Input Audience Namess987f9sdf8d


In [43]:
# Note: userId is currently hardcoded!

def save_audience(payload, authorization, char_count):
    url = 'https://audiences.brandwatch.com/api/audiences/audiences?newFormat=true'
    response = requests.post(url, headers={'Authorization': f'bearer {authorization}', 'Content-Type': 'application/json', 'Content-Length': char_count, 'userId':'179993720'}, data=payload)
    return response
response_save = save_audience(save_payload, authorization, character_count)

In [44]:
response_save.text

'{"id":"5c2f94bd5c416100010ab2d2","name":"ss987f9sdf8d Created through API on 17:15 January 04, 2019","description":null,"creationDate":"2019-01-04T17:15:41.809+0000","lastModificationDate":"2019-01-04T17:15:41.809+0000","discover":false,"imageUrl":null,"folderId":null,"favourite":false,"query":{"operator":"OR","children":[{"operator":"AND","children":[{"operator":"OR","field":"BIO","value":["customer service Administrator"]}]},{"operator":"AND","children":[{"operator":"OR","field":"BIO","value":["customer care Adviser"]}]},{"operator":"AND","children":[{"operator":"OR","field":"BIO","value":["call centre Adviser"]}]},{"operator":"AND","children":[{"operator":"OR","field":"BIO","value":["contact centre Adviser"]}]},{"operator":"AND","children":[{"operator":"OR","field":"BIO","value":["communications Adviser"]},{"operator":"OR","field":"BIO","value":["telecommunications"]}]},{"operator":"AND","children":[{"operator":"OR","field":"BIO","value":["consumer Adviser"]}]},{"operator":"AND","c

## Alternative way of getting professions - Use Twitter API to get bio info

In [45]:
twitter = Twitter(auth = OAuth(twcred.access_key,
                  twcred.access_secret,
                  twcred.consumer_key,
                  twcred.consumer_secret))

In [46]:
def get_twitter_bio(twhandle):
    results = twitter.users.show(screen_name=twhandle)
    return results['description']

In [47]:
get_twitter_bio('DanAshcroft6')

'Independent journalist'

In [8]:
9999999999999999.0 - 9999999999999998.0 

2.0