In [None]:
'''
*Download Alexa user interaction data from user profile*

(c) 2021, Maitreyee Wairagkar
Last Update: 21/07/2021

Installations:
    Selenium: https://selenium-python.readthedocs.io/installation.html
    ChromeDriver for Chrome: https://sites.google.com/a/chromium.org/chromedriver/downloads (add the folder that contains the driver to the PATH environment variable)

Inspect the webpage with the browser's developer tools to find the IDs and class names of the elements of interest.
Inspect the requests sent to the Amazon server in the Network tab to get the information needed for downloading the audio files.
'''

import json                   # to write the data
import os
import re                     #regx
import time
import urllib
import urllib.parse
from datetime import datetime, timedelta
from getpass import getpass
from pprint import pprint

import requests    
from bs4 import BeautifulSoup as soup
#from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
from seleniumwire import webdriver # https://pypi.org/project/selenium-wire/#installation

def get_date_time(date_time):
    """Extract the date, time and device name from a record-info element.

    Args:
        date_time: parsed HTML element exposing ``find_all`` (a BeautifulSoup
            tag) whose ``<div class="item">`` children hold, in order, the
            date, the time and optionally the device name.

    Returns:
        Tuple ``(date, time, device)`` of strings. ``date`` is formatted as
        ``yyyy-mm-dd``; ``time`` and ``device`` are the raw item texts.
        Any field whose item is missing is returned as ``'NA'``.

    Raises:
        ValueError: if the date text is neither 'Today', 'Yesterday' nor a
            date matching '%d %B %Y' (e.g. '21 July 2021').
    """
    items = [item.text for item in date_time.find_all('div', attrs={'class': 'item'})]

    # format date to yyyy-mm-dd
    if not items:
        date = 'NA'
    elif items[0] == 'Today':
        date = datetime.today().strftime('%Y-%m-%d')
    elif items[0] == 'Yesterday':
        date = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    else:
        # NOTE: %B parses month names according to the current locale
        date = datetime.strptime(items[0], '%d %B %Y').strftime('%Y-%m-%d')

    # Named time_of_day so that the local does not shadow the `time` module
    time_of_day = items[1] if len(items) > 1 else 'NA'   # time as shown in the item text
    device = items[2] if len(items) > 2 else 'NA'        # device is optional

    return date, time_of_day, device
      
def get_audio_file_request_headers(cookie=None):
    """Build the HTTP request headers used to download an audio recording.

    It is important to send correct headers. The values below were copied
    from Firefox developer tools (Network -> select the request -> request
    headers) on a Mac. The cookie must be refreshed every time after logging
    in to the user's voice history, so it is supplied at call time instead of
    being pasted into the source.

    Args:
        cookie: value for the ``Cookie`` request header. When omitted, the
            ``ALEXA_COOKIE`` environment variable is used.

    Returns:
        dict mapping header names to header values.

    Raises:
        ValueError: if no cookie is passed and ``ALEXA_COOKIE`` is not set.
    """
    if cookie is None:
        cookie = os.environ.get('ALEXA_COOKIE')
    if not cookie:
        raise ValueError(
            'No session cookie available: pass cookie=... or set the '
            'ALEXA_COOKIE environment variable to the current Cookie header'
        )

    # Alternative header set captured on Windows (kept for reference):
    #     'Host': 'www.amazon.co.uk',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #     'Accept-Language': 'en-GB,en;q=0.5',
    #     'Accept-Encoding': 'gzip, deflate, br',
    #     'Cookie': <current cookie>,
    #     'Connection': 'keep-alive',
    #     'Upgrade-Insecure-Requests': '1'

    # Mac - update headers every time
    # NOTE(review): 'br' is advertised in Accept-Encoding; presumably the
    # audio response is not brotli-compressed - confirm, since decoding it
    # requires brotli support in the HTTP client.
    return {
        'Host': 'www.amazon.co.uk',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:90.0) Gecko/20100101 Firefox/90.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-GB,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.amazon.co.uk/alexa-privacy/apd/rvh?',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Cookie': cookie,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-GPC': '1',
        'TE': 'trailers'
    }

In [None]:
# I. Login to Alexa website
# Credentials are taken from the ALEXA_USERNAME / ALEXA_PASSWORD environment
# variables, or prompted for, so they are never stored in the notebook.
username = os.environ.get('ALEXA_USERNAME') or input('Amazon username: ')
password = os.environ.get('ALEXA_PASSWORD') or getpass('Amazon password: ')

# Profile name: used as a prefix for record keys and for output file names
name = 'user profile name'

#URL to go directly to Alexa voice history
alexa_login_url = 'https://www.amazon.co.uk/alexa-privacy/apd/rvh'

#open the browser, make sure chromedriver is in PATH variable in windows or add the path to chromedriver in this function, update chrome driver if necessary
# NOTE(review): recent Selenium 4 releases may no longer accept the driver path
# as a positional argument - confirm against the installed Selenium version.
browser = webdriver.Chrome('PATH TO CHROMEDRIVER')

browser.get(alexa_login_url)     #open the website

#The following automated login may not always work. They may require Captcha or OTP. If this happens, complete this step manually
# find_element(By.ID, ...) replaces find_element_by_id, which newer Selenium versions no longer provide
browser.find_element(By.ID, 'ap_email').send_keys(username)    #get username text box by its id and enter username in it
browser.find_element(By.ID, 'ap_password').send_keys(password) #get password text box by its id and enter password
browser.find_element(By.ID, 'signInSubmit').click()            #find sign in button by id and click on it (can use .submit())

time.sleep(3) #Wait for the page to load. Consider using WebDriverWait instead


In [None]:
# Steps II-III below are disabled: skip them when logging in directly to the
# Alexa history URL. Uncomment them only when starting from the Alexa home page.
#
# # Save the window opener (current window, do not mistake it for tab... not the same)
# main_window = browser.current_window_handle
#
# # II. Navigate to Alexa history by clicking on Settings option
# browser.find_element(By.ID, 'iSettings').click()
# time.sleep(3)
#
# # III. Click on History which opens in a new tab
# history_list = browser.find_elements(By.CLASS_NAME, 'standard-component')   # get all elements of standard-component class
# history_list[18].click()     #'History' is 19th element in the list (this number can change) and click on it
# time.sleep(3)
#
# browser.switch_to.window(browser.window_handles[1])   #switch windows to new tab with alexa conversation

# IV. Select time of history to be displayed
# The dropdown carries several classes, so it is located with a CSS selector
# (find_element_by_class_name is no longer available in newer Selenium versions).
browser.find_element(By.CSS_SELECTOR, '.filter-row.display-filter.clickable').click()  #get dropdown list by its classes to select time
time.sleep(3)
browser.find_element(By.CLASS_NAME, 'filter-by-date-menu').click()
time.sleep(3)

# select the option from dropdown list (rightclick on all history-> inspect-> copy xpath to get the xpath)
browser.find_element(By.XPATH, '//*[@id="filter-menu"]/div[2]/div[1]/div[2]/div[5]').click()
time.sleep(3)

In [None]:
# V. scroll to end of page to load all records (you can do this manually)
# The footer message is clicked repeatedly until it reads 'End of list'.
while True:
    # Re-locate the footer on every pass: the page is updated after each click
    end_of_page = browser.find_element(By.CSS_SELECTOR, '.full-width-message')
    footer_text = end_of_page.text
    print(footer_text)
    if footer_text == 'End of list':
        break
    end_of_page.click()
    time.sleep(1)   # give the page time to update before reading the footer again

In [None]:
# VI. Expand all boxes with user dialogues to get full info from adp-content-box
page_soup = soup(browser.page_source, 'html.parser') # get HTML of the 1st page 

all_boxes = page_soup.find_all('div', attrs={'class': 'apd-content-box'})
print(len(all_boxes))

for box in all_boxes:
    box_id = box.get('id')
    if not box_id:
        continue   # without an id the xpath to the expand button cannot be built
    # find_elements returns an empty list when the box has no button, whereas
    # find_element would raise and abort the whole loop
    expand_arrows = browser.find_elements(By.XPATH, '//*[@id="' + box_id + '"]/button')
    if expand_arrows:
        expand_arrows[0].click()
        time.sleep(1)   # wait after the click before moving to the next box

page_soup = soup(browser.page_source, 'html.parser') # get new HTML of the expanded page     

all_boxes = page_soup.find_all('div', attrs={'class': 'apd-content-box'})
print(len(all_boxes))

In [None]:
#VI. Parse HTML to scrape the data and recordings using Beautifulsoup
# NOTE(review): selenium-wire may treat this pattern as a regular expression,
# in which case '?' is not matched literally - confirm for the installed version.
recording_url_partial = 'https://www.amazon.co.uk/alexa-privacy/apd/rvh/audio?uid' # use this to parse network requests


def _to_ascii(text):
    """Strip surrounding whitespace, then drop non-ASCII characters and double quotes."""
    return ''.join(c for c in text.strip() if 0 < ord(c) < 127).replace('"', '')


# save data in a dictionary of dictionaries (json), keyed by record key
user_dat = {}

# count distinguishes consecutive records that share the same time stamp
count = 0
prev_time = 0

# Loop-invariant set-up: request headers, output folder and play-button class
audio_headers = get_audio_file_request_headers()
recordings_folder = 'recordings_' + name
os.makedirs(recordings_folder, exist_ok=True)   # open() does not create missing folders
play_button_class = 'apd-icon-button-round play-audio-button button-clear fa fa-play-circle'

# Parse the HTML to scrape data
for dat in reversed(all_boxes):             #get main box that contains all the conversation info, traverse in reverse (oldest first)

    # 0. add user name to dictionary
    temp_dat = {'user_name': name}

    # 1. get user dialogue
    user_dialogue = dat.find('div', attrs={'class': 'record-item-text customer-transcript'})
    temp_dat['user_dialogue'] = _to_ascii(user_dialogue.text) if user_dialogue else 'NA'

    # 2. get date, time and device used
    date_time = dat.find('div', attrs={'class': 'record-info'})
    if date_time:
        d, t, dev = get_date_time(date_time)
    else:
        d, t, dev = 'NA', 'NA', 'NA'
    temp_dat['date'] = d
    temp_dat['time'] = t
    temp_dat['device'] = dev
    if t == prev_time:
        count += 1
    else:
        count = 0
    prev_time = t

    # 3. Generate key to save data in json
    key = name + '_' + d + '_' + t.replace(':', '-') + '_' + str(count)

    # 4. Get Alexa response
    alexa_dialogue = dat.find('div', attrs={'class': 'record-item-text alexa-response'})
    temp_dat['alexa_response'] = _to_ascii(alexa_dialogue.text) if alexa_dialogue else 'NA'

    # 5. Get recording id and url
    del browser.requests                                                   #delete previously captured browser requests

    element_id = ''
    for item in dat.find_all('div', attrs={'class': 'record-item'}):       # get element id of record items
        item_id = item.get('id') or ''                                     # record items without an id are skipped
        # the id containing -0 is user dialogue, so test if it is user dialogue element and whether it contains a button
        if '-0' in item_id and item.find('button', attrs={'class': play_button_class}):
            element_id = item_id

    # 'NA' unless an audio request is captured / the file is saved below
    temp_dat['audio_url'] = 'NA'
    temp_dat['audio_filename'] = 'NA'

    if element_id:                                                          # if user dialogue element exists
        # find_elements returns an empty list instead of raising when there is no button
        audio_buttons = browser.find_elements(By.XPATH, '//*[@id="' + element_id + '"]/button')
        if audio_buttons:
            audio_buttons[0].click()                                        # if button exists, click it
            time.sleep(0.1)

            try:
                # Get network request matching the string (for audio files), waiting up to 3 sec for it to happen
                req = browser.wait_for_request(recording_url_partial, timeout=3)
            except TimeoutException:
                # keep the text record even when no audio request was captured
                print('No audio request captured for ' + key)
            else:
                temp_dat['audio_url'] = urllib.parse.unquote(str(req), encoding='utf-8', errors='replace') # decode the obtained link

                # 6. download the audio file and save
                r = requests.get(temp_dat['audio_url'], headers=audio_headers)    # get file by sending the request
                if r.ok:
                    temp_dat['audio_filename'] = key + '.wav'                     # audio filename id
                    with open(os.path.join(recordings_folder, temp_dat['audio_filename']), 'wb') as audio_file:
                        audio_file.write(r.content)                               # write the contents of response
                else:
                    # do not save an error page as a .wav file
                    print('Audio download failed for ' + key + ' (HTTP ' + str(r.status_code) + ')')

    time.sleep(3) # wait for it to play full clip

    # 7. append temp_dat to user_dat
    user_dat[key] = temp_dat
    pprint(temp_dat)


In [None]:
# VII. Write text data in json file 
json_path = name + '.json'

# Load the records saved by earlier runs; on the first run the file does not
# exist yet, so start from an empty dictionary instead of failing.
try:
    with open(json_path) as f:
        data = json.load(f)
except FileNotFoundError:
    data = {}

# Merge in the records of this session (records with the same key are replaced)
data.update(user_dat)
with open(json_path, 'w') as f:
    json.dump(data, f,  indent = 2)

# read from json file
#with open("xyz.json", "r") as f:
#    x = json.load(f)
#print(len(x))

In [None]:
#%history -g -f 'alexa_history.txt'
# Report how many interaction records were scraped in this session.
# (date_list / user_response_list are not defined anywhere in this notebook,
# so the count is taken from user_dat, which holds one entry per record.)
print(len(user_dat))